Merge branch 'master' into jon/tall-and-skinny-matmul

2023-04-15 19:57:48 +08:00 · 2023-04-15 19:57:48 +08:00 · 69511b2c4a
commit 69511b2c4a
parent 73e7601bf3 aa485cee33
29 changed files with 1998 additions and 1440 deletions
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@ -5,9 +5,10 @@ FROM ubuntu:$UBUNTU_VERSION as build
 RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip
 COPY requirements.txt requirements.txt
 RUN pip install --upgrade pip setuptools wheel \
-    && pip install numpy requests sentencepiece tqdm \
+    && pip install -r requirements.txt
    && pip install torch --index-url https://download.pytorch.org/whl/cpu
 WORKDIR /app
--- a/.editorconfig
+++ b/.editorconfig
@ -14,3 +14,6 @@ indent_size = 4
 [Makefile]
 indent_style = tab
 [prompts/*.txt]
 insert_final_newline = unset
--- a/.gitignore
+++ b/.gitignore
@ -23,6 +23,7 @@ models/*
 /result
 /perplexity
 /embedding
 /benchmark-q4_0-matmult
 /Pipfile
 arm_neon.h
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -56,6 +56,10 @@ option(LLAMA_AVX                    "llama: enable AVX"
 option(LLAMA_AVX2                   "llama: enable AVX2"                                    ON)
 option(LLAMA_AVX512                 "llama: enable AVX512"                                  OFF)
 option(LLAMA_FMA                    "llama: enable FMA"                                     ON)
 # in MSVC F16C is implied with AVX2/AVX512
 if (NOT MSVC)
    option(LLAMA_F16C               "llama: enable F16C"                                    ON)
 endif()
 # 3rd party libs
 option(LLAMA_ACCELERATE             "llama: enable Accelerate framework"                    ON)
@ -116,6 +120,21 @@ if (LLAMA_OPENBLAS)
        add_compile_definitions(GGML_USE_OPENBLAS)
        add_link_options(${BLAS_LIBRARIES})
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
        # find header file
        set(OPENBLAS_INCLUDE_SEARCH_PATHS
            /usr/include
            /usr/include/openblas
            /usr/include/openblas-base
            /usr/local/include
            /usr/local/include/openblas
            /usr/local/include/openblas-base
            /opt/OpenBLAS/include
            $ENV{OpenBLAS_HOME}
            $ENV{OpenBLAS_HOME}/include
            )
        find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
        add_compile_options(-I${OPENBLAS_INC})
    else()
        message(WARNING "OpenBLAS not found")
    endif()
@ -207,7 +226,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
            add_compile_options(/arch:AVX)
        endif()
    else()
-        add_compile_options(-mf16c)
+        if (LLAMA_F16C)
            add_compile_options(-mf16c)
        endif()
        if (LLAMA_FMA)
            add_compile_options(-mfma)
        endif()
@ -247,7 +268,6 @@ endif()
 add_library(llama
            llama.cpp
            llama.h
            llama_internal.h
            llama_util.h)
 target_include_directories(llama PUBLIC .)
--- a/27
+++ b/27
@ -140,41 +140,46 @@ default: main quantize perplexity embedding
 #
 ggml.o: ggml.c ggml.h
-	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o
+	$(CC)  $(CFLAGS)   -c $< -o $@
-llama.o: llama.cpp llama.h llama_util.h llama_internal.h
+llama.o: llama.cpp ggml.h llama.h llama_util.h
-	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 common.o: examples/common.cpp examples/common.h
-	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding
+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
 main: examples/main/main.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
 	@echo
 quantize: examples/quantize/quantize.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 libllama.so: llama.o ggml.o
-	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 #
 # Tests
 #
 benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o
 	$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
 	./benchmark-q4_0-matmult
 .PHONY: tests
 tests:
 	bash ./tests/run-tests.sh
--- a/README.md
+++ b/README.md
@ -49,6 +49,7 @@ New features will probably be added mostly through community contributions.
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
 **UI:**
@ -149,30 +150,52 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8
 ## Usage
-Here are the step for the LLaMA-7B model:
+Here are the steps for the LLaMA-7B model.
 ### Get the Code
 ```bash
 # build this repo
 git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
-make
+```
-#For Windows and CMake, use the following command instead:
+### Build
 cd <path_to_llama_folder>
 mkdir build
 cd build
 cmake ..
 cmake --build . --config Release
 Note: For Windows, CMake or Zig can be used.
 1. Use `make`
    ```bash
    make
    ```
 1. Use CMake
    ```bash
    mkdir build
    cd build
    cmake ..
    cmake --build . --config Release
    ```
 1. Use Zig
    ```bash
    zig build -Drelease-fast
    ```
 ### Prepare Data & Run
 ```bash
 # obtain the original LLaMA model weights and place them in ./models
 ls ./models
 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
 # install Python dependencies
-python3 -m pip install torch numpy sentencepiece
+python3 -m pip install -r requirements.txt
 # convert the 7B model to ggml FP16 format
-python3 convert-pth-to-ggml.py models/7B/ 1
+python3 convert.py models/7B/
 # quantize the model to 4-bits (using method 2 = q4_0)
 ./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
@ -181,8 +204,6 @@ python3 convert-pth-to-ggml.py models/7B/ 1
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
 ```
 Currently, it's best to use Python 3.9 or Python 3.10, as `sentencepiece` has not yet published a wheel for Python 3.11.
 When running the larger models, make sure you have enough disk space to store all the intermediate files.
 ### Memory/Disk Requirements
--- a/build.zig
+++ b/build.zig
@ -1,16 +1,14 @@
 const std = @import("std");
-pub fn build(b: *std.Build) void {
+pub fn build(b: *std.build.Builder) void {
    const target = b.standardTargetOptions(.{});
-    const optimize = b.standardOptimizeOption(.{});
+    const optimize = b.standardReleaseOptions();
    const want_lto = b.option(bool, "lto", "Want -fLTO");
-    const lib = b.addStaticLibrary(.{
+    const lib = b.addStaticLibrary("llama", null);
        .name = "llama",
        .target = target,
        .optimize = optimize,
    });
    lib.want_lto = want_lto;
    lib.setTarget(target);
    lib.setBuildMode(optimize);
    lib.linkLibCpp();
    lib.addIncludePath(".");
    lib.addIncludePath("examples");
@ -44,16 +42,12 @@ pub fn build(b: *std.Build) void {
 fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
    const b = args.b;
    const lib = args.lib;
    const target = args.target;
    const optimize = args.optimize;
    const want_lto = args.want_lto;
-    const exe = b.addExecutable(.{
+    const exe = b.addExecutable(name, null);
        .name = name,
        .target = target,
        .optimize = optimize,
    });
    exe.want_lto = want_lto;
    lib.setTarget(args.target);
    lib.setBuildMode(args.optimize);
    exe.addIncludePath(".");
    exe.addIncludePath("examples");
    exe.addCSourceFiles(&.{
--- a/convert-ggml-to-pth.py
+++ b/convert-ggml-to-pth.py
@ -1,299 +0,0 @@
 # Author: github.com/ductai199x
 import argparse
 import os
 import struct
 import numpy as np
 import torch
 from numba import njit
 from tqdm.auto import tqdm
 def read_header(fin):
    """Read the fixed-size file header from ``fin``.

    The header is nine native-endian 32-bit ints. Fields 2..6 are returned as
    the hyperparameter dict and field 8 as the file type; fields 0, 1 and 7
    are consumed but not returned.

    Returns:
        (hparams, ftype) where hparams has the keys "vocab_size", "dim",
        "multiple_of", "n_heads" and "n_layers", in that order.

    Raises:
        struct.error: if fewer than 36 bytes can be read.
    """
    field_count = 9
    raw = fin.read(4 * field_count)
    fields = struct.unpack("i" * field_count, raw)
    hparam_names = ("vocab_size", "dim", "multiple_of", "n_heads", "n_layers")
    hparams = dict(zip(hparam_names, fields[2:7]))
    return hparams, fields[8]
 def read_tokens(fin, vocab_size):
    """Read ``vocab_size`` vocabulary entries from ``fin``.

    Each entry is a 32-bit length, that many bytes of token text, and a
    32-bit float score. Text that does not decode cleanly has the offending
    bytes substituted with the replacement character.

    Returns:
        A list of (text, score) tuples in file order.
    """
    def next_entry():
        (n_bytes,) = struct.unpack("i", fin.read(4))
        # Decoding with errors="replace" gives the strict result for valid
        # input and the substituted result otherwise.
        piece = fin.read(n_bytes).decode(errors="replace")
        (score,) = struct.unpack("f", fin.read(4))
        return piece, score
    return [next_entry() for _ in range(vocab_size)]
@njit
 def dequantize_weights_numba(fin_data, n_rows, n_cols):
    """Expand a buffer of 4-bit quantized blocks into a float32 matrix.

    The buffer is consumed row by row, ``n_cols // 32`` blocks per row. Each
    block is 4 bytes interpreted as one float32 scale, followed by 16 bytes
    that each pack two 4-bit values.

    Returns:
        A float32 array of shape (n_rows, n_cols).
    """
    # Number of weights stored in one block.
    qk = 32
    # Blocks per row; columns beyond nb * qk (if any) are left at zero.
    nb = n_cols // qk
    # Bytes per block (scale + packed nibbles). NOTE(review): not used below.
    bs = 4 + (qk // 2)
    weights = np.zeros((n_rows, n_cols), dtype=np.float32)
    # Running byte offset into fin_data.
    data_pos = 0
    for row in range(n_rows):
        for block in range(nb):
            # Per-block scale factor.
            d = np.frombuffer(fin_data[data_pos : data_pos + 4], dtype=np.float32)[0]
            data_pos += 4
            packed_values = fin_data[data_pos : data_pos + (qk // 2)]
            data_pos += qk // 2
            for i in range(qk // 2):
                packed_value = packed_values[i]
                # Low nibble -> even column, high nibble -> odd column.
                # Each nibble is shifted by -8 and then scaled by d.
                v0 = np.float32((packed_value & 0b00001111) - 8) * d
                v1 = np.float32((packed_value >> 4) - 8) * d
                weights[row, block * qk + 2 * i] = v0
                weights[row, block * qk + 2 * i + 1] = v1
    return weights
 def dequantize_weights(fin, n_rows, n_cols):
    """Read one 4-bit quantized tensor from ``fin`` and return it dequantized.

    The number of bytes read is half a byte per weight plus one 4-byte scale
    for every 32-column block; the raw bytes are handed to
    ``dequantize_weights_numba`` for unpacking.
    """
    block_len = 32
    blocks_per_row = n_cols // block_len
    packed_bytes = n_rows * n_cols // 2
    scale_bytes = n_rows * blocks_per_row * 4
    raw = fin.read(packed_bytes + scale_bytes)
    return dequantize_weights_numba(raw, n_rows, n_cols)
 def read_variables(fin):
    """Read every tensor record from ``fin`` until the end of the file.

    Each record is: three 32-bit ints (n_dims, name_length, ftype), n_dims
    32-bit dimension sizes (stored reversed), the tensor name, padding up to
    the next 32-byte boundary, and the tensor data.

    Supported ftype values:
        0 -> float32 data, returned as torch.float32
        1 -> float16 data, returned as torch.float16
        2 -> 4-bit quantized data, dequantized and returned as torch.float16

    Returns:
        dict mapping tensor name to torch tensor.

    Raises:
        ValueError: if a record uses an ftype other than 0, 1 or 2. Without
            this check the previous tensor's data would be silently stored
            under the new name.
    """
    model = {}
    pbar = tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables")
    try:
        while True:
            start_pos = fin.tell()
            try:
                n_dims, name_length, ftype_cur = struct.unpack("iii", fin.read(4 * 3))
            except struct.error:
                # A short read here means there are no more records.
                break
            shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
            shape = shape[::-1]
            name = fin.read(name_length).decode()
            # ensure tensor data is aligned
            tensor_data_offset = fin.tell()
            tensor_data_offset = (tensor_data_offset + 31) & -32
            fin.seek(tensor_data_offset)
            if ftype_cur == 2:
                # 4-bit quantized weights
                data = dequantize_weights(fin, shape[0], shape[1]).reshape(shape)
                torch_dtype = torch.float16
            elif ftype_cur == 0:
                data = np.fromfile(fin, dtype=np.float32, count=np.prod(shape)).reshape(shape)
                torch_dtype = torch.float32
            elif ftype_cur == 1:
                data = np.fromfile(fin, dtype=np.float16, count=np.prod(shape)).reshape(shape)
                torch_dtype = torch.float16
            else:
                raise ValueError(f"Unsupported ftype {ftype_cur} for tensor {name!r}")
            model[name] = torch.tensor(data, dtype=torch_dtype)
            pbar.update(fin.tell() - start_pos)
    finally:
        # Release the progress bar even when a record cannot be read.
        pbar.close()
    return model
 def convert_to_hf_format(model, hparams):
    """Rename (and, for wq/wk, row-permute) tensors into Hugging Face LLaMA naming.

    Args:
        model: dict of tensors keyed by the original ``layers.N.*`` names.
        hparams: dict providing "n_layers", "n_heads" and "dim".

    Returns:
        A new dict keyed by ``model.layers.N.*`` names, plus the rotary
        ``inv_freq`` buffer per layer and the embedding/norm/output tensors.
    """
    # This works for llama 7B, need to test with other models
    n_layers = hparams["n_layers"]
    n_heads = hparams["n_heads"]
    dim = hparams["dim"]
    head_dim = dim // n_heads
    rope_base = 10000.0
    inv_freq = 1.0 / (rope_base ** (torch.arange(0, head_dim, 2).float() / head_dim))
    def permute(w):
        # permute for sliced rotary
        per_head = w.view(n_heads, dim // n_heads // 2, 2, dim)
        return per_head.transpose(1, 2).reshape(dim, dim)
    def keep(w):
        return w
    # (Hugging Face suffix, source suffix, transform), listed in output order.
    layer_table = (
        ("self_attn.q_proj.weight", "attention.wq.weight", permute),
        ("self_attn.k_proj.weight", "attention.wk.weight", permute),
        ("self_attn.v_proj.weight", "attention.wv.weight", keep),
        ("self_attn.o_proj.weight", "attention.wo.weight", keep),
        ("mlp.gate_proj.weight", "feed_forward.w1.weight", keep),
        ("mlp.down_proj.weight", "feed_forward.w2.weight", keep),
        ("mlp.up_proj.weight", "feed_forward.w3.weight", keep),
        ("input_layernorm.weight", "attention_norm.weight", keep),
        ("post_attention_layernorm.weight", "ffn_norm.weight", keep),
    )
    state_dict = {}
    for layer_i in range(n_layers):
        for hf_suffix, src_suffix, transform in layer_table:
            source = model[f"layers.{layer_i}.{src_suffix}"]
            state_dict[f"model.layers.{layer_i}.{hf_suffix}"] = transform(source)
        state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
    state_dict["model.embed_tokens.weight"] = model["tok_embeddings.weight"]
    state_dict["model.norm.weight"] = model["norm.weight"]
    state_dict["lm_head.weight"] = model["output.weight"]
    return state_dict
 def chat(model, hparams, llama_dir):
    """Load ``model`` into a ``LlamaForCausalLM`` and run an interactive console chat.

    Args:
        model: state dict passed to ``load_state_dict(strict=True)``.
        hparams: dict providing "vocab_size", "dim", "n_layers" and "n_heads".
        llama_dir: directory handed to ``LlamaTokenizer.from_pretrained``.

    The prompt loop below is ``while True`` with no ``break``, so this function
    does not return normally.
    """
    # Imported inside the function, so transformers is only needed when chat() is called.
    from transformers import (GenerationConfig, LlamaForCausalLM,
                              LlamaTokenizer, StoppingCriteria,
                              StoppingCriteriaList)
    from transformers.models.llama.configuration_llama import LlamaConfig
    class StoppingCriteriaSub(StoppingCriteria):
        # Streams the decoded text on every call and asks generation to stop
        # once the last token id is 13.
        def __init__(self):
            super().__init__()
        # NOTE(review): the `stops` default list is never read in the body.
        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
            # `tokenizer` is the enclosing function's local, assigned further down
            # before generation starts.
            print(tokenizer.decode(input_ids[0]), end="", flush=True)
            # NOTE(review): 13 is presumably the newline token id — confirm against the tokenizer.
            if input_ids[0][-1] == 13:
                return True
            return False
    config = LlamaConfig(
        vocab_size=hparams["vocab_size"],
        dim=hparams["dim"],
        num_hidden_layers=hparams["n_layers"],
        num_attention_heads=hparams["n_heads"],
    )
    llama = LlamaForCausalLM(config=config)
    llama.load_state_dict(state_dict=model, strict=True)
    tokenizer = LlamaTokenizer.from_pretrained(llama_dir)
    # Inference runs on the CPU.
    device = torch.device("cpu")
    llama = llama.to(device)
    # Initial conversation context shown to the user and fed to the model.
    ctx = """You are AI.
 This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
 User: Hello, AI.
 AI: Hello! How can I assist you today?
 """
    print(ctx.rstrip("\n"))
    while True:
        print("-" * 60)
        prompt = input("User: ")
        # Append the new user turn to the running context.
        if ctx != "":
            ctx = f"{ctx}User: {prompt}\n"
        else:
            ctx = f"{prompt}\nAI:"
        # Once the context reaches 2048 characters, keep only its last 1920
        # characters (a character count, not a token count).
        ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
        print("-" * 60)
        if len(ctx.strip()) > 0:
            input_ids = tokenizer(ctx, return_tensors="pt")["input_ids"].to(device)
            generation_config = GenerationConfig(
                temperature=0.8,
                top_p=0.95,
                top_k=50,
                repetition_penalty=1.1764,
            )
            with torch.no_grad():
                generation_output = llama.generate(
                    input_ids=input_ids,
                    generation_config=generation_config,
                    return_dict_in_generate=True,
                    output_scores=True,
                    max_length=2048,
                    do_sample=True,
                    stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub()]),
                )
            # The decoded full sequence (prompt plus reply) becomes the next context.
            s = generation_output.sequences[0]
            decoded = tokenizer.decode(s)
            ctx = f"{decoded}\n"
 def main():
    """Command-line entry point: convert ggml file(s) to a torch checkpoint.

    All files in ``--input_dir`` whose name starts with ``--prefix`` are read
    in sorted order. Header and vocabulary come from the first file; tensors
    from all files are merged. The result is saved as
    ``<input_dir>/<prefix>-to-torch.pth``. With ``--hf`` the tensors are first
    renamed to the Hugging Face layout; with ``--chat`` an interactive chat is
    started afterwards.

    Raises:
        FileNotFoundError: if no input file matches the prefix.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
    )
    parser.add_argument(
        "--prefix",
        "-p",
        type=str,
        required=True,
        help="The prefix of the ggml files (ggml-model-f16 or ggml-model-q4_0).",
    )
    parser.add_argument(
        "--hf",
        action="store_true",
        help="Whether to save the model in the Hugging Face format. (default: False)",
    )
    parser.add_argument(
        "--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
    )
    args = parser.parse_args()
    llama_dir = os.path.abspath(f"{args.input_dir}/../")
    out_name = f"{args.prefix}-to-torch.pth"
    # The output file shares the prefix and lives in the same directory, so it
    # must be excluded or a second run would try to parse it as ggml.
    ggml_files = sorted(
        [
            f"{args.input_dir}/{f}"
            for f in os.listdir(args.input_dir)
            if f.startswith(args.prefix) and f != out_name
        ]
    )
    if not ggml_files:
        raise FileNotFoundError(f"No files starting with {args.prefix!r} found in {args.input_dir!r}")
    # Context managers close each input file once it has been fully read.
    with open(ggml_files[0], "rb") as fin:
        hparams, _ = read_header(fin)
        tokens = read_tokens(fin, hparams["vocab_size"])
        model = read_variables(fin)
    for f in tqdm(ggml_files[1:]):
        with open(f, "rb") as fin:
            # Header and vocabulary are skipped; only the tensors are merged.
            read_header(fin)
            read_tokens(fin, hparams["vocab_size"])
            model.update(read_variables(fin))
    if args.hf:
        model = convert_to_hf_format(model, hparams)
    pth_ckpt = {
        "state_dict": model,
        "hparams": hparams,
        "tokens": tokens,
    }
    torch.save(pth_ckpt, f"{args.input_dir}/{out_name}")
    if args.chat:
        if not args.hf:
            model = convert_to_hf_format(model, hparams)
        chat(model, hparams, llama_dir)
 if __name__ == "__main__":
    main()
--- a/convert-gpt4all-to-ggml.py
+++ b/convert-gpt4all-to-ggml.py
@ -1,107 +0,0 @@
 #!/usr/bin/env python3
 #
 # TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
 #
 # Original by https://github.com/eiz
 # https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
 import argparse
 import glob
 import os
 import struct
 import sys
 from sentencepiece import SentencePieceProcessor
 # Names of the hyperparameter fields; read_header() uses len(HPARAMS) to size the header.
 # The chained assignment also binds `keys` to the same list object.
 HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
 def parse_args():
    """Parse the two required positional arguments from the command line.

    Returns:
        argparse.Namespace with ``gpt4all_model`` and ``tokenizer_model``.
    """
    cli = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
    positionals = (
        ('gpt4all_model', 'path to gpt4all-lora-quantized.bin'),
        ('tokenizer_model', 'path to LLaMA tokenizer.model file'),
    )
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, help=arg_help)
    return cli.parse_args()
 def read_header(f_in):
    """Read the header from ``f_in`` and return its fields as a tuple of ints.

    The header holds three 32-bit ints in addition to one per entry of
    ``HPARAMS``.
    """
    header_fmt = f"{3 + len(HPARAMS)}i"
    raw = f_in.read(struct.calcsize(header_fmt))
    return struct.unpack(header_fmt, raw)
 def write_header(f_out, header):
    """Write an upgraded header to ``f_out``.

    Args:
        f_out: binary file object to write to.
        header: 8-tuple (magic, vocab_size, dim, multiple_of, n_heads,
            n_layers, rot, ftype) as read from the old file.

    The old magic is replaced by 0x67676d66 ("ggmf") and a file version of 1
    is inserted after it; the remaining fields are copied unchanged.

    Raises:
        Exception: if the input magic is not 0x67676d6c ("ggml").
    """
    magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = header
    old_magic = 0x67676d6c
    if magic != old_magic:
        raise Exception('Invalid file magic. Must be an old style ggml file.')
    new_magic = 0x67676d66  # "ggmf" in hex
    file_version = 1
    payload = (new_magic, file_version, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype)
    f_out.write(struct.pack(f"{len(payload)}i", *payload))
 def write_tokens(fout, tokenizer):
    """Write the tokenizer vocabulary to ``fout``, followed by one extra "<pad>" entry.

    Every entry is a 32-bit byte length, the token bytes, and a 32-bit float
    score. Unknown tokens are written as " \u2047 ", control tokens as empty
    text, byte tokens as their single byte, and all other pieces with
    "\u2581" replaced by a space. A byte piece that is not exactly six
    characters long aborts the process with exit status 1.
    """
    def token_bytes(token_id):
        if tokenizer.is_unknown(token_id):
            return " \u2047 ".encode()
        if tokenizer.is_control(token_id):
            return b""
        if tokenizer.is_byte(token_id):
            piece = tokenizer.id_to_piece(token_id)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            # The hex digits sit between the first three characters and the last one.
            return struct.pack("B", int(piece[3:-1], 16))
        return tokenizer.id_to_piece(token_id).replace("\u2581", " ").encode()
    for token_id in range(tokenizer.vocab_size()):
        encoded = token_bytes(token_id)
        fout.write(struct.pack("i", len(encoded)))
        fout.write(encoded)
        fout.write(struct.pack("f", tokenizer.get_score(token_id)))
    # TODO: GPT4All - add extra <pad> token
    pad = "<pad>".encode()
    fout.write(struct.pack("i", len(pad)))
    fout.write(pad)
    fout.write(struct.pack("f", 0.0))
 def read_tokens(f_in, tokenizer):
    """Skip over the vocabulary stored in ``f_in``.

    One entry per tokenizer vocabulary item is consumed: a 32-bit length
    followed by that many bytes. Nothing is returned; only the file position
    advances.
    """
    remaining = tokenizer.vocab_size()
    while remaining > 0:
        (n_bytes,) = struct.unpack("i", f_in.read(4))
        f_in.read(n_bytes)
        remaining -= 1
 def copy_all_data(f_out, f_in):
    """Copy everything remaining in ``f_in`` to ``f_out`` in chunks of up to 1 MiB."""
    chunk_size = 1024 * 1024
    chunk = f_in.read(chunk_size)
    while chunk:
        f_out.write(chunk)
        chunk = f_in.read(chunk_size)
 def convert_one_file(path_in, tokenizer):
    """Rewrite the model file at ``path_in`` in the upgraded format.

    The converted data is first written to ``<path_in>.tmp``. Afterwards the
    original is renamed to ``<path_in>.orig`` and the temporary file takes the
    original's name.
    """
    tmp_path, backup_path = f"{path_in}.tmp", f"{path_in}.orig"
    print(f"converting {path_in}")
    with open(path_in, "rb") as src:
        with open(tmp_path, "wb") as dst:
            old_header = read_header(src)
            write_header(dst, old_header)
            # Skip the old vocabulary and emit a fresh one from the tokenizer.
            read_tokens(src, tokenizer)
            write_tokens(dst, tokenizer)
            copy_all_data(dst, src)
    os.rename(path_in, backup_path)
    os.rename(tmp_path, path_in)
 def main():
    """Script entry point: parse the arguments, load the tokenizer and convert the model file."""
    cli_args = parse_args()
    sp_tokenizer = SentencePieceProcessor(cli_args.tokenizer_model)
    convert_one_file(path_in=cli_args.gpt4all_model, tokenizer=sp_tokenizer)
 if __name__ == "__main__":
    main()
--- a/convert-gptq-to-ggml.py
+++ b/convert-gptq-to-ggml.py
@ -1,172 +0,0 @@
 # Convert a GPTQ quantized LLaMA model to a ggml compatible file
 # Based on: https://github.com/qwopqwop200/GPTQ-for-LLaMa
 #
 import os
 import re
 import sys
 import json
 import struct
 import numpy as np
 import torch
 from sentencepiece import SentencePieceProcessor
 # Exactly three positional arguments are required: model, tokenizer, output file.
 if len(sys.argv) != 4:
    print("Usage: convert-gptq-to-ggml.py llamaXXb-4bit.pt tokenizer.model out.bin\n")
    sys.exit(1)
 fname_model = sys.argv[1]
 fname_tokenizer = sys.argv[2]
 # NOTE(review): dir_out holds the same argument as fname_out below and is not
 # referenced again in the visible code.
 dir_out = sys.argv[3]
 model = torch.load(fname_model, map_location="cpu")
 n_vocab, n_embd = model['model.embed_tokens.weight'].shape
 # The layer count is one more than the highest layer index found in the tensor names.
 n_layer = 1 + max(int(m.group(1)) for name in model
                   if (m := re.match(r'model\.layers\.([0-9]+)', name)))
 # hardcoded:
 n_mult = 256
 # Head count is looked up from the layer count; any other layer count raises KeyError.
 n_head = {32: 32, 40: 40, 60: 52, 80: 64}[n_layer]
 tokenizer = SentencePieceProcessor(fname_tokenizer)
 assert tokenizer.vocab_size() == n_vocab
 fname_out = sys.argv[3]
 # Module-level output handle; the functions defined below write to it and it
 # is closed at the end of the script.
 fout = open(fname_out, "wb")
 # File header: nine 32-bit ints.
 fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
 fout.write(struct.pack("i", 1)) # file version
 fout.write(struct.pack("i", n_vocab))
 fout.write(struct.pack("i", n_embd))
 fout.write(struct.pack("i", n_mult))
 fout.write(struct.pack("i", n_head))
 fout.write(struct.pack("i", n_layer))
 fout.write(struct.pack("i", n_embd // n_head)) # rot (obsolete)
 # NOTE(review): presumably the file-level ftype field; confirm what value 4 denotes.
 fout.write(struct.pack("i", 4))
 # Vocabulary: for every token a 32-bit length, the token bytes, and a float score.
 # This loop unchanged from convert-pth-to-ggml.py:
 for i in range(tokenizer.vocab_size()):
    if tokenizer.is_unknown(i):
        text = " \u2047 ".encode()
    elif tokenizer.is_control(i):
        text = b""
    elif tokenizer.is_byte(i):
        piece = tokenizer.id_to_piece(i)
        # A byte piece must be exactly six characters long, otherwise the script aborts.
        if len(piece) != 6:
            print(f"Invalid token: {piece}")
            sys.exit(1)
        byte_value = int(piece[3:-1], 16)
        text = struct.pack("B", byte_value)
    else:
        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    fout.write(struct.pack("f", tokenizer.get_score(i)))
 def write_header(shape, dst_name, ftype_cur):
    """Write one tensor header to the module-level ``fout`` and align the file position.

    The header is three 32-bit ints (number of dimensions, name length in
    bytes, ftype), the dimensions in reversed order, and the encoded name.
    The file position is then moved up to the next multiple of 32.
    """
    name_bytes = dst_name.encode()
    n_dims = len(shape)
    fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype_cur))
    fout.write(struct.pack("i" * n_dims, *shape[::-1]))
    fout.write(name_bytes)
    # ensure tensor data is aligned
    aligned_offset = (fout.tell() + 31) & ~31
    fout.seek(aligned_offset)
 def convert_non_q4(src_name, dst_name):
    """Write the unquantized tensor ``model[src_name]`` to the output as ``dst_name``.

    One-dimensional tensors are converted to float32 first. The tensor must
    then be float16 (ftype 1) or float32 (ftype 0); any other dtype raises
    KeyError.
    """
    tensor = model[src_name]
    dims = tensor.shape
    print(f"Processing non-Q4 variable: {src_name} with shape: {dims} and type: {tensor.dtype}")
    if len(dims) == 1:
        print("  Converting to float32")
        tensor = tensor.to(torch.float32)
    ftype_by_dtype = {torch.float32: 0, torch.float16: 1}
    # header
    write_header(dims, dst_name, ftype_by_dtype[tensor.dtype])
    # data
    tensor.numpy().tofile(fout)
 def convert_q4(src_name, dst_name, permute=False):
    """Repack the GPTQ-quantized tensor ``src_name`` and write it as ``dst_name`` with ftype 3.

    Reads the ``.zeros``, ``.scales``, ``.bias`` and ``.qweight`` entries of
    ``model``. The bias must be all zeros (asserted). With ``permute=True``
    the rows are reordered per attention head before writing.
    """
    zeros = model[f"{src_name}.zeros"].numpy()
    scales = model[f"{src_name}.scales"].numpy()
    bias = model[f"{src_name}.bias"].numpy()
    qweight = model[f"{src_name}.qweight"].numpy().T # transpose
    # Q4_1 does not support bias; good thing the bias is always all zeros.
    assert not np.any(bias)
    # Each int32 item is actually 8 int4 items packed together, and it's transposed.
    shape = (qweight.shape[0], qweight.shape[1] * 8)
    print(f"Processing Q4 variable: {src_name} with shape: {shape}")
    # The output format has the int4 weights in groups of 32 rather than 8.
    # It looks like this (matching the concatenation order used below):
    # For each row:
    #   For each group of 32 columns:
    #     - scale (float32, 4 bytes)
    #     - addend (float32, 4 bytes)
    #     - weights (int4 * 32, 16 bytes)
    # Note that in the input, the scales and addends are shared between all
    # the columns in a row, so we end up wasting quite a bit of memory with
    # repeated scales and addends.
    addends = -zeros # flip sign
    # Since the output format is mixed between integers and floats, we have
    # to hackily view the floats as int32s just so numpy will let us
    # concatenate them.
    # NOTE(review): this assumes zeros and scales are 4-byte floats — confirm.
    addends_view = addends.view(dtype=np.int32)
    scales_view = scales.view(dtype=np.int32)
    # Split into groups of 4 columns (i.e. 32 columns of quantized data):
    grouped = qweight.reshape([qweight.shape[0], qweight.shape[1] // 4, 4])
    # Repeat addends and scales:
    addends_rep = np.atleast_3d(addends_view).repeat(grouped.shape[1], axis=1)
    scales_rep = np.atleast_3d(scales_view).repeat(grouped.shape[1], axis=1)
    # Per group: scale first, then addend, then the four packed int32 words.
    blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting='no')
    if permute:
        # Permute some rows to undo the permutation done by convert_llama_weights_to_hf.py.
        # This can be done after the above conversion because it doesn't affect column order/layout.
        blob = (blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
                    .swapaxes(1, 2)
                    .reshape(blob.shape))
    # header
    write_header(shape, dst_name, 3) # ftype = Q4_1
    # data
    blob.tofile(fout)
 # Tensors outside the transformer layers are written first, unquantized.
 for _src, _dst in (
    ("model.embed_tokens.weight", "tok_embeddings.weight"),
    ("model.norm.weight", "norm.weight"),
    ("lm_head.weight", "output.weight"),
 ):
    convert_non_q4(_src, _dst)
 # Per-layer quantized projections: (source suffix, destination suffix, permute rows).
 _q4_tensors = (
    ("self_attn.q_proj", "attention.wq.weight", True),
    ("self_attn.k_proj", "attention.wk.weight", True),
    ("self_attn.v_proj", "attention.wv.weight", False),
    ("self_attn.o_proj", "attention.wo.weight", False),
    ("mlp.gate_proj", "feed_forward.w1.weight", False),
    ("mlp.down_proj", "feed_forward.w2.weight", False),
    ("mlp.up_proj", "feed_forward.w3.weight", False),
 )
 # Per-layer norm weights, written unquantized after the projections.
 _norm_tensors = (
    ("input_layernorm.weight", "attention_norm.weight"),
    ("post_attention_layernorm.weight", "ffn_norm.weight"),
 )
 for i in range(n_layer):
    for _src, _dst, _permute in _q4_tensors:
        convert_q4(f"model.layers.{i}.{_src}", f"layers.{i}.{_dst}", permute=_permute)
    for _src, _dst in _norm_tensors:
        convert_non_q4(f"model.layers.{i}.{_src}", f"layers.{i}.{_dst}")
 fout.close()
 print(f"Done. Output file: {fname_out}")
 print()
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@ -1,274 +1,11 @@
-# Convert a LLaMA model checkpoint to a ggjt compatible file
+# Compatibility stub
 #
 # Load the model using Torch
 # Iterate over all variables and write them to a binary file.
 #
 # For each variable, write the following:
 #   - Number of dimensions (int)
 #   - Name length (int)
 #   - Dimensions (int[n_dims])
 #   - Name (char[name_length])
 #   - Data (float[n_dims])
 #
 # At the start of the ggml file we write the model parameters
 # and vocabulary.
 #
 import argparse
 import os
 import sys
 import json
 import struct
 import numpy as np
 import torch
-from sentencepiece import SentencePieceProcessor
+import convert
-QK = 32
+parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
-
+parser.add_argument('dir_model',  help='directory containing the model checkpoint')
-GGML_TYPE_Q4_0  = 0
+parser.add_argument('ftype',      help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
-GGML_TYPE_Q4_1  = 1
+args = parser.parse_args()
-GGML_TYPE_I8    = 2
+convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])
 GGML_TYPE_I16   = 3
 GGML_TYPE_I32   = 4
 GGML_TYPE_F16   = 5
 GGML_TYPE_F32   = 6
 WTYPES = {
    0: GGML_TYPE_F32,
    1: GGML_TYPE_F16,
    2: GGML_TYPE_Q4_0,
    3: GGML_TYPE_Q4_1,
 }
 GGML_BLCK_SIZE = {
    GGML_TYPE_Q4_0:  QK,
    GGML_TYPE_Q4_1:  QK,
    GGML_TYPE_I8:    1,
    GGML_TYPE_I16:   1,
    GGML_TYPE_I32:   1,
    GGML_TYPE_F16:   1,
    GGML_TYPE_F32:   1,
 }
 GGML_TYPE_SIZE = {
    GGML_TYPE_Q4_0: 4   + QK//2,
    GGML_TYPE_Q4_1: 4*2 + QK//2,
    GGML_TYPE_I8:   1,
    GGML_TYPE_I16:  2,
    GGML_TYPE_I32:  4,
    GGML_TYPE_F16:  2,
    GGML_TYPE_F32:  4,
 }
 def ggml_nelements(shape):
    """Return the number of elements in a tensor with the given shape.

    The result is the product of every entry of `shape`; an empty shape gives 1.
    """
    total = 1
    for extent in shape:
        total = total * extent
    return total
 def ggml_nbytes(shape, ftype):
    """Return the byte size of a tensor of `shape` stored with file type `ftype`.

    `ftype` is translated to a ggml type through WTYPES. The element count is
    multiplied by that type's GGML_TYPE_SIZE entry and then floor-divided by
    its GGML_BLCK_SIZE entry.
    """
    n_elements = ggml_nelements(shape)
    ggml_type = WTYPES[ftype]
    return (n_elements * GGML_TYPE_SIZE[ggml_type]) // GGML_BLCK_SIZE[ggml_type]
 def parse_args():
    """Parse the command line of the checkpoint converter.

    Positional arguments, in order: `dir_model` (checkpoint directory),
    `ftype` (0 or 1) and the optional integer `vocab_only` (0 when omitted).
    Returns the populated argparse namespace.
    """
    cli = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
    positionals = (
        ('dir_model', dict(help='directory containing the model checkpoint')),
        ('ftype', dict(help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)),
        ('vocab_only', dict(help='only write vocab to file', type=int, default=0, nargs='?')),
    )
    for arg_name, options in positionals:
        cli.add_argument(arg_name, **options)
    return cli.parse_args()
 def get_n_parts(dim):
    """Return how many checkpoint parts a model of width `dim` is split into.

    Only the widths listed in the table are accepted; any other value prints
    an error and terminates the process with exit status 1.
    """
    parts_by_dim = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
    if dim not in parts_by_dim:
        print(f"Invalid dim: {dim}")
        sys.exit(1)
    n_parts = parts_by_dim[dim]
    print(f"n_parts = {n_parts}\n")
    return n_parts
 def load_hparams_and_tokenizer(dir_model):
    """Load `params.json` from `dir_model` and the tokenizer from its parent directory.

    `dir_model` looks like `models/7B` or `models/7B/`. The parent is derived
    from the normalized path rather than by appending `..`, because the `..`
    form would not find `tokenizer.model` when `dir_model` is a symlink.

    Returns `(hparams, tokenizer)`, with `hparams["vocab_size"]` set from the
    tokenizer.
    """
    parent_dir = os.path.dirname(os.path.normpath(dir_model))
    with open(f"{dir_model}/params.json", "r") as params_file:
        hparams = json.load(params_file)
        print(hparams)
    tokenizer = SentencePieceProcessor(f"{parent_dir}/tokenizer.model")
    hparams.update(vocab_size=tokenizer.vocab_size())
    return hparams, tokenizer
 def write_header(fout, hparams, ftype):
    """Write the file header to `fout` as nine native ints.

    Field order: magic, file version, vocab_size, dim, multiple_of, n_heads,
    n_layers, rot (dim // n_heads, obsolete) and `ftype`.
    """
    fields = [
        0x67676a74,  # magic: "ggjt"
        1,           # file version
    ]
    for key in ("vocab_size", "dim", "multiple_of", "n_heads", "n_layers"):
        fields.append(hparams[key])
    fields.append(hparams["dim"] // hparams["n_heads"])  # rot (obsolete)
    fields.append(ftype)
    fout.write(struct.pack(f"{len(fields)}i", *fields))
 def write_tokens(fout, tokenizer):
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
            text = " \u2047 ".encode()
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
            piece = tokenizer.id_to_piece(i)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))
 def process_and_write_variables(fout, model, ftype, part_id, n_parts):
    """Write every tensor of one checkpoint part into the output file.

    `model` maps tensor names to tensors; entries whose name ends in "freqs"
    are skipped. For each remaining tensor a header is written (n_dims, name
    length, per-tensor ftype, dimensions in reversed order, name), the file is
    zero-padded to a multiple of QK bytes, and the data of this part is placed
    at its position inside the full tensor. 1-D tensors, and all tensors when
    `n_parts` is 1, are written only by part 0. On return the file position is
    just past the full tensor that was processed last.
    """
    for name, datao in model.items():
        if name.endswith("freqs"):
            continue
        # remove dimensions with a single element
        data = datao.numpy().squeeze()
        partshape = data.shape
        n_dims = len(data.shape)
        assert n_dims in (1, 2)
        print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")
        # coerce single-dimensional tensors from float16 to float32
        ftype_cur = 1
        if ftype == 0 or n_dims == 1:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
        blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
        type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
        # determine dimension along which multipart tensor is sharded
        #
        # split_dim 0 regex:
        #   - output.*
        #   - layers.*.attention.wq.weight
        #   - layers.*.attention.wk.weight
        #   - layers.*.attention.wv.weight
        #   - layers.*.feed_forward.w1.weight
        #   - layers.*.feed_forward.w3.weight
        #
        # split_dim 1 regex:
        #   - tok_embeddings.*
        #   - layers.*.attention.wo.weight
        #   - layers.*.feed_forward.w2.weight
        #
        if n_dims > 1:
            # default to 1; names matching none of the branches below keep it
            split_dim = 1
            if "tok_embeddings" in name:
                split_dim = 1
            elif "layers" in name:
                if "attention.wo.weight" in name:
                    split_dim = 1
                elif "feed_forward.w2.weight" in name:
                    split_dim = 1
                else:
                    split_dim = 0
            elif "output" in name:
                split_dim = 0
        # output tensor header
        # the header describes the full tensor, so scale the sharded dimension by n_parts
        fullshape = list(partshape)
        if n_dims > 1:
            fullshape[split_dim] *= n_parts
        sname = name.encode()
        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
        fout.write(sname)
        # ensure tensor data is aligned
        tensor_data_offset = fout.tell()
        while tensor_data_offset % QK != 0:
            fout.write(struct.pack("B", 0))
            tensor_data_offset += 1
        # output unified mappable tensor data
        if n_dims == 1 or n_parts == 1:
            # copy tensor which we thankfully received in one piece
            if part_id == 0:
                data.tofile(fout)
        elif split_dim == 0:
            # reassemble multifile tensor containing some of the rows
            # this part's rows are contiguous in the output: seek to its first row
            rows_per_chunk = partshape[0]
            current_row = part_id * rows_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset = current_row * bytes_per_row
            fout.seek(tensor_data_offset + offset)
            data.tofile(fout)
        elif split_dim == 1:
            # reassemble multifile tensor containing some of the cols
            # this part holds a column slice of every row, so each row is written separately
            cols_per_chunk = partshape[1]
            current_col = part_id * cols_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset_current_col = current_col // blck_size * type_size
            for row in range(partshape[0]):
                offset_row = row * bytes_per_row
                offset = offset_row + offset_current_col
                fout.seek(tensor_data_offset + offset)
                data[row].tofile(fout)
        # advance file position to next tensor
        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))
 def main():
    """Convert the checkpoint named on the command line into a single output file.

    In vocab-only mode just the header and token table are written to
    `ggml-vocab.bin`. Otherwise every `consolidated.0N.pth` part is loaded in
    turn and merged into `ggml-model-f32.bin` or `ggml-model-f16.bin` inside
    the model directory.
    """
    args = parse_args()
    model_dir = args.dir_model
    out_ftype = args.ftype
    hparams, tokenizer = load_hparams_and_tokenizer(model_dir)
    print(args)
    if args.vocab_only:
        # vocab-only mode: header plus token table, no tensors
        first_part_path = f"{model_dir}/consolidated.00.pth"
        vocab_path = f"{model_dir}/ggml-vocab.bin"
        print(f"Extracting only the vocab from '{first_part_path}'\n")
        with open(vocab_path, "wb") as fout:
            write_header(fout, hparams, out_ftype)
            write_tokens(fout, tokenizer)
        print(f"Done. Output file: {vocab_path}\n")
        return
    n_parts = get_n_parts(hparams["dim"])
    type_suffix = ["f32", "f16"][out_ftype]
    model_path_out = f"{model_dir}/ggml-model-{type_suffix}.bin"
    # all parts are merged into this one output file
    with open(model_path_out, "wb") as fout:
        write_header(fout, hparams, out_ftype)
        write_tokens(fout, tokenizer)
        tensors_start = fout.tell()
        for part_id in range(n_parts):
            # every part starts again at the first tensor and fills in its own slices
            fout.seek(tensors_start)
            print(f"Processing part {part_id+1} of {n_parts}\n")
            part_path = f"{model_dir}/consolidated.0{part_id}.pth"
            part_tensors = torch.load(part_path, map_location="cpu")
            process_and_write_variables(fout, part_tensors, out_ftype, part_id, n_parts)
            # drop the reference before the next part is loaded
            del part_tensors
    print(f"Done. Output file: {model_path_out}\n")
 if __name__ == "__main__":
    main()
--- a/convert-unversioned-ggml-to-ggml.py
+++ b/convert-unversioned-ggml-to-ggml.py
@ -1,100 +0,0 @@
 #!/usr/bin/env python3
 # Original by https://github.com/eiz
 # https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
 import argparse
 import glob
 import os
 import struct
 import sys
 from sentencepiece import SentencePieceProcessor
 HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
 def parse_args():
    """Parse the command line of the upgrade script.

    Expects two positional arguments: `dir_model` and `tokenizer_model`.
    Returns the populated argparse namespace.
    """
    cli = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
    for arg_name, arg_help in (
        ('dir_model', 'directory containing ggml .bin files'),
        ('tokenizer_model', 'path to LLaMA tokenizer.model file'),
    ):
        cli.add_argument(arg_name, help=arg_help)
    return cli.parse_args()
 def read_header(f_in):
    """Read the old-format header from `f_in` and return it as a tuple of ints.

    The header holds three ints plus one int per entry of HPARAMS.
    """
    header_fmt = f"{3 + len(HPARAMS)}i"
    raw = f_in.read(struct.calcsize(header_fmt))
    return struct.unpack(header_fmt, raw)
 def write_header(f_out, header):
    """Write the upgraded header to `f_out`, derived from an old-format `header`.

    `header` must be the 8-tuple (magic, vocab_size, dim, multiple_of, n_heads,
    n_layers, rot, ftype). The output carries the new magic, a file version of
    1, and the remaining seven fields unchanged.

    Raises Exception when the old header's magic is not 0x67676d6c.
    """
    old_magic = 0x67676d6c  # "ggml"
    new_magic = 0x67676d66  # "ggmf"
    magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = header
    if magic != old_magic:
        raise Exception('Invalid file magic. Must be an old style ggml file.')
    file_version = 1
    upgraded = (new_magic, file_version, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype)
    f_out.write(struct.pack(f"{len(upgraded)}i", *upgraded))
 def write_tokens(fout, tokenizer):
    """Write the whole vocabulary of `tokenizer` to `fout`.

    One record per token id: int byte length, token bytes, float score.
    Unknown tokens become " \u2047 ", control tokens become empty, byte tokens
    (six-character pieces) become the single byte named by their two hex
    digits, and all other pieces have U+2581 replaced by a space. A byte token
    with a piece of any other length ends the process with exit status 1.
    """
    def token_bytes(token_id):
        # Map one token id to the bytes stored for it.
        if tokenizer.is_unknown(token_id):
            return " \u2047 ".encode()
        if tokenizer.is_control(token_id):
            return b""
        if tokenizer.is_byte(token_id):
            byte_piece = tokenizer.id_to_piece(token_id)
            if len(byte_piece) != 6:
                print(f"Invalid token: {byte_piece}")
                sys.exit(1)
            return struct.pack("B", int(byte_piece[3:-1], 16))
        return tokenizer.id_to_piece(token_id).replace("\u2581", " ").encode()
    for token_id in range(tokenizer.vocab_size()):
        payload = token_bytes(token_id)
        fout.write(struct.pack("i", len(payload)))
        fout.write(payload)
        fout.write(struct.pack("f", tokenizer.get_score(token_id)))
 def read_tokens(f_in, tokenizer):
    """Skip over the token table of an old-format file.

    Reads `tokenizer.vocab_size()` records from `f_in`, each an int byte length
    followed by that many bytes, and discards them.
    """
    for _ in range(tokenizer.vocab_size()):
        (text_len,) = struct.unpack("i", f_in.read(4))
        f_in.read(text_len)
 def copy_all_data(f_out, f_in):
    """Copy everything left in `f_in` to `f_out`, reading 1 MiB at a time."""
    chunk_size = 1024 * 1024
    chunk = f_in.read(chunk_size)
    while chunk:
        f_out.write(chunk)
        chunk = f_in.read(chunk_size)
 def convert_one_file(path_in, tokenizer):
    """Upgrade one old-format model file in place.

    The converted file is first written to `<path_in>.tmp`. The original is
    then renamed to `<path_in>.orig` and the temporary file takes its place.
    """
    tmp_path = f"{path_in}.tmp"
    backup_path = f"{path_in}.orig"
    print(f"converting {path_in}")
    with open(path_in, "rb") as f_in:
        with open(tmp_path, "wb") as f_out:
            old_header = read_header(f_in)
            write_header(f_out, old_header)
            # discard the old token table, then write a fresh one from the tokenizer
            read_tokens(f_in, tokenizer)
            write_tokens(f_out, tokenizer)
            # the rest of the file is copied through unchanged
            copy_all_data(f_out, f_in)
    os.rename(path_in, backup_path)
    os.rename(tmp_path, path_in)
 def main():
    """Upgrade every `*.bin` and `*.bin.*` file found in the given model directory."""
    args = parse_args()
    model_files = glob.glob(f"{args.dir_model}/*.bin") + glob.glob(f"{args.dir_model}/*.bin.*")
    tokenizer = SentencePieceProcessor(args.tokenizer_model)
    for model_file in model_files:
        convert_one_file(model_file, tokenizer)
 if __name__ == "__main__":
    main()
--- a/convert.py
+++ b/convert.py
--- a/examples/alpaca.sh
+++ b/examples/alpaca.sh
@ -7,4 +7,4 @@
 cd `dirname $0`
 cd ..
-./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
+./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt --ctx_size 2048 -n -1 -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
--- a/examples/benchmark/benchmark-q4_0-matmult.c
+++ b/examples/benchmark/benchmark-q4_0-matmult.c
@ -0,0 +1,270 @@
 /*
    License: MIT License
    Changelog:
    - 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel)
 */
 #include <locale.h>
 #include "ggml.h"
 #include <assert.h>
 #include <math.h>
 #include <cstring>
 #include <cstdio>
 #include <cinttypes>
 #include <unordered_map>
 #include <queue>
 #include <string.h>
 #include <cassert>
 #include <fstream>
 #include <string>
 #include <iterator>
 #include <algorithm>
 // Sum the elements of an F32 tensor over its first two dimensions.
 // Tensors of any other type yield 0.
 float tensor_sum_elements(struct ggml_tensor * tensor) {
    if (tensor->type != GGML_TYPE_F32) {
        return 0;
    }
    const float * values = (const float *) tensor->data;
    float total = 0;
    for (int row = 0; row < tensor->ne[1]; row++) {
        for (int col = 0; col < tensor->ne[0]; col++) {
            total += values[row*tensor->ne[0]+col];
        }
    }
    return total;
 }
 /*
    The following types have no name in TENSOR_TYPE_AS_STR and are reported as "UNKNOWN":
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
    GGML_TYPE_COUNT,
 */
 // TENSOR_TYPE_AS_STR(TYPE): name of a ggml type as a string literal ("FP32", "FP16",
 // "Q4_0", "Q4_1"), or "UNKNOWN" for anything else. It expands to a bare conditional
 // expression without outer parentheses, so it is only safe where operator precedence
 // cannot interfere (e.g. as a function argument).
 #define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
 // TENSOR_DUMP(TENSOR): print the expression text, type, first three ne[] and nb[] entries,
 // and the element sum (via tensor_sum_elements) of a tensor pointer.
 // It expands to a printf statement followed by a separate braced block, i.e. two
 // statements, so it must not be used as the body of an unbraced if/else/loop.
 // NOTE(review): %d is used for ne[] and %li for nb[] - confirm against their types in ggml.h.
 #define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
        TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
        TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
    { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
 // Command-line settings of the benchmark, with their defaults.
 struct benchmark_params_struct {
    // Thread count copied into each compute graph (-t / --threads).
    int32_t n_threads{1};
    // Number of timed runs of the Q4_0 multiplication (-i / --iter).
    int32_t n_iterations{10};
 };
 // Print the usage text to stderr. argv[0] is shown as the program name and the
 // values in `params` are shown as the defaults of -t and -i.
 void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
    FILE * out = stderr;
    fprintf(out, "usage: %s [options]\n", argv[0]);
    fputs("\n", out);
    fputs("options:\n", out);
    fputs("  -h, --help            show this help message and exit\n", out);
    fprintf(out, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(out, "  -i N, --iter N     number of iterations to use during computation (default: %d)\n", params.n_iterations);
    fputs("\n", out);
 }
 // Benchmark driver.
 //
 // Parses -t/--threads and -i/--iter, multiplies two F32 matrices once to obtain a
 // reference result, then repeatedly times the same multiplication with a Q4_0
 // quantized left-hand matrix. After every timed run the element sum of the Q4_0
 // result is compared with the F32 reference, and a second Q4_0 graph is computed
 // so that the next timed run does not start with warm caches.
 //
 // Exit status: 0 on success (and for --help), 1 on a bad command line, on
 // ggml_init() failure, or when a Q4_0 result deviates too far from the reference.
 int main(int argc, char ** argv)  {
    struct benchmark_params_struct benchmark_params;
    bool invalid_param = false;
    std::string arg;
    for (int i = 1; i < argc; i++) {
        arg = argv[i];
        if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            benchmark_params.n_threads = std::stoi(argv[i]);
        } else if (arg == "-i" || arg == "--iter") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            benchmark_params.n_iterations = std::stoi(argv[i]);
        }  else if (arg == "-h" || arg == "--help") {
            print_usage(argc, argv, benchmark_params);
            exit(0);
        }
    }
    // This check has to live after the loop: the `break` statements above leave
    // the loop immediately, so a check inside the loop body would never run.
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        print_usage(argc, argv, benchmark_params);
        exit(1);
    }
    // create the ggml context
    printf("Starting Test\n");
    struct ggml_context * ctx;
    //const int sizex = 4096;
    //const int sizey = 11008;
 #undef VERBOSE_DEBUGGING
 #ifndef VERBOSE_DEBUGGING
    const int sizey = 4096;
    const int sizex = 11008;
    const int sizez = 128;
 #else
    /* Working - let's increase size */
    const int sizey = 1;
    const int sizex = (8*32);
    const int sizez = 1;
    /*const int sizey = 1;
    const int sizex = 3*(8*32);
    const int sizez = 1;*/
 #endif
    //printf("Memsize required = %i\n", sizex*sizex);
    ggml_type wtype = GGML_TYPE_F32;
    size_t ctx_size = 0;
    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
    ctx_size += sizex*sizeof(float);
    ctx_size += 1024*1024*100;
    // %zu is the portable conversion for size_t
    printf("Allocating Memory of size %zu byes, %zu MB\n",ctx_size, (ctx_size/1024/1024));
    struct ggml_init_params params = {
        /*.mem_size   =*/ ctx_size,
        /*.mem_buffer =*/ NULL,
        /* no_alloc   =*/ 0
    };
    ctx = ggml_init(params);
    if (!ctx) {
        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
        // non-zero so the failure is visible to the caller
        return 1;
    }
    printf("Creating new tensors\n");
    // printf("Creating new tensor m1\n");
    struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
    ggml_set_f32(m11, 1.0f);
    // printf("Creating new tensor m1\n");
    struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
    ggml_set_f32(m12, 1.5f);
    // printf("Creating new tensor m2\n");
    struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
    ggml_set_f32(m2, 2.0f);
    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
    // printf("Creating new tensor m11xm2\n");
    struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
    // printf("Creating compute graph\n");
    struct ggml_cgraph gf = ggml_build_forward(m11xm2);
    gf.n_threads=benchmark_params.n_threads;
    printf("cgraph->n_threads=%i\n",gf.n_threads);
    TENSOR_DUMP(m11);
    TENSOR_DUMP(m2);
    ggml_graph_compute(ctx, &gf);
    TENSOR_DUMP(gf.nodes[0]);
    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
    int32_t nelements = sizex*sizey;
    int32_t ne[2] = { sizex, sizey };
    std::vector<int64_t> hist_cur(1 << 4, 0);
    // Set up the benchmark matrices
    // printf("Creating new tensor q11 & Running quantize\n");
    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
    // Set up the compute graph
    // printf("Creating new tensor q31\n");
    struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
    // printf("Creating compute graph\n");
    struct ggml_cgraph gf31 = ggml_build_forward(q31);
    gf31.n_threads=benchmark_params.n_threads;
    // Set up a second graph computation to make sure we override the CPU cache lines
    // printf("Creating new tensor q12 & Running quantize\n");
    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
    // printf("Creating new tensor q32\n");
    struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
    //printf("Creating compute graph\n");
    struct ggml_cgraph gf32 = ggml_build_forward(q32);
    gf32.n_threads=benchmark_params.n_threads;
    printf("cgraph->n_threads=%i\n",gf31.n_threads);
    const int dimx = sizex;
    const int dimy = sizey;
    const int dimz = sizez;
    long long int flops_per_dot_product = dimy + dimy;
    long long int flops_per_matrix = flops_per_dot_product * dimx * dimz;
    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
    // Let's use the F32 result from above as a reference for the q4_0 multiplication
    float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
    printf("==============================================================================================\n");
    for (int i=0;i<benchmark_params.n_iterations ;i++) {
        long long int start = ggml_time_us();
        //printf("Running ggml_graph_compute\n");
        ggml_graph_compute(ctx, &gf31);
        long long int stop = ggml_time_us();
        long long int usec = stop-start;
        float flops_per_usec = (1.0f*flops_per_matrix)/usec;
        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
            i,
            gf31.n_threads,
            sizex, sizey, sizez, flops_per_matrix,
            usec,flops_per_usec);
 #ifdef VERBOSE_DEBUGGING
        // TENSOR_DUMP takes exactly one argument
        TENSOR_DUMP(gf31.nodes[0]);
 #endif
        // Check that the matrix multiplication result is in the right ballpark
        // We cannot use the exact value from the F32 multiplication because the quantization will be slightly different
        float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
        // fabsf keeps the comparison in floating point; plain abs() may bind to the integer overload
        float delta = fabsf(sum_of_Q4_result - sum_of_F32_reference);
        float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; //  Let's accept an epsilon of 10^-6
        if (delta > allowed_delta)  {
            printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
                sum_of_F32_reference,
                sum_of_Q4_result,
                delta,
                allowed_delta
            );
            // a wrong result is a failure, so report it through the exit status
            exit(1);
        }
        // Running a different graph computation to make sure we override the CPU cache lines
        ggml_graph_compute(ctx, &gf32);
    }
 }
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -7,12 +7,6 @@
 #include <iterator>
 #include <algorithm>
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
 #include <alloca.h>
 #endif
 #if defined (_WIN32)
 #include <fcntl.h>
 #include <io.h>
--- a/examples/gpt4all.sh
+++ b/examples/gpt4all.sh
@ -10,6 +10,6 @@ cd ..
 ./main --color --instruct --threads 4 \
       --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
       --file ./prompts/alpaca.txt \
-       --batch_size 8 --ctx_size 2048 \
+       --batch_size 8 --ctx_size 2048 -n -1 \
       --repeat_last_n 64 --repeat_penalty 1.3 \
       --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -27,20 +27,27 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
    int count = 0;
    int seq_count = tokens.size() / params.n_ctx;
    int n_vocab = llama_n_vocab(ctx);
    double nll = 0.0;
-
+    fprintf(stderr, "%s : calculating perplexity over %d chunks, batch_size=%d\n", __func__, seq_count, params.n_batch);
    fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
    for (int i = 0; i < seq_count; ++i) {
        int start = i * params.n_ctx;
-        int end = start + params.n_ctx - 1; // TODO: this is not optimal, e.g. it makes the batch 511 instead of 512
+        int end = start + params.n_ctx;
-                                            //       it is better to always be power of 2 for better performance
+
-        std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
+        std::vector<float> logits;
        int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;
        auto start_t = std::chrono::high_resolution_clock::now();
-        if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
+        for (int j = 0; j < num_batches; ++j) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
+            int batch_start = start + j * params.n_batch;
-            return;
+            int batch_size = std::min(end - batch_start, params.n_batch);
            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads)) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return;
            }
            auto batch_logits = llama_get_logits(ctx);
            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
        }
        auto end_t = std::chrono::high_resolution_clock::now();
        if (i == 0) {
@ -59,15 +66,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
        // Example, we have a context window of 512, we will compute perplexity for each of the
        // last 256 tokens.  Then, we split the input up into context window size chunks to
        // process the entire prompt.
-
+        for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
        auto logits = llama_get_logits(ctx);
        for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
            // Calculate probability of next token, given the previous ones.
            int n_vocab = llama_n_vocab(ctx);
            std::vector<float> tok_logits(
-                logits + j * n_vocab,
+                logits.begin() + j * n_vocab,
-                logits + (j + 1) * n_vocab);
+                logits.begin() + (j + 1) * n_vocab);
-            const float prob = softmax(tok_logits)[tokens[start + j + 1]];
+            float prob = softmax(tok_logits)[tokens[start + j + 1]];
            nll += -std::log(prob);
            ++count;
        }
@ -82,11 +86,13 @@ int main(int argc, char ** argv) {
    gpt_params params;
    params.model = "models/llama-7B/ggml-model.bin";
    params.n_batch = 512;
    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }
    params.perplexity = true;
    params.n_batch = std::min(params.n_batch, params.n_ctx);
    if (params.n_ctx > 2048) {
        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@ -1,6 +1,7 @@
 #include "ggml.h"
 #define LLAMA_API_INTERNAL
 #include "llama.h"
 #include "llama_internal.h"
 #include <algorithm>
 #include <cassert>
@ -15,9 +16,6 @@
 #include <unordered_map>
 #include <vector>
 static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32"  };
 static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
 struct quantize_stats_params {
    std::string model = "models/7B/ggml-model-f16.bin";
    bool verbose = false;
@ -223,7 +221,7 @@ int main(int argc, char ** argv) {
                break;
            }
            int j;
-            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
+            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) j)) != 0; j++) {
                // find match: compare the argument against the name of every type index j
                // (argv index i only selects the argument and must not be used as a type)
            }
            if (j < GGML_TYPE_COUNT) {
@ -278,7 +276,7 @@ int main(int argc, char ** argv) {
            continue;
        }
        if (params.verbose) {
-            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
+            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
        }
        if (kv_tensor.second->type == GGML_TYPE_F16) {
            is_f16 = true;
@ -303,13 +301,14 @@ int main(int argc, char ** argv) {
    // loop through quantization types
    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
        const ggml_type type = (ggml_type) i;
        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
            continue;
        }
        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
            if (params.verbose) {
-                printf("testing %s ...\n",  type_strs[i]);
+                printf("testing %s ...\n",  ggml_type_name(type));
            }
            error_stats global_stats {};
@ -321,7 +320,7 @@ int main(int argc, char ** argv) {
                if (params.verbose) {
                    printf("  %s ...\n",  kv_tensor.first.c_str());
                }
-                std::string layer_name { type_strs[i] };
+                std::string layer_name { ggml_type_name(type) };
                layer_name += "::" + kv_tensor.first;
                test_roundtrip_on_layer(
                        layer_name,
@ -336,7 +335,7 @@ int main(int argc, char ** argv) {
                );
            }
-            print_error_stats(type_strs[i], global_stats, params.print_histogram);
+            print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
        }
    }
--- a/flake.nix
+++ b/flake.nix
@ -10,7 +10,6 @@
          inherit system;
        };
        llama-python = pkgs.python310.withPackages (ps: with ps; [
          torch
          numpy
          sentencepiece
        ]);
@ -28,10 +27,8 @@
          ];
          installPhase = ''
            mkdir -p $out/bin
-            mv bin/main $out/bin/llama
+            mv bin/* $out/bin/
-            mv bin/quantize $out/bin/quantize
+            mv $out/bin/main $out/bin/llama
            mv bin/embedding $out/bin/embedding
            mv bin/perplexity $out/bin/perplexity
            echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
            cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
--- a/ggml.c
+++ b/ggml.c
@ -118,6 +118,23 @@ typedef void* thread_ret_t;
    #define GGML_MEM_ALIGN 16
 #endif
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #define GGML_ALIGNED_MALLOC(size)  _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr)     _aligned_free(ptr)
 #else
 // Allocate `size` bytes aligned to GGML_MEM_ALIGN via posix_memalign.
 // Returns the allocation, or NULL when posix_memalign reports an error.
 inline static void* ggml_aligned_malloc(size_t size) {
    void* block = NULL;
    if (posix_memalign(&block, GGML_MEM_ALIGN, size) == 0) {
        return block;
    }
    // allocation failed
    return NULL;
 }
 #define GGML_ALIGNED_MALLOC(size)  ggml_aligned_malloc(size)
 #define GGML_ALIGNED_FREE(ptr)     free(ptr)
 #endif
 #define UNUSED(x) (void)(x)
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
@ -569,6 +586,77 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
 }
 #endif
 #if __ARM_NEON
 #if !defined(__aarch64__)
 // Sum of all 16 unsigned 8-bit lanes of v.
 // Only compiled for ARM NEON targets where __aarch64__ is not defined.
 inline static uint16_t vaddvq_u8(uint8x16_t v) {
    uint8_t lanes[16];
    vst1q_u8(lanes, v);

    // worst case is 16*255 = 4080, so the total always fits in 16 bits
    uint16_t total = 0;
    for (int k = 0; k < 16; ++k) {
        total += lanes[k];
    }
    return total;
 }
 // Sum of all 8 signed 16-bit lanes of v, accumulated in 32 bits.
 inline static int32_t vaddvq_s16(int16x8_t v) {
    int16_t lanes[8];
    vst1q_s16(lanes, v);

    int32_t total = 0;
    for (int k = 0; k < 8; ++k) {
        total += (int32_t) lanes[k];
    }
    return total;
 }
 // Sum of all 8 unsigned 16-bit lanes of v, accumulated in 32 bits.
 inline static uint32_t vaddvq_u16(uint16x8_t v) {
    uint16_t lanes[8];
    vst1q_u16(lanes, v);

    uint32_t total = 0;
    for (int k = 0; k < 8; ++k) {
        total += (uint32_t) lanes[k];
    }
    return total;
 }
 // Sum of the 4 signed 32-bit lanes of v.
 inline static int32_t vaddvq_s32(int32x4_t v) {
    int32_t total = vgetq_lane_s32(v, 0);
    total += vgetq_lane_s32(v, 1);
    total += vgetq_lane_s32(v, 2);
    total += vgetq_lane_s32(v, 3);
    return total;
 }
 // Sum of the 4 float lanes of v.
 inline static float vaddvq_f32(float32x4_t v) {
    // lanes are added strictly left to right (0, 1, 2, 3): float addition
    // is not associative, so the order is part of the result
    float acc = vgetq_lane_f32(v, 0);
    acc += vgetq_lane_f32(v, 1);
    acc += vgetq_lane_f32(v, 2);
    acc += vgetq_lane_f32(v, 3);
    return acc;
 }
 // Smallest of the 4 float lanes of v, reduced pairwise with the MIN macro.
 float vminvq_f32(float32x4_t v) {
    const float lo01 = MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1));
    const float lo23 = MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3));
    return MIN(lo01, lo23);
 }
 // Largest of the 4 float lanes of v, reduced pairwise with the MAX macro.
 float vmaxvq_f32(float32x4_t v) {
    const float hi01 = MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1));
    const float hi23 = MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3));
    return MAX(hi01, hi23);
 }
 // Stand-in for the AArch64 vzip1_s8 intrinsic on targets where __aarch64__
 // is not defined: interleaves the low halves of a and b, giving
 // { a0, b0, a1, b1, a2, b2, a3, b3 }.
 int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
    // vzip_s8() produces both interleaved halves; val[0] is the low one.
    // (vget_low_s8(vcombine_s8(a, b)) would simply hand back `a`.)
    return vzip_s8(a, b).val[0];
 }
 // Stand-in for the AArch64 vzip2_s8 intrinsic on targets where __aarch64__
 // is not defined: interleaves the high halves of a and b, giving
 // { a4, b4, a5, b5, a6, b6, a7, b7 }.
 int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
    // vzip_s8() produces both interleaved halves; val[1] is the high one.
    // (vget_high_s8(vcombine_s8(a, b)) would simply hand back `b`.)
    return vzip_s8(a, b).val[1];
 }
 // Stand-in for the AArch64 vzip1_u8 intrinsic on targets where __aarch64__
 // is not defined: interleaves the low halves of a and b, giving
 // { a0, b0, a1, b1, a2, b2, a3, b3 }.
 uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
    // vzip_u8() produces both interleaved halves; val[0] is the low one.
    // (vget_low_u8(vcombine_u8(a, b)) would simply hand back `a`.)
    return vzip_u8(a, b).val[0];
 }
 // Stand-in for the AArch64 vzip2_u8 intrinsic on targets where __aarch64__
 // is not defined: interleaves the high halves of a and b, giving
 // { a4, b4, a5, b5, a6, b6, a7, b7 }.
 uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
    // vzip_u8() produces both interleaved halves; val[1] is the high one.
    // (vget_high_u8(vcombine_u8(a, b)) would simply hand back `b`.)
    return vzip_u8(a, b).val[1];
 }
 #endif
 #endif
 // method 5
 // blocks of QK elements
 // represented with a single float (delta) and QK/2 8-bit ints (i.e. QK 4-bit signed integer factors)
@ -1296,15 +1384,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
 #define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
 #define GGML_F32x4_ADD          vaddq_f32
 #define GGML_F32x4_MUL          vmulq_f32
-#if defined(__ARM_FEATURE_QRDMX)
+#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
    #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
 #else
    #define GGML_F32x4_REDUCE_ONE(x) \
    (vgetq_lane_f32(x, 0) +          \
     vgetq_lane_f32(x, 1) +          \
     vgetq_lane_f32(x, 2) +          \
     vgetq_lane_f32(x, 3))
 #endif
 #define GGML_F32x4_REDUCE(res, x)              \
 {                                              \
    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
@ -1927,55 +2007,43 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
        // 4-bit -> 8-bit
        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
        const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b));
        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
        const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4));
        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b));
        const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b));
        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
        const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4));
        // sub 8
        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
        const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b);
        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
        const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b);
        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
        const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b);
        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
        const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);
 #if defined(__ARM_FEATURE_DOTPROD)
-        // dot product into int16x8_t
+        // dot product into int32x4_t
        int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls);
        int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls);
        p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs);
        p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs);
-        // scalar
+        sum0 += x0->d*y0->d*vaddvq_s32(p_0);
-#if defined(__ARM_FEATURE_QRDMX)
+        sum1 += x1->d*y1->d*vaddvq_s32(p_1);
        sum0 += x0->d * y0->d * vaddvq_s32(p_0);
        sum1 += x1->d * y1->d * vaddvq_s32(p_1);
 #else
        sum0 += x0->d * y0->d * (vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3));
        sum1 += x1->d * y1->d * (vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3));
 #endif
 #else
        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
@ -1988,14 +2056,8 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
        const int16x8_t p_0 = vaddq_s16(pl_0, ph_0);
        const int16x8_t p_1 = vaddq_s16(pl_1, ph_1);
-        // scalar
+        sum0 += x0->d*y0->d*vaddvq_s16(p_0);
-#if defined(__ARM_FEATURE_QRDMX)
+        sum1 += x1->d*y1->d*vaddvq_s16(p_1);
        sum0 += x0->d * y0->d * vaddvq_s16(p_0);
        sum1 += x1->d * y1->d * vaddvq_s16(p_1);
 #else
        sum0 += x0->d * y0->d * (vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7));
        sum1 += x1->d * y1->d * (vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7));
 #endif
 #endif
    }
@ -2238,18 +2300,20 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
        const uint8_t * restrict p0 = x[i].qs;
        const uint8_t * restrict p1 = y[i].qs;
        int sumi = 0;
        for (int j = 0; j < QK/2; j++) {
            const uint8_t v0 = p0[j];
            const uint8_t v1 = p1[j];
-            const float f0 = d0*((int8_t) (v0 & 0xf) - 8);
+            const int8_t i0 = (int8_t) (v0 & 0xf) - 8;
-            const float f1 = d0*((int8_t) (v0 >> 4)  - 8);
+            const int8_t i1 = (int8_t) (v0 >> 4)  - 8;
-            const float f2 = d1*((int8_t) (v1 & 0xf) - 8);
+            const int8_t i2 = (int8_t) (v1 & 0xf) - 8;
-            const float f3 = d1*((int8_t) (v1 >> 4)  - 8);
+            const int8_t i3 = (int8_t) (v1 >> 4)  - 8;
-            sumf += f0*f2 + f1*f3;
+            sumi += i0*i2 + i1*i3;
        }
        sumf += d0 * d1 * sumi;
    }
 #endif
@ -2341,36 +2405,71 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
    float sum10 = 0.0f;
    float sum11 = 0.0f;
-    for (int i = 0; i < nb; ++i) {
+    for (int i = 0; i < nb; i += 2) {
        const block_q4_1 * restrict x0 = &x[i + 0];
        const block_q4_1 * restrict y0 = &y[i + 0];
        const block_q4_1 * restrict x1 = &x[i + 1];
        const block_q4_1 * restrict y1 = &y[i + 1];
        const uint8x16_t m4b = vdupq_n_u8(0xf);
        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
        const uint8x16_t v1_0 = vld1q_u8(y0->qs);
        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
        const uint8x16_t v1_1 = vld1q_u8(y1->qs);
-        // and with 0xf
+        // 4-bit -> 8-bit
        const uint8x16_t v0_0l = vandq_u8(v0_0, m4b);
        const uint8x16_t v1_0l = vandq_u8(v1_0, m4b);
        const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4);
        const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4);
-        // dot product into uint16x8_t
+        const uint8x16_t v0_1l = vandq_u8(v0_1, m4b);
-        const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
+        const uint8x16_t v1_1l = vandq_u8(v1_1, m4b);
-        const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
+        const uint8x16_t v0_1h = vshrq_n_u8(v0_1, 4);
-
+        const uint8x16_t v1_1h = vshrq_n_u8(v1_1, 4);
        const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
        const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
        const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h);
        const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h);
        sum00 += x0->m*y0->m;
        sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h));
        sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h));
-        sum11 += x0->d*y0->d*vaddvq_u16(vaddq_u16(pl0, ph0));
+
        sum00 += x1->m*y1->m;
        sum01 += y1->m*x1->d*(vaddvq_u8(v0_1l) + vaddvq_u8(v0_1h));
        sum10 += x1->m*y1->d*(vaddvq_u8(v1_1l) + vaddvq_u8(v1_1h));
 #if defined(__ARM_FEATURE_DOTPROD)
        // dot product into uint32x4_t
        uint32x4_t p_0 = vdotq_u32(vdupq_n_u32(0), v0_0l, v1_0l);
        uint32x4_t p_1 = vdotq_u32(vdupq_n_u32(0), v0_1l, v1_1l);
        p_0 = vdotq_u32(p_0, v0_0h, v1_0h);
        p_1 = vdotq_u32(p_1, v0_1h, v1_1h);
        sum11 += x0->d*y0->d*vaddvq_u32(p_0);
        sum11 += x1->d*y1->d*vaddvq_u32(p_1);
 #else
        const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
        const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
        const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
        const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
        const uint16x8_t pl1l = vmull_u8(vget_low_u8 (v0_1l), vget_low_u8 (v1_1l));
        const uint16x8_t pl1h = vmull_u8(vget_high_u8(v0_1l), vget_high_u8(v1_1l));
        const uint16x8_t ph1l = vmull_u8(vget_low_u8 (v0_1h), vget_low_u8 (v1_1h));
        const uint16x8_t ph1h = vmull_u8(vget_high_u8(v0_1h), vget_high_u8(v1_1h));
        const uint16x8_t pl_0 = vaddq_u16(pl0l, pl0h);
        const uint16x8_t ph_0 = vaddq_u16(ph0l, ph0h);
        const uint16x8_t pl_1 = vaddq_u16(pl1l, pl1h);
        const uint16x8_t ph_1 = vaddq_u16(ph1l, ph1h);
        const uint16x8_t p_0 = vaddq_u16(pl_0, ph_0);
        const uint16x8_t p_1 = vaddq_u16(pl_1, ph_1);
        sum11 += x0->d*y0->d*vaddvq_u16(p_0);
        sum11 += x1->d*y1->d*vaddvq_u16(p_1);
 #endif
    }
    sumf = QK*sum00 + sum01 + sum10 + sum11;
@ -2667,6 +2766,18 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
 };
 static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_SIZE is outdated");
 // Printable name of every ggml_type, indexed by the enum value
 // (this is the table ggml_type_name() reads from).
 static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
    [GGML_TYPE_F32]  = "f32",
    [GGML_TYPE_F16]  = "f16",
    [GGML_TYPE_Q4_0] = "q4_0",
    [GGML_TYPE_Q4_1] = "q4_1",
    [GGML_TYPE_I8]   = "i8",
    [GGML_TYPE_I16]  = "i16",
    [GGML_TYPE_I32]  = "i32",
 };
 // breaks the build when the number of types changes, as a reminder to extend the table above
 static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_NAME is outdated");
 static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
    "NONE",
@ -2708,9 +2819,12 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
    "FLASH_ATTN",
    "FLASH_FF",
    "MAP_UNARY",
    "MAP_BINARY",
 };
-static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36");
+static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",
@ -2753,9 +2867,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "flash_attn(x)",
    "flash_ff(x)",
    "f(x)",
    "f(x,y)",
 };
-static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36");
+static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@ -2885,6 +3002,11 @@ float ggml_type_sizef(enum ggml_type type) {
    return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
 }
 // Returns the GGML_TYPE_NAME entry for `type`.
 // The enum value is used directly as the table index; no range check is done.
 const char * ggml_type_name(enum ggml_type type) {
    const char * const name = GGML_TYPE_NAME[type];
    return name;
 }
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
    return GGML_TYPE_SIZE[tensor->type];
 }
@ -3050,9 +3172,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
        return NULL;
    }
    const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1);
    *ctx = (struct ggml_context) {
-        /*.mem_size           =*/ params.mem_size,
+        /*.mem_size           =*/ mem_size,
-        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
+        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
        /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
        /*.no_alloc           =*/ params.no_alloc,
        /*.n_objects          =*/ 0,
@ -3062,7 +3186,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
        /*.scratch_save       =*/ { 0, 0, NULL, },
    };
-    GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure
+    GGML_ASSERT(ctx->mem_buffer != NULL);
    ggml_assert_aligned(ctx->mem_buffer);
@ -3087,7 +3211,7 @@ void ggml_free(struct ggml_context * ctx) {
                    __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
            if (ctx->mem_buffer_owned) {
-                free(ctx->mem_buffer);
+                GGML_ALIGNED_FREE(ctx->mem_buffer);
            }
            found = true;
@ -4901,6 +5025,90 @@ struct ggml_tensor * ggml_flash_ff(
    return result;
 }
 // ggml_map_unary
 // Builds a GGML_OP_MAP_UNARY tensor that applies `fun` to `a`.
 //
 //   ctx     - context the new tensors are created in
 //   a       - source tensor (becomes result->src0)
 //   fun     - callback; its address is stashed in a small I32 tensor
 //             attached as result->opt[0]
 //   inplace - true: result is ggml_view_tensor(a); false: ggml_dup_tensor(a)
 //
 // A gradient tensor is attached only for the non-inplace case when `a`
 // itself has one.
 struct ggml_tensor * ggml_map_unary_impl_f32(
        struct ggml_context        * ctx,
        struct ggml_tensor         * a,
        const  ggml_unary_op_f32_t fun,
        bool   inplace) {
    const bool is_node = !inplace && a->grad;

    // an I32 tensor with just enough elements to hold one pointer
    struct ggml_tensor * fun_holder = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
    *((void (**)(void))fun_holder->data) = (void (*)(void))fun;

    struct ggml_tensor * result;
    if (inplace) {
        result = ggml_view_tensor(ctx, a);
    } else {
        result = ggml_dup_tensor(ctx, a);
    }

    result->op = GGML_OP_MAP_UNARY;
    if (is_node) {
        result->grad = ggml_dup_tensor(ctx, result);
    } else {
        result->grad = NULL;
    }
    result->src0   = a;
    result->opt[0] = fun_holder;

    return result;
 }
 // Non-inplace unary map: forwards to ggml_map_unary_impl_f32 with inplace = false.
 struct ggml_tensor * ggml_map_unary_f32(
        struct ggml_context        * ctx,
        struct ggml_tensor         * a,
        const  ggml_unary_op_f32_t fun) {
    const bool inplace = false;
    return ggml_map_unary_impl_f32(ctx, a, fun, inplace);
 }
 // Inplace unary map: forwards to ggml_map_unary_impl_f32 with inplace = true.
 struct ggml_tensor * ggml_map_unary_inplace_f32(
        struct ggml_context        * ctx,
        struct ggml_tensor         * a,
        const  ggml_unary_op_f32_t fun) {
    const bool inplace = true;
    return ggml_map_unary_impl_f32(ctx, a, fun, inplace);
 }
 // ggml_map_binary
 // Builds a GGML_OP_MAP_BINARY tensor that applies `fun` to `a` and `b`.
 //
 //   ctx     - context the new tensors are created in
 //   a, b    - source tensors (src0 / src1); must have the same shape
 //   fun     - callback; its address is stashed in a small I32 tensor
 //             attached as result->opt[0]
 //   inplace - true: result is ggml_view_tensor(a); false: ggml_dup_tensor(a)
 //
 // A gradient tensor is attached only for the non-inplace case when either
 // input has one.
 struct ggml_tensor * ggml_map_binary_impl_f32(
        struct ggml_context         * ctx,
        struct ggml_tensor          * a,
        struct ggml_tensor          * b,
        const  ggml_binary_op_f32_t fun,
        bool   inplace) {
    GGML_ASSERT(ggml_are_same_shape(a, b));

    const bool is_node = !inplace && (a->grad || b->grad);

    // an I32 tensor with just enough elements to hold one pointer
    struct ggml_tensor * fun_holder = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
    *((void (**)(void))fun_holder->data) = (void (*)(void))fun;

    struct ggml_tensor * result;
    if (inplace) {
        result = ggml_view_tensor(ctx, a);
    } else {
        result = ggml_dup_tensor(ctx, a);
    }

    result->op = GGML_OP_MAP_BINARY;
    if (is_node) {
        result->grad = ggml_dup_tensor(ctx, result);
    } else {
        result->grad = NULL;
    }
    result->src0   = a;
    result->src1   = b;
    result->opt[0] = fun_holder;

    return result;
 }
 // Non-inplace binary map: forwards to ggml_map_binary_impl_f32 with inplace = false.
 struct ggml_tensor * ggml_map_binary_f32(
        struct ggml_context         * ctx,
        struct ggml_tensor          * a,
        struct ggml_tensor          * b,
        const  ggml_binary_op_f32_t fun) {
    const bool inplace = false;
    return ggml_map_binary_impl_f32(ctx, a, b, fun, inplace);
 }
 // Inplace binary map: forwards to ggml_map_binary_impl_f32 with inplace = true.
 struct ggml_tensor * ggml_map_binary_inplace_f32(
        struct ggml_context         * ctx,
        struct ggml_tensor          * a,
        struct ggml_tensor          * b,
        const  ggml_binary_op_f32_t fun) {
    const bool inplace = true;
    return ggml_map_binary_impl_f32(ctx, a, b, fun, inplace);
 }
 ////////////////////////////////////////////////////////////////////////////////
 void ggml_set_param(
@ -6550,7 +6758,7 @@ static void ggml_compute_forward_mul_mat_f32(
                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                        ne11, ne01, ne10,
                        1.0f,    y, ne10,
-                                 x, ne10,
+                                 x, ne00,
                        0.0f,    d, ne01);
            }
        }
@ -6722,7 +6930,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                        ne11, ne01, ne10,
                        1.0f,    y, ne10,
-                                 x, ne10,
+                                 x, ne00,
                        0.0f,    d, ne01);
            }
        }
@ -6935,7 +7143,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                        ne11, ne01, ne10,
                        1.0f,    y, ne10,
-                                 x, ne10,
+                                 x, ne00,
                        0.0f,    d, ne01);
            }
        }
@ -7532,6 +7740,8 @@ static void ggml_compute_forward_rope_f32(
    // row index used to determine which thread to use
    int ir = 0;
    const float theta_scale = powf(10000.0, -2.0f/n_dims);
    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
            const int p = (mode == 0 ? n_past + i2 : i2);
@ -7539,11 +7749,13 @@ static void ggml_compute_forward_rope_f32(
                if (ir++ < ir0) continue;
                if (ir   > ir1) break;
-                for (int i0 = 0; i0 < n_dims; i0 += 2) {
+                float theta = (float)p;
                    const float theta = powf(10000.0, ((float)-i0)/n_dims);
-                    const float cos_theta = cosf(p*theta);
+                for (int i0 = 0; i0 < n_dims; i0 += 2) {
-                    const float sin_theta = sinf(p*theta);
+                    const float cos_theta = cosf(theta);
                    const float sin_theta = sinf(theta);
                    theta *= theta_scale;
                    const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
                          float * dst_data  = (float *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@ -7605,6 +7817,8 @@ static void ggml_compute_forward_rope_f16(
    // row index used to determine which thread to use
    int ir = 0;
    const float theta_scale = powf(10000.0, -2.0f/n_dims);
    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
            const int p = (mode == 0 ? n_past + i2 : i2);
@ -7612,11 +7826,13 @@ static void ggml_compute_forward_rope_f16(
                if (ir++ < ir0) continue;
                if (ir   > ir1) break;
-                for (int i0 = 0; i0 < n_dims; i0 += 2) {
+                float theta = (float)p;
                    const float theta = powf(10000.0, ((float)-i0)/n_dims);
-                    const float cos_theta = cosf(p*theta);
+                for (int i0 = 0; i0 < n_dims; i0 += 2) {
-                    const float sin_theta = sinf(p*theta);
+                    const float cos_theta = cosf(theta);
                    const float sin_theta = sinf(theta);
                    theta *= theta_scale;
                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
                          ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@ -8890,6 +9106,111 @@ static void ggml_compute_forward_flash_ff(
    }
 }
 // ggml_compute_forward_map_unary
 // Applies the user callback `fun` to every row of the f32 tensor src0,
 // writing the results into the matching rows of dst.
 //
 //   params - compute parameters; INIT and FINALIZE phases are no-ops
 //   src0   - input tensor, rows must be contiguous floats (nb[0] == sizeof(float))
 //   dst    - output tensor with the same shape as src0
 //   fun    - callback invoked as fun(row_length, dst_row, src0_row)
 static void ggml_compute_forward_map_unary_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        struct ggml_tensor * dst,
        const ggml_unary_op_f32_t fun) {
    // the whole tensor is processed by a single task, exactly like the
    // binary variant; any other thread index would redo all rows
    assert(params->ith == 0);
    GGML_ASSERT(ggml_are_same_shape(src0, dst));
    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }
    const int n  = ggml_nrows(src0);
    const int nc = src0->ne[0];
    assert( dst->nb[0] == sizeof(float));
    assert(src0->nb[0] == sizeof(float));
    // rows are located through the byte stride nb[1]
    for (int i = 0; i < n; i++) {
        fun(nc,
                (float *) ((char *) dst->data  + i*( dst->nb[1])),
                (float *) ((char *) src0->data + i*(src0->nb[1])));
    }
 }
 // Type dispatch for the unary map op: only f32 sources are implemented,
 // every other listed type trips GGML_ASSERT(false).
 static void ggml_compute_forward_map_unary(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        struct ggml_tensor * dst,
        const ggml_unary_op_f32_t fun) {
    switch (src0->type) {
        // unsupported types are enumerated one by one (there is no default label)
        case GGML_TYPE_F16:
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_I8:
        case GGML_TYPE_I16:
        case GGML_TYPE_I32:
        case GGML_TYPE_COUNT:
            GGML_ASSERT(false);
            break;
        case GGML_TYPE_F32:
            ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
            break;
    }
 }
 // ggml_compute_forward_map_binary
 // Applies the user callback `fun` row by row to the f32 tensors src0 and
 // src1, writing the results into the matching rows of dst.
 //
 //   params - compute parameters; must be thread 0, INIT/FINALIZE are no-ops
 //   src0   - first input, rows of contiguous floats
 //   src1   - second input, same shape as src0
 //   dst    - output, same shape as src0
 //   fun    - callback invoked as fun(row_length, dst_row, src0_row, src1_row)
 static void ggml_compute_forward_map_binary_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst,
        const ggml_binary_op_f32_t fun) {
    assert(params->ith == 0);
    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));

    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }

    const int n_rows = ggml_nrows(src0);
    const int n_cols = src0->ne[0];

    assert( dst->nb[0] == sizeof(float));
    assert(src0->nb[0] == sizeof(float));
    assert(src1->nb[0] == sizeof(float));

    // rows are located through the byte stride nb[1] of each tensor
    for (int row = 0; row < n_rows; row++) {
        float       * out = (float *) ((char *) dst->data  + row*( dst->nb[1]));
        const float * lhs = (float *) ((char *) src0->data + row*(src0->nb[1]));
        const float * rhs = (float *) ((char *) src1->data + row*(src1->nb[1]));

        fun(n_cols, out, lhs, rhs);
    }
 }
 // Type dispatch for the binary map op: only f32 sources are implemented,
 // every other listed type trips GGML_ASSERT(false).
 static void ggml_compute_forward_map_binary(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst,
        const ggml_binary_op_f32_t fun) {
    switch (src0->type) {
        // unsupported types are enumerated one by one (there is no default label)
        case GGML_TYPE_F16:
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_I8:
        case GGML_TYPE_I16:
        case GGML_TYPE_I32:
        case GGML_TYPE_COUNT:
            GGML_ASSERT(false);
            break;
        case GGML_TYPE_F32:
            ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun);
            break;
    }
 }
 /////////////////////////////////
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
@ -9039,6 +9360,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor);
            } break;
        case GGML_OP_MAP_UNARY:
            {
                const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
                ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun);
            }
            break;
        case GGML_OP_MAP_BINARY:
            {
                const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->opt[0]->data);
                ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
            }
            break;
        case GGML_OP_NONE:
            {
                // nop
@ -9298,6 +9631,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                GGML_ASSERT(false); // not supported
            } break;
        case GGML_OP_MAP_UNARY:
        case GGML_OP_MAP_BINARY:
            {
                GGML_ASSERT(false); // not supported
            } break;
        case GGML_OP_NONE:
            {
                // nop
@ -9388,7 +9726,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
    struct ggml_cgraph result = {
        /*.n_nodes      =*/ 0,
        /*.n_leafs      =*/ 0,
-        /*.n_threads    =*/ 0,
+        /*.n_threads    =*/ GGML_DEFAULT_N_THREADS,
        /*.work_size    =*/ 0,
        /*.work         =*/ NULL,
        /*.nodes        =*/ { NULL },
@ -9790,6 +10128,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                        work_size = MAX(work_size, cur);
                    } break;
                case GGML_OP_MAP_UNARY:
                case GGML_OP_MAP_BINARY:
                    {
                        node->n_tasks = 1;
                    } break;
                case GGML_OP_NONE:
                    {
                        node->n_tasks = 1;
@ -10008,8 +10351,8 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
    GGML_PRINT("=== GRAPH ===\n");
-    GGML_PRINT_DEBUG("n_threads       = %d\n",       cgraph->n_threads);
+    GGML_PRINT_DEBUG("n_threads       = %d\n",        cgraph->n_threads);
-    GGML_PRINT_DEBUG("total work size = %zu bytes\n",cgraph->work_size);
+    GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
    GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
    for (int i = 0; i < cgraph->n_nodes; i++) {
--- a/ggml.h
+++ b/ggml.h
@ -177,11 +177,12 @@ extern "C" {
 #include <stddef.h>
 #include <stdbool.h>
-#define GGML_MAX_DIMS     4
+#define GGML_MAX_DIMS          4
-#define GGML_MAX_NODES    4096
+#define GGML_MAX_NODES         4096
-#define GGML_MAX_PARAMS   16
+#define GGML_MAX_PARAMS        16
-#define GGML_MAX_CONTEXTS 64
+#define GGML_MAX_CONTEXTS      64
-#define GGML_MAX_OPT      4
+#define GGML_MAX_OPT           4
 #define GGML_DEFAULT_N_THREADS 4
 #ifdef __ARM_NEON
 // we use the built-in 16-bit float type
@ -252,6 +253,9 @@ enum ggml_op {
    GGML_OP_FLASH_ATTN,
    GGML_OP_FLASH_FF,
    GGML_OP_MAP_UNARY,
    GGML_OP_MAP_BINARY,
    GGML_OP_COUNT,
 };
@ -350,6 +354,8 @@ int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
 float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 const char * ggml_type_name(enum ggml_type type);
 size_t ggml_element_size(const struct ggml_tensor * tensor);
 struct ggml_context * ggml_init(struct ggml_init_params params);
@ -651,6 +657,21 @@ struct ggml_tensor * ggml_flash_ff(
        struct ggml_tensor  * c0,
        struct ggml_tensor  * c1);
 // Mapping operations
 // Callback types for the map operations. The f32 implementation calls them
 // once per row as fun(n, dst, src0[, src1]), where n is the row length
 // (ne[0] of the source) and the pointers address one row of floats each.
 typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
 typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
 // Creates a GGML_OP_MAP_UNARY tensor that applies `fun` to `a`
 // (non-inplace: the result is created with ggml_dup_tensor(a)).
 struct ggml_tensor * ggml_map_unary_f32(
        struct ggml_context        * ctx,
        struct ggml_tensor         * a,
        const  ggml_unary_op_f32_t fun);
 // Creates a GGML_OP_MAP_BINARY tensor that applies `fun` to `a` and `b`
 // (non-inplace). `a` and `b` must have the same shape; this is asserted.
 struct ggml_tensor * ggml_map_binary_f32(
        struct ggml_context         * ctx,
        struct ggml_tensor          * a,
        struct ggml_tensor          * b,
        const  ggml_binary_op_f32_t fun);
 //
 // automatic differentiation
 //
--- a/llama.cpp
+++ b/llama.cpp
@ -5,7 +5,6 @@
 #include "llama_util.h"
 #include "llama.h"
 #include "llama_internal.h"
 #include "ggml.h"
@ -270,16 +269,6 @@ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
    return ret;
 }
 // Maps a ggml tensor type to its short printable name
 // ("f32", "f16", "q4_0" or "q4_1"). Any other type hits LLAMA_ASSERT(false).
 // NOTE(review): the default branch has no return statement, so this relies
 // on LLAMA_ASSERT(false) never returning — confirm against its definition.
 static const char * llama_format_type(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_F32: return "f32";
        case GGML_TYPE_F16: return "f16";
        case GGML_TYPE_Q4_0: return "q4_0";
        case GGML_TYPE_Q4_1: return "q4_1";
        default: LLAMA_ASSERT(false);
    }
 }
 static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
    size_t size = ggml_type_size(type);
    for (uint32_t dim : ne) {
@ -1583,7 +1572,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        printf("[%zu/%zu] %36s - %s, type = %6s, ",
               ++idx, model_loader->tensors_map.tensors.size(),
               tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
-               llama_format_type(tensor.type));
+               ggml_type_name(tensor.type));
        // This used to be a regex, but <regex> has an extreme cost to compile times.
        bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
@ -1616,7 +1605,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                    f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
                }
            } else {
-                throw format("type %s unsupported for integer quantization", llama_format_type(tensor.type));
+                throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
            }
            printf("quantizing .. ");
--- a/llama.h
+++ b/llama.h
@ -179,4 +179,15 @@ extern "C" {
 }
 #endif
 // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
 #ifdef LLAMA_API_INTERNAL
 #include <vector>
 #include <string>
 struct ggml_tensor;
 std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
 #endif
 #endif // LLAMA_H
--- a/llama_internal.h
+++ b/llama_internal.h
@ -1,12 +0,0 @@
 // Internal header to be included by llama.cpp and tests/benchmarks only.
 #ifndef LLAMA_INTERNAL_H
 #define LLAMA_INTERNAL_H
 #include <vector>
 #include <string>
 struct ggml_tensor;
 std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
 #endif // LLAMA_INTERNAL_H
--- a/migrate-ggml-2023-03-30-pr613.py
+++ b/migrate-ggml-2023-03-30-pr613.py
@ -1,311 +0,0 @@
 # Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
 #
 # We caused a breaking change to the file format on 2023-03-30 in:
 #     https://github.com/ggerganov/llama.cpp/pull/613
 #
 # (1) If you still have the Meta LLaMA .pth files, then close this
 #     file now; you can just run `convert-pth-to-ggml.py` again to
 #     migrate to the new format. The tool is easier to use too. It
 #     isn't necessary anymore to manage split output files because
 #     the new format always combines things into a single file.
 #
 # (2) If you deleted the Meta LLaMA .pth files to save on disk
 #     space, then this tool is intended to help you.  Please check
 #     out the instructions below.
 #
 # USAGE
 #
 #     python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
 #
 # PREREQUISITES
 #
 #     pip install numpy
 #     cd llama.cpp
 #     make -j4
 #
 # EXAMPLE (7B MODEL)
 #
 #     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
 #     python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
 #
 #     # check that it works
 #     ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
 #
 #     # you can delete the old files
 #     rm -f models/7B/ggml-model-f16.bin
 #     mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
 #
 # EXAMPLE (13B MODEL)
 #
 #     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
 #     python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
 #
 #     # check that it works
 #     ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
 #
 #     # you can delete the old files
 #     rm -f models/13B/ggml-model-f16.bin*
 #     mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
 #
 import argparse
 import os
 import sys
 import json
 import struct
 import numpy as np
 # Number of elements per block for the two Q4 types (see GGML_BLCK_SIZE).
 QK = 32

 # Type ids used as keys of the block-size / type-size tables below.
 (GGML_TYPE_Q4_0,
  GGML_TYPE_Q4_1,
  GGML_TYPE_I8,
  GGML_TYPE_I16,
  GGML_TYPE_I32,
  GGML_TYPE_F16,
  GGML_TYPE_F32) = range(7)

 # Printable name for each ftype code found in a tensor header.
 WTYPE_NAMES = dict(enumerate(("F32", "F16", "Q4_0", "Q4_1")))

 # ftype code found in a tensor header -> GGML_TYPE_* id.
 WTYPES = dict(enumerate((GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)))

 # Elements per block: QK for the Q4 types, 1 for every other type.
 GGML_BLCK_SIZE = {
    GGML_TYPE_Q4_0: QK,
    GGML_TYPE_Q4_1: QK,
    **dict.fromkeys(
        (GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_F32), 1),
 }

 # Bytes per block (ggml_nbytes computes nelements * size // block size).
 GGML_TYPE_SIZE = {
    GGML_TYPE_Q4_0: 4 + QK // 2,
    GGML_TYPE_Q4_1: 4 * 2 + QK // 2,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 2,
    GGML_TYPE_I32: 4,
    GGML_TYPE_F16: 2,
    GGML_TYPE_F32: 4,
 }

 # Header fields in file order; read_hparams/write_hparams treat each one as
 # a struct "i" (int32) value.
 HPARAMS = [
    'magic', 'version',
    'n_vocab', 'n_embd', 'n_mult', 'n_head', 'n_layer', 'n_rot',
    'f16',
 ]
 def read_hparams(fin):
    """Read the file header from `fin` and return it as a dict.

    One native "i" integer is consumed per entry of HPARAMS, in that order;
    the result maps each HPARAMS name to its integer value.  struct.error is
    raised if fewer bytes than the full header are available.
    """
    layout = struct.Struct("i" * len(HPARAMS))
    raw = fin.read(layout.size)
    return dict(zip(HPARAMS, layout.unpack(raw)))
 def write_hparams(fout, hparams):
    """Write the file header to `fout`.

    `hparams` must contain every name listed in HPARAMS (KeyError otherwise).
    The values are packed as native "i" integers in HPARAMS order, i.e. the
    exact layout that read_hparams consumes.
    """
    values = [hparams[key] for key in HPARAMS]
    fout.write(struct.pack("i" * len(HPARAMS), *values))
 def read_tokens(fin, hparams):
    """Read the vocabulary that follows the header in `fin`.

    hparams['n_vocab'] entries are consumed.  Each entry is a native "i"
    byte length, that many raw bytes, and a native "f" score.  Returns a
    list of (bytes, float) tuples in file order.
    """
    vocab = []
    for _ in range(hparams['n_vocab']):
        (n_bytes,) = struct.unpack("i", fin.read(4))
        text = fin.read(n_bytes)
        (weight,) = struct.unpack("f", fin.read(4))
        vocab.append((text, weight))
    return vocab
 def write_tokens(fout, tokens):
    """Write the vocabulary to `fout` in the layout read_tokens expects.

    `tokens` is an iterable of (bytes, score) pairs.  For each pair the byte
    length (native "i"), the raw bytes and the score (native "f") are written
    in that order.
    """
    for entry in tokens:
        text, weight = entry
        length_field = struct.pack("i", len(text))
        fout.write(length_field)
        fout.write(text)
        score_field = struct.pack("f", weight)
        fout.write(score_field)
 def ggml_nelements(shape):
    """Return the product of all entries of `shape` (1 for an empty shape)."""
    count = 1
    for extent in shape:
        count = count * extent
    return count
 def ggml_nbytes(shape, ftype):
    """Return the number of data bytes of a tensor with `shape` and `ftype`.

    `ftype` is mapped through WTYPES to a GGML_TYPE_* id; the result is
    nelements * GGML_TYPE_SIZE // GGML_BLCK_SIZE for that type (integer
    division, applied after the multiplication).
    """
    n_elements = ggml_nelements(shape)
    ggml_type = WTYPES[ftype]
    return n_elements * GGML_TYPE_SIZE[ggml_type] // GGML_BLCK_SIZE[ggml_type]
 def _read_exact(fin, size, what):
    """Read exactly `size` bytes from `fin`, or raise EOFError naming `what`."""
    buf = fin.read(size)
    if len(buf) != size:
        raise EOFError(f"unexpected end of file while reading {what}: "
                       f"wanted {size} bytes, got {len(buf)}")
    return buf

 def _tensor_split_dim(name):
    """Return the dimension (0 or 1) along which a 2-D tensor `name` is sharded."""
    # split_dim 0 regex:
    #   - output.*
    #   - layers.*.attention.wq.weight
    #   - layers.*.attention.wk.weight
    #   - layers.*.attention.wv.weight
    #   - layers.*.feed_forward.w1.weight
    #   - layers.*.feed_forward.w3.weight
    #
    # split_dim 1 regex:
    #   - tok_embeddings.*
    #   - layers.*.attention.wo.weight
    #   - layers.*.feed_forward.w2.weight
    #
    if b"tok_embeddings" in name:
        return 1
    if b"layers" in name:
        if b"attention.wo.weight" in name or b"feed_forward.w2.weight" in name:
            return 1
        return 0
    if b"output" in name:
        return 0
    return 1  # any other name: same default as before

 def copy_tensors(fin, fout, part_id, n_parts):
    """Copy every tensor of one input part from `fin` into the unified `fout`.

    `fin` must be positioned at the first tensor record of part `part_id`
    (0-based) and `fout` at the place where the first tensor header belongs.
    For each record the header is written with the full (all-parts) shape,
    the stream is zero-padded to a multiple of QK bytes, and this part's data
    is written at its final position inside the full tensor:

    - 1-D tensors, or any tensor when n_parts == 1, are written only by
      part 0;
    - 2-D tensors split along dim 0 contribute a contiguous run of rows;
    - 2-D tensors split along dim 1 contribute a slice of every row.

    Finally `fout` is positioned just past the full tensor's data.

    Raises EOFError if the input ends in the middle of a record.  Previously
    a short name/data read was accepted silently and produced a corrupt
    output file.
    """
    while True:
        b = fin.read(4)
        if not b:
            break  # clean end of input: no further tensor records
        if len(b) != 4:
            raise EOFError(f"unexpected end of file while reading tensor header: "
                           f"wanted 4 bytes, got {len(b)}")
        (n_dims,) = struct.unpack("i", b)
        length, ftype = struct.unpack("ii", _read_exact(fin, 8, "tensor header"))
        assert n_dims in (1, 2)
        dims = struct.unpack("i" * n_dims, _read_exact(fin, 4 * n_dims, "tensor shape"))
        # the file lists dimensions in the opposite order to [rows, cols]
        partshape = list(reversed(dims))
        name = _read_exact(fin, length, "tensor name")
        data = _read_exact(fin, ggml_nbytes(partshape, ftype), f"data of tensor {name}")
        blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
        type_size = GGML_TYPE_SIZE[WTYPES[ftype]]
        print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")

        # determine dimension along which multipart tensor is sharded
        split_dim = _tensor_split_dim(name) if n_dims > 1 else None

        # output tensor header
        fullshape = list(partshape)
        if n_dims > 1:
            fullshape[split_dim] *= n_parts
        fout.write(struct.pack("iii", n_dims, len(name), ftype))
        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
        fout.write(name)

        # ensure tensor data is aligned
        header_end = fout.tell()
        padding = -header_end % QK
        if padding:
            fout.write(b"\x00" * padding)
        tensor_data_offset = header_end + padding

        # output unified mappable tensor data
        if n_dims == 1 or n_parts == 1:
            # copy tensor which we thankfully received in one piece
            if part_id == 0:
                fout.write(data)
        elif split_dim == 0:
            # reassemble multifile tensor containing some of the rows
            rows_per_chunk = partshape[0]
            current_row = part_id * rows_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset = current_row * bytes_per_row
            fout.seek(tensor_data_offset + offset)
            fout.write(data)
        elif split_dim == 1:
            # reassemble multifile tensor containing some of the cols
            cols_per_chunk = partshape[1]
            current_col = part_id * cols_per_chunk
            bpr = partshape[1] // blck_size * type_size
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset_current_col = current_col // blck_size * type_size
            for row in range(partshape[0]):
                offset_row = row * bytes_per_row
                offset = offset_row + offset_current_col
                fout.seek(tensor_data_offset + offset)
                fout.write(data[row * bpr:row * bpr + bpr])

        # advance file position to next tensor
        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
 def parse_args():
    """Parse the command line and return a namespace with `fin_path` and `fout_path`."""
    cli = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
    positionals = (
        ('fin_path', 'your old ggml file (leave out the .1 .2 etc.)'),
        ('fout_path', 'your new ggjt file name'),
    )
    for dest, help_text in positionals:
        cli.add_argument(dest, help=help_text)
    return cli.parse_args()
 def main():
    """Convert a (possibly multi-part) 'ggmf' file into a single 'ggjt' file.

    Reads the header and vocabulary from the first input file, counts the
    sibling part files named "<fin_path>.1", "<fin_path>.2", ... that exist,
    then writes the header (with the magic replaced by 'ggjt'), the
    vocabulary and the tensors of every part to the output file.

    Exits with status 1 if the input already has the 'ggjt' magic or does not
    have the 'ggmf' magic.
    """
    args = parse_args()
    assert args.fin_path
    assert args.fout_path
    assert args.fin_path != args.fout_path
    with open(args.fin_path, "rb") as fin:
        hparams = read_hparams(fin)
        # Validate the magic before touching the vocabulary: for a file in
        # another format the remaining header fields (including n_vocab) are
        # meaningless, and parsing tokens from it could fail or over-read
        # before the user ever sees the messages below.
        if hparams['magic'] == 0x67676a74:  # ggjt
            print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
            sys.exit(1)
        if hparams['magic'] != 0x67676d66:  # ggmf
            print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
            sys.exit(1)
        tokens = read_tokens(fin, hparams)
    hparams['magic'] = 0x67676a74  # ggjt
    # count number of multipart files by convention
    n_parts = 1
    while os.path.exists(f"{args.fin_path}.{n_parts}"):
        n_parts += 1
    # we output a single file for ggml
    with open(args.fout_path, "wb") as fout:
        write_hparams(fout, hparams)
        write_tokens(fout, tokens)
        offset_of_tensors = fout.tell()
        # the tensors we load could be split across multiple files
        for part_id in range(n_parts):
            # every part restarts at the first tensor; copy_tensors places
            # each part's data at its own offset inside the full tensors
            fout.seek(offset_of_tensors)
            print(f"Processing part {part_id+1} of {n_parts}\n")
            fin_path = args.fin_path
            if part_id > 0:
                fin_path += f".{part_id}"
            with open(fin_path, "rb") as fin:
                # skip this part's own header and vocabulary
                read_tokens(fin, read_hparams(fin))
                copy_tensors(fin, fout, part_id, n_parts)
    print(f"Done. Output file: {args.fout_path}\n")
 if __name__ == "__main__":
    main()
--- a/prompts/chat-with-bob.txt
+++ b/prompts/chat-with-bob.txt
@ -4,4 +4,4 @@ User: Hello, Bob.
 Bob: Hello. How may I help you today?
 User: Please tell me the largest city in Europe.
 Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
-User:
+User:
--- a/prompts/reason-act.txt
+++ b/prompts/reason-act.txt
@ -15,4 +15,4 @@ Answer: The calculate tool says it is 9.3333333333
 Question: What is capital of france?
 Thought: Do I need to use an action? No, I know the answer
 Answer: Paris is the capital of France
-Question:
+Question:
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,2 @@
 numpy==1.24
 sentencepiece==0.1.98