Merge branch 'ggerganov:master' into master

2023-09-07 15:09:52 -05:00 · 2023-09-07 15:09:52 -05:00 · bf7eeb731f
commit bf7eeb731f
parent 9d1a41a966 6336d834ec
28 changed files with 262 additions and 195 deletions
--- a/.clang-tidy
+++ b/.clang-tidy
@ -3,6 +3,7 @@ Checks: >
    bugprone-*,
    -bugprone-easily-swappable-parameters,
    -bugprone-implicit-widening-of-multiplication-result,
+    -bugprone-misplaced-widening-cast,
    -bugprone-narrowing-conversions,
    readability-*,
    -readability-avoid-unconditional-preprocessor-if,
@ -15,4 +16,8 @@ Checks: >
    -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
    performance-*,
    portability-*,
+    misc-*,
+    -misc-const-correctness,
+    -misc-non-private-member-variables-in-classes,
+    -misc-no-recursion,
 FormatStyle: none
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -83,6 +83,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
 option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
+option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_K_QUANTS                        "llama: use k-quants"                              ON)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
@ -174,7 +175,9 @@ if (LLAMA_METAL)
    set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)

    add_compile_definitions(GGML_USE_METAL)
-    #add_compile_definitions(GGML_METAL_NDEBUG)
+    if (LLAMA_METAL_NDEBUG)
+        add_compile_definitions(GGML_METAL_NDEBUG)
+    endif()

    # get full path to the file
    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
@ -423,7 +426,7 @@ if (LLAMA_ALL_WARNINGS)
        )
        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
            # g++ only
-            set(cxx_flags ${cxx_flags} -Wno-format-truncation)
+            set(cxx_flags ${cxx_flags} -Wno-format-truncation -Wno-array-bounds)
        endif()
    else()
        # todo : msvc
--- a/Makefile
+++ b/Makefile
@ -42,9 +42,9 @@ endif

 default: $(BUILD_TARGETS)

-test:
-	@echo "Running tests..."
-	@for test_target in $(TEST_TARGETS); do \
+test: $(TEST_TARGETS)
+	@failures=0; \
+	for test_target in $(TEST_TARGETS); do \
 		if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
 			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
@ -52,10 +52,21 @@ test:
 		elif [ "$$test_target" = "tests/test-tokenizer-1" ]; then \
 			continue; \
 		else \
+			echo "Running test $$test_target..."; \
 			./$$test_target; \
 		fi; \
-	done
-	@echo "All tests have been run."
+		if [ $$? -ne 0 ]; then \
+			printf 'Test %s FAILED!\n\n' $$test_target; \
+			failures=$$(( failures + 1 )); \
+		else \
+			printf 'Test %s passed.\n\n' $$test_target; \
+		fi; \
+	done; \
+	if [ $$failures -gt 0 ]; then \
+		printf '\n%s tests failed.\n' $$failures; \
+		exit 1; \
+	fi
+	@echo 'All tests passed.'

 all: $(BUILD_TARGETS) $(TEST_TARGETS)

@ -91,8 +102,8 @@ else
 OPT = -O3
 endif
 MK_CPPFLAGS = -I. -Icommon
-MK_CFLAGS   = $(CPPFLAGS) $(OPT) -std=c11   -fPIC
-MK_CXXFLAGS = $(CPPFLAGS) $(OPT) -std=c++11 -fPIC
+MK_CFLAGS   = $(OPT) -std=c11   -fPIC
+MK_CXXFLAGS = $(OPT) -std=c++11 -fPIC
 MK_LDFLAGS  =

 ifdef LLAMA_DEBUG
@ -123,7 +134,7 @@ MK_CXXFLAGS  += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-m

 ifeq '' '$(findstring clang++,$(CXX))'
 	# g++ only
-	MK_CXXFLAGS += -Wno-format-truncation
+	MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
 endif

 # OS specific
@ -360,6 +371,9 @@ ifdef LLAMA_METAL
 	MK_CPPFLAGS += -DGGML_USE_METAL
 	MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
 	OBJS		+= ggml-metal.o
+ifdef LLAMA_METAL_NDEBUG
+	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
+endif
 endif # LLAMA_METAL

 ifdef LLAMA_METAL
@ -378,9 +392,8 @@ k_quants.o: k_quants.c k_quants.h
 endif # LLAMA_NO_K_QUANTS

 # combine build flags with cmdline overrides
-override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
-override CFLAGS   := $(MK_CFLAGS) $(CFLAGS)
-override CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
+override CFLAGS   := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
+override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
 override LDFLAGS  := $(MK_LDFLAGS) $(LDFLAGS)

 #
--- a/common/common.cpp
+++ b/common/common.cpp
@ -57,7 +57,7 @@ int32_t get_num_physical_cores() {
            siblings.insert(line);
        }
    }
-    if (siblings.size() > 0) {
+    if (!siblings.empty()) {
        return static_cast<int32_t>(siblings.size());
    }
 #elif defined(__APPLE__) && defined(__MACH__)
@ -773,7 +773,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
        LOG("warming up the model with an empty run\n");

        const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
-        llama_eval(lctx, tmp.data(), tmp.size(), 0, params.n_threads);
+        llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
        llama_reset_timings(lctx);
    }

--- a/common/common.h
+++ b/common/common.h
@ -20,6 +20,9 @@
 #define DIRECTORY_SEPARATOR '/'
 #endif // _WIN32

+#define die(msg)          do { fputs("error: " msg "\n", stderr);                  exit(1); } while (0)
+#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", ##__VA_ARGS__); exit(1); } while (0)
+
 //
 // CLI argument parsing
 //
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@ -415,6 +415,7 @@ namespace grammar_parser {

    std::vector<const llama_grammar_element *> parse_state::c_rules() {
        std::vector<const llama_grammar_element *> ret;
+        ret.reserve(rules.size());
        for (const auto & rule : rules) {
            ret.push_back(rule.data());
        }
--- a/convert-llama-ggmlv3-to-gguf.py
+++ b/convert-llama-ggmlv3-to-gguf.py
@ -5,6 +5,7 @@ import argparse
 import math
 import struct
 import sys
+from enum import IntEnum
 from pathlib import Path

 import numpy as np
@ -34,10 +35,35 @@ GGML_QUANT_SIZES = {
    gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
 }

+class GGMLFormat(IntEnum):
+    GGML = 0
+    GGMF = 1
+    GGJT = 2
+
+class GGMLFType(IntEnum):
+    ALL_F32              = 0
+    MOSTLY_F16           = 1
+    MOSTLY_Q4_0          = 2
+    MOSTLY_Q4_1          = 3
+    MOSTLY_Q4_1_SOME_F16 = 4
+    MOSTLY_Q8_0          = 7
+    MOSTLY_Q5_0          = 8
+    MOSTLY_Q5_1          = 9
+    MOSTLY_Q2_K          = 10
+    MOSTLY_Q3_K_S        = 11
+    MOSTLY_Q3_K_M        = 12
+    MOSTLY_Q3_K_L        = 13
+    MOSTLY_Q4_K_S        = 14
+    MOSTLY_Q4_K_M        = 15
+    MOSTLY_Q5_K_S        = 16
+    MOSTLY_Q5_K_M        = 17
+    MOSTLY_Q6_K          = 18
+
 class Hyperparameters:
    def __init__(self):
-        self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
-        self.n_ff = 0
+        self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
+        self.n_layer = self.n_rot = self.n_ff = 0
+        self.ftype = GGMLFType.ALL_F32

    def set_n_ff(self, model):
        ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
@ -53,16 +79,21 @@ class Hyperparameters:
            self.n_head,
            self.n_layer,
            self.n_rot,
-            self.ftype,
+            ftype,
        ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
+        try:
+            self.ftype = GGMLFType(ftype)
+        except ValueError:
+            raise ValueError(f'Invalid ftype {ftype}')
        return 4 * 7

    def __str__(self):
-        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
+        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'

 class Vocab:
-    def __init__(self):
+    def __init__(self, load_scores = True):
        self.items = []
+        self.load_scores = load_scores

    def load(self, data, offset, n_vocab):
        orig_offset = offset
@ -70,20 +101,24 @@ class Vocab:
            itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
            assert itemlen < 4096, 'Absurd vocab item length'
            offset += 4
-            vocab = bytes(data[offset:offset + itemlen])
+            item_text = bytes(data[offset:offset + itemlen])
            offset += itemlen
-            score = struct.unpack('<f', data[offset:offset + 4])[0]
-            offset += 4
-            self.items.append((vocab, score))
+            if self.load_scores:
+                item_score = struct.unpack('<f', data[offset:offset + 4])[0]
+                offset += 4
+            else:
+                item_score = 0.0
+            self.items.append((item_text, item_score))
        return offset - orig_offset

 class Tensor:
-    def __init__(self):
+    def __init__(self, use_padding = True):
        self.name = None
        self.dims: tuple[int, ...] = ()
        self.dtype = None
        self.start_offset = 0
        self.len_bytes = np.int64(0)
+        self.use_padding = use_padding

    def load(self, data, offset):
        orig_offset = offset
@ -99,7 +134,7 @@ class Tensor:
        offset += 4 * n_dims
        self.name = bytes(data[offset:offset + name_len])
        offset += name_len
-        pad = ((offset + 31) & ~31) - offset
+        pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
        offset += pad
        n_elems = np.prod(self.dims)
        n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
@ -109,7 +144,7 @@ class Tensor:
        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
        return offset - orig_offset

-class GGMLV3Model:
+class GGMLModel:
    def __init__(self):
        self.hyperparameters = None
        self.vocab = None
@ -117,20 +152,52 @@ class GGMLV3Model:
        self.tensors = []

    def validate_header(self, data, offset):
-        if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
-            raise ValueError('Only GGJTv3 supported')
-        return 8
+        magic = bytes(data[offset:offset + 4])
+        if magic == b'GGUF':
+            raise ValueError('File is already in GGUF format.')
+        if magic == b'lmgg':
+            self.file_format = GGMLFormat.GGML
+            self.format_version = 1
+            return 4
+        version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
+        if magic == b'fmgg':
+            if version != 1:
+                raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
+            self.file_format = GGMLFormat.GGMF
+            self.format_version = version
+            return 8
+        if magic == b'tjgg':
+            if version < 1 or version > 3:
+                raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
+            self.file_format = GGMLFormat.GGJT
+            self.format_version = version
+            return 8
+        raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
+
+    def validate_conversion(self, ftype):
+        err = ''
+        if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
+            if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
+                err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
+        elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
+            if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
+                          GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
+                err = 'Q4 and Q8 quantizations changed in GGJTv3.'
+        if len(err) > 0:
+            raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')

    def load(self, data, offset):
        offset += self.validate_header(data, offset)
        hp = Hyperparameters()
        offset += hp.load(data, offset)
-        vocab = Vocab()
+        print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
+        self.validate_conversion(hp.ftype)
+        vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
        offset += vocab.load(data, offset, hp.n_vocab)
        tensors: list[Tensor] = []
        tensor_map = {}
        while offset < len(data):
-            tensor = Tensor()
+            tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
            offset += tensor.load(data, offset)
            tensor_map[tensor.name] = len(tensors)
            tensors.append(tensor)
@ -168,7 +235,10 @@ class GGMLToGGUF:

    def save(self):
        print('* Preparing to save GGUF file')
-        gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
+        gguf_writer = gguf.GGUFWriter(
+            self.cfg.output,
+            gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
+            use_temp_file = False )
        self.add_params(gguf_writer)
        self.add_vocab(gguf_writer)
        if self.special_vocab is not None:
@ -185,7 +255,10 @@ class GGMLToGGUF:
    def add_params(self, gguf_writer):
        hp = self.model.hyperparameters
        cfg = self.cfg
-        desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
+        if cfg.desc is not None:
+            desc = cfg.desc
+        else:
+            desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
        try:
            # Filenames aren't necessarily valid UTF8.
            name = cfg.name if cfg.name is not None else cfg.input.name
@ -195,6 +268,7 @@ class GGMLToGGUF:
        if name is not None:
            gguf_writer.add_name(name)
        gguf_writer.add_description(desc)
+        gguf_writer.add_file_type(int(hp.ftype))
        if self.params_override is not None:
            po = self.params_override
            assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
@ -231,7 +305,8 @@ class GGMLToGGUF:
                tokens.append(vbytes)
                scores.append(score)
                toktypes.append(ttype)
-            assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
+            assert len(tokens) == hp.n_vocab, \
+                f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
            gguf_writer.add_token_list(tokens)
            gguf_writer.add_token_scores(scores)
            if len(toktypes) > 0:
@ -283,7 +358,11 @@ class GGMLToGGUF:
                tempdims[1] = tempdims[0]
                tempdims[0] = temp
            # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
-            gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
+            gguf_writer.add_tensor(
+                mapped_name,
+                data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
+                raw_shape = tempdims,
+                raw_dtype = tensor.dtype )

 def handle_metadata(cfg, hp):
    import convert
@ -305,32 +384,46 @@ def handle_metadata(cfg, hp):
        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
    else:
        raise ValueError('Unable to load metadata')
-    vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
+    vocab = convert.load_vocab(
+        cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
+        cfg.vocabtype )
    # FIXME: Respect cfg.vocab_dir?
    svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
    convert.check_vocab_size(params, vocab)
    return (params, vocab, svocab)

 def handle_args():
-    parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
-    parser.add_argument('--input', '-i', type = Path, required = True, help = 'Input GGMLv3 filename')
-    parser.add_argument('--output', '-o', type = Path, required = True, help ='Output GGUF filename')
-    parser.add_argument('--name', help = 'Set model name')
-    parser.add_argument('--desc', help = 'Set model description')
-    parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
-    parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
-    parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
-    parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
-    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
-    parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
+    parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
+    parser.add_argument('--input', '-i', type = Path, required = True,
+        help = 'Input GGML filename')
+    parser.add_argument('--output', '-o', type = Path, required = True,
+        help ='Output GGUF filename')
+    parser.add_argument('--name',
+        help = 'Set model name')
+    parser.add_argument('--desc',
+        help = 'Set model description')
+    parser.add_argument('--gqa', type = int, default = 1,
+        help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
+    parser.add_argument('--eps', default = '5.0e-06',
+        help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
+    parser.add_argument('--context-length', '-c', type=int, default = 2048,
+        help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
+    parser.add_argument('--model-metadata-dir', '-m', type = Path,
+        help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
+    parser.add_argument("--vocab-dir", type=Path,
+        help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
+    parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
+        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
    return parser.parse_args()

 def main():
    cfg = handle_args()
    print(f'* Using config: {cfg}')
    print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
+    if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
+        print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
    data = np.memmap(cfg.input, mode = 'r')
-    model = GGMLV3Model()
+    model = GGMLModel()
    print('* Scanning GGML input file')
    offset = model.load(data, 0)
    print(f'* GGML model hyperparameters: {model.hyperparameters}')
@ -345,7 +438,12 @@ def main():
        print(f'* Special vocab: {special_vocab}')
    else:
        print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
-    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override, special_vocab = special_vocab)
+        if model.file_format == GGMLFormat.GGML:
+            print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
+    converter = GGMLToGGUF(model, data, cfg,
+        params_override = params_override,
+        vocab_override = vocab_override,
+        special_vocab = special_vocab )
    converter.save()
    print(f'* Successful completion. Output saved to: {cfg.output}')

--- a/convert.py
+++ b/convert.py
@ -266,7 +266,7 @@ class Params:
        f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None

        # hack to determine LLaMA v1 vs v2 vs CodeLlama
-        if f_rope_freq_base and f_rope_freq_base == 1000000:
+        if f_rope_freq_base == 1000000:
            # CodeLlama
            n_ctx = 16384
        elif config["norm_eps"] == 1e-05:
@ -841,9 +841,9 @@ class OutputFile:
        name = "LLaMA"

        # TODO: better logic to determine model name
-        if (params.n_ctx == 4096):
+        if params.n_ctx == 4096:
            name = "LLaMA v2"
-        elif params.path_model:
+        elif params.path_model is not None:
            name = str(params.path_model.parent).split('/')[-1]

        self.gguf.add_name                (name)
@ -856,13 +856,13 @@ class OutputFile:
        self.gguf.add_head_count_kv       (params.n_head_kv)
        self.gguf.add_layer_norm_rms_eps  (params.f_norm_eps)

-        if params.f_rope_freq_base:
+        if params.f_rope_freq_base is not None:
            self.gguf.add_rope_freq_base(params.f_rope_freq_base)

-        if params.f_rope_scale:
+        if params.f_rope_scale is not None:
            self.gguf.add_rope_scale_linear(params.f_rope_scale)

-        if params.ftype:
+        if params.ftype is not None:
            self.gguf.add_file_type(params.ftype)

    def add_meta_vocab(self, vocab: Vocab) -> None:
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "llama.h"
+#include "common.h"

 #include <unordered_map>
 #include <vector>
@ -499,10 +500,10 @@ struct llama_file {
        errno = 0;
        std::size_t ret = std::fread(ptr, size, 1, fp);
        if (ferror(fp)) {
-            throw std::runtime_error(format("read error: %s", strerror(errno)));
+            die_fmt("fread failed: %s", strerror(errno));
        }
        if (ret != 1) {
-            throw std::runtime_error(std::string("unexpectedly reached end of file"));
+            die("unexpectedly reached end of file");
        }
    }

@ -597,8 +598,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
        printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
        llama_file file(filename, "rb");
        if (!file.fp) {
-            fprintf(stderr, "error: %s: %s\n", strerror(errno), filename);
-            exit(1);
+            die_fmt("%s: %s", strerror(errno), filename);
        }
        const int  n_vocab = config->vocab_size;
        /* uint32_t max_token_length =  */ file.read_u32(); // unused
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@ -23,7 +23,7 @@ extern "C" {
 struct MyModel* create_mymodel(int argc, char ** argv) {
    gpt_params params;

-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
        return nullptr;
    }

--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -11,7 +11,7 @@
 int main(int argc, char ** argv) {
    gpt_params params;

-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

--- a/examples/gptneox-wip/falcon-main.cpp
+++ b/examples/gptneox-wip/falcon-main.cpp
@ -953,7 +953,7 @@ int main(int argc, char ** argv) {

    gpt_params params;

-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

--- a/examples/gptneox-wip/gptneox-main.cpp
+++ b/examples/gptneox-wip/gptneox-main.cpp
@ -925,7 +925,7 @@ int main(int argc, char ** argv) {

    gpt_params params;

-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -986,7 +986,12 @@ int main(int argc, char ** argv) {
        test t(inst, lmodel, ctx);

        // warmup run
-        test_gen(ctx, 1, 0, t.n_threads);
+        if (t.n_prompt > 0) {
+            test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
+        }
+        if (t.n_gen > 0) {
+            test_gen(ctx, 1, 0, t.n_threads);
+        }

        for (int i = 0; i < params.reps; i++) {
            uint64_t t_start = get_time_ns();
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -48,8 +48,9 @@ static bool is_interacting = false;

 void write_logfile(
    const llama_context * ctx, const gpt_params & params, const llama_model * model,
-    const std::vector<llama_token> input_tokens, const std::string output, const std::vector<llama_token> output_tokens) {
-
+    const std::vector<llama_token> & input_tokens, const std::string & output,
+    const std::vector<llama_token> & output_tokens
+) {
    if (params.logdir.empty()) {
        return;
    }
@ -109,7 +110,7 @@ int main(int argc, char ** argv) {
    gpt_params params;
    g_params = &params;

-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

@ -303,7 +304,7 @@ int main(int argc, char ** argv) {

    // debug message about similarity of saved session, if applicable
    size_t n_matching_session_tokens = 0;
-    if (session_tokens.size() > 0) {
+    if (!session_tokens.empty()) {
        for (llama_token id : session_tokens) {
            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
                break;
@ -401,7 +402,7 @@ int main(int argc, char ** argv) {

        LOG_TEE("%s: interactive mode on.\n", __func__);

-        if (params.antiprompt.size()) {
+        if (!params.antiprompt.empty()) {
            for (const auto & antiprompt : params.antiprompt) {
                LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
            }
@ -499,7 +500,7 @@ int main(int argc, char ** argv) {

    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
-        if (embd.size() > 0) {
+        if (!embd.empty()) {
            // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
            // --prompt or --file which uses the same value.
            int max_embd_size = n_ctx - 4;
@ -624,7 +625,7 @@ int main(int argc, char ** argv) {
                LOG("n_past = %d\n", n_past);
            }

-            if (embd.size() > 0 && !path_session.empty()) {
+            if (!embd.empty() && !path_session.empty()) {
                session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
                n_session_consumed = session_tokens.size();
            }
@ -695,7 +696,7 @@ int main(int argc, char ** argv) {
        // if not currently processing queued inputs;
        if ((int) embd_inp.size() <= n_consumed) {
            // check for reverse prompt
-            if (params.antiprompt.size()) {
+            if (!params.antiprompt.empty()) {
                std::string last_output;
                for (auto id : last_tokens) {
                    last_output += llama_token_to_piece(ctx, id);
@ -732,7 +733,7 @@ int main(int argc, char ** argv) {
                LOG("found EOS token\n");

                if (params.interactive) {
-                    if (params.antiprompt.size() != 0) {
+                    if (!params.antiprompt.empty()) {
                        // tokenize and inject first reverse prompt
                        const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
                        embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -655,7 +655,7 @@ int main(int argc, char ** argv) {
    gpt_params params;

    params.n_batch = 512;
-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@ -71,7 +71,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
 }

 // Check if a layer is included/excluded by command line
-bool layer_included(const quantize_stats_params params, const std::string & layer) {
+bool layer_included(const quantize_stats_params & params, const std::string & layer) {
    for (const auto& excluded : params.exclude_layers) {
        if (std::regex_search(layer, std::regex(excluded))) {
            return false;
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -143,10 +143,9 @@ int main(int argc, char ** argv) {
        if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
            fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
            return 1;
-        } else {
-            if (ftype_str == "COPY") {
-               params.only_copy = true;
-            }
+        }
+        if (ftype_str == "COPY") {
+           params.only_copy = true;
        }
        arg_idx++;
    }
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@ -13,7 +13,7 @@ int main(int argc, char ** argv) {
    params.repeat_last_n = 64;
    params.prompt = "The quick brown fox";

-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

@ -44,7 +44,7 @@ int main(int argc, char ** argv) {
        llama_free_model(model);
        return 1;
    }
-    auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
+    auto tokens = llama_tokenize(ctx, params.prompt, true);
    auto n_prompt_tokens = tokens.size();
    if (n_prompt_tokens < 1) {
        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -139,7 +139,7 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c
 }

 // convert a vector of completion_token_output to json
-static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> probs)
+static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> & probs)
 {
    json out = json::array();
    for (const auto &prob : probs)
@ -271,7 +271,7 @@ struct llama_server_context
        return true;
    }

-    std::vector<llama_token> tokenize(json json_prompt, bool add_bos)
+    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
    {
        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
        // or the first element of the json_prompt array is a string.
@ -611,7 +611,7 @@ struct llama_server_context

    completion_token_output doCompletion()
    {
-        const completion_token_output token_with_probs = nextToken();
+        auto token_with_probs = nextToken();

        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
        generated_text += token_text;
@ -1255,7 +1255,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
 struct token_translator {
    llama_context * ctx;
    std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
-    std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); }
+    std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); }
 };

 void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@ -169,10 +169,6 @@ struct my_llama_hparams {

    float rope_freq_base  = 10000.0f;
    float rope_freq_scale = 1.0f;
-
-    bool operator!=(const my_llama_hparams& other) const {
-        return memcmp(this, &other, sizeof(my_llama_hparams));
-    }
 };

 struct my_llama_layer {
@ -929,28 +925,6 @@ void get_example_targets_batch(struct llama_context * lctx, const int * train_sa
    }
 }

-
-#ifdef __GNUC__
-#ifdef __MINGW32__
-__attribute__((format(gnu_printf, 1, 2)))
-#else
-__attribute__((format(printf, 1, 2)))
-#endif
-#endif
-static std::string format(const char * fmt, ...) {
-    va_list ap, ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    int size = vsnprintf(NULL, 0, fmt, ap);
-    GGML_ASSERT(size >= 0 && size < INT_MAX);
-    std::vector<char> buf(size + 1);
-    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-    GGML_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-    return std::string(buf.data(), size);
-}
-
 int tokenize_file(struct llama_context * lctx, const char * filename, std::vector<llama_token>& out) {
    FILE * fp = std::fopen(filename, "rb");
    if (fp == NULL) {
@ -983,10 +957,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
    out.resize(size+1);

    if (std::fread(buf.data(), size, 1, fp) != 1) {
-        throw std::runtime_error(std::string("unexpectedly reached end of file"));
+        die("unexpectedly reached end of file");
    }
    if (ferror(fp)) {
-        throw std::runtime_error(format("read error: %s", strerror(errno)));
+        die_fmt("fread failed: %s", strerror(errno));
    }

    buf[size] = '\0';
@ -1047,11 +1021,11 @@ void shuffle_ints(int * begin, int * end) {
    if (kid >= 0) { \
        enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
        if (ktype != (type)) { \
-            throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
+            die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \
        } \
        (dst) = func(ctx, kid); \
    } else if (req) { \
-        throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
+        die_fmt("key not found in model: %s", skey.c_str()); \
    } \
 }

@ -1136,7 +1110,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
        read_tensor_by_name(opt->lbfgs.lms,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S);
        read_tensor_by_name(opt->lbfgs.lmy,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y);
    } else {
-        throw std::runtime_error("unknown optimizer type\n");
+        die("unknown optimizer type");
    }
 }

@ -1315,20 +1289,20 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod

        const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST));
        if (token_idx == -1) {
-            throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+            die("cannot find tokenizer vocab in model file");
        }
        const uint32_t n_vocab = gguf_get_arr_n(vctx, token_idx);

        const int score_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_SCORES));
        if (score_idx == -1) {
-            throw std::runtime_error("cannot find tokenizer scores in model file\n");
+            die("cannot find tokenizer scores in model file");
        }

        const float * scores = (const float * ) gguf_get_arr_data(vctx, score_idx);

        const int toktype_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE));
        if (toktype_idx == -1) {
-            throw std::runtime_error("cannot find token type list in GGUF file\n");
+            die("cannot find token type list in GGUF file");
        }

        const int * toktypes = (const int * ) gguf_get_arr_data(vctx, toktype_idx);
@ -1356,7 +1330,7 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod
            // read and copy bpe merges
            const int merges_keyidx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_MERGES));
            if (merges_keyidx == -1) {
-                throw std::runtime_error("cannot find tokenizer merges in model file\n");
+                die("cannot find tokenizer merges in model file");
            }

            const int n_merges = gguf_get_arr_n(vctx, merges_keyidx);
@ -1988,7 +1962,7 @@ void opt_callback(void * vdata, float * sched) {
    float min_sched = params->adam_min_alpha / params->adam_alpha;
    *sched = min_sched + *sched * (1.0f - min_sched);

-    int impr_plot = std::isnan(opt->loss_after) ? 0 : -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
+    int impr_plot = std::isnan(opt->loss_after) ? 0 : -std::lround(1 + (opt->loss_before - opt->loss_after) * 10.0f);
    printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0);

    if (data->shuffle_countdown < n_batch) {
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@ -138,7 +138,7 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten

 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 #ifdef GGML_ALLOCATOR_DEBUG
-    GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
+    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
    GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
 #endif
    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
@ -165,14 +165,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
    if (best_fit_block == -1) {
        // the last block is our last resort
        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        max_avail = MAX(max_avail, block->size);
        if (block->size >= size) {
            best_fit_block = alloc->n_free_blocks - 1;
-            max_avail = MAX(max_avail, block->size);
        } else {
            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
                    __func__, size, max_avail);
            GGML_ASSERT(!"not enough space in the buffer");
-        return;
+            return;
        }
    }
    struct free_block * block = &alloc->free_blocks[best_fit_block];
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -307,7 +307,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {

 void * ggml_metal_host_malloc(size_t n) {
    void * data = NULL;
-    const int result = posix_memalign((void **) &data, getpagesize(), n);
+    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
    if (result != 0) {
        metal_printf("%s: error: posix_memalign failed\n", __func__);
        return NULL;
@ -381,7 +381,7 @@ bool ggml_metal_add_buffer(
            }
        }

-        const size_t size_page = getpagesize();
+        const size_t size_page = sysconf(_SC_PAGESIZE);

        size_t size_aligned = size;
        if ((size_aligned % size_page) != 0) {
@ -1114,7 +1114,7 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
                            [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];

-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
                        } break;
                    case GGML_OP_DUP:
                    case GGML_OP_CPY:
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@ -220,14 +220,10 @@ kernel void kernel_norm(
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }
-    //// broadcast
-    //if (tpitg == 0) {
-    //    sum[0] /= ne00;
-    //}
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float mean  = sum[0];
+    const float mean  = sum[0] / ne00;

    // recenter and VARIANCE
+    threadgroup_barrier(mem_flags::mem_threadgroup);
    device float * y = dst + tgpig*ne00;
    sum[tpitg] = 0.0f;
    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@ -235,12 +231,6 @@ kernel void kernel_norm(
        sum[tpitg] += y[i00] * y[i00];
    }

-    //// VARIANCE
-    //// parallel sum
-    //sum[tpitg] = 0.0f;
-    //for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-    //    sum[tpitg] += y[i00] * y[i00];
-    //}
    // reduce
    threadgroup_barrier(mem_flags::mem_threadgroup);
    for (uint i = ntg/2; i > 0; i /= 2) {
@ -249,12 +239,7 @@ kernel void kernel_norm(
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }
-    //// broadcast
-    //if (tpitg == 0) {
-    //    sum[0] /= ne00;
-    //}
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float variance = sum[0];
+    const float variance = sum[0] / ne00;

    const float scale = 1.0f/sqrt(variance + eps);
    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@ -262,7 +247,6 @@ kernel void kernel_norm(
    }
 }

-
 kernel void kernel_rms_norm(
        device const  void * src0,
        device       float * dst,
@ -630,7 +614,6 @@ kernel void kernel_mul_mat_f16_f32(
            }
        }
    }
-
 }

 kernel void kernel_alibi_f32(
@ -699,25 +682,27 @@ kernel void kernel_rope(
        constant       int & mode,
        constant     float & freq_base,
        constant     float & freq_scale,
-        uint3 tpig[[thread_position_in_grid]]) {
-    const int64_t i3 = tpig[2];
-    const int64_t i2 = tpig[1];
-    const int64_t i1 = tpig[0];
+        uint  tiitg[[thread_index_in_threadgroup]],
+        uint3 tptg[[threads_per_threadgroup]],
+        uint3 tgpig[[threadgroup_position_in_grid]]) {
+    const int64_t i3 = tgpig[2];
+    const int64_t i2 = tgpig[1];
+    const int64_t i1 = tgpig[0];

    const bool is_neox = mode & 2;
-    const float theta_scale = pow(freq_base, -2.0f/n_dims);

    const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);

-    float theta = freq_scale * (float)p;
+    const float theta_0 = freq_scale * (float)p;
+    const float inv_ndims = -1.f/n_dims;

    if (!is_neox) {
-        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
+
+            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
            const float cos_theta = cos(theta);
            const float sin_theta = sin(theta);

-            theta *= theta_scale;
-
            device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
            device       float * dst_data  = (device float *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);

@ -729,12 +714,12 @@ kernel void kernel_rope(
        }
    } else {
        for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-            for (int64_t ic = 0; ic < n_dims; ic += 2) {
+            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+
+                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
                const float cos_theta = cos(theta);
                const float sin_theta = sin(theta);

-                theta *= theta_scale;
-
                const int64_t i0 = ib*n_dims + ic/2;

                device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
--- a/ggml.c
+++ b/ggml.c
@ -195,7 +195,7 @@ typedef void * thread_ret_t;
 inline static void * ggml_aligned_malloc(size_t size) {
    void * aligned_memory = NULL;
 #ifdef GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, getpagesize(), size);
+    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
 #else
    int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
 #endif
@ -4815,7 +4815,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(

    size_t obj_alloc_size = 0;

-    if (view_src == NULL && ctx->no_alloc == false) {
+    if (view_src == NULL && !ctx->no_alloc) {
        if (ctx->scratch.data != NULL) {
            // allocate tensor data in the scratch buffer
            if (ctx->scratch.offs + data_size > ctx->scratch.size) {
@ -5516,7 +5516,7 @@ static struct ggml_tensor * ggml_mul_impl(
    }

    if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
    }

    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@ -5559,7 +5559,7 @@ static struct ggml_tensor * ggml_div_impl(
    }

    if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
    }

    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@ -20006,7 +20006,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

        struct ggml_tensor * data = NULL;

-        if (params.no_alloc == false) {
+        if (!params.no_alloc) {
            data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);

            ok = ok && data != NULL;
@ -20047,7 +20047,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
            }

            // point the data member to the appropriate location in the binary blob using the tensor infos
-            if (params.no_alloc == false) {
+            if (!params.no_alloc) {
              //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
                cur->data = (char *) data->data + ctx->infos[i].offset;               // offset from data
            }
--- a/k_quants.c
+++ b/k_quants.c
@ -1089,6 +1089,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
        if (!max_abs_scale) {
            memset(&y[i], 0, sizeof(block_q6_K));
            y[i].d = ggml_fp32_to_fp16(0.f);
+            x += QK_K;
            continue;
        }

--- a/llama.cpp
+++ b/llama.cpp
@ -606,16 +606,16 @@ struct llama_mmap {

        if (prefetch > 0) {
            // Advise the kernel to preload the mapped memory
-            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
-                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+            if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                        strerror(errno));
            }
        }
        if (numa) {
            // advise the kernel not to use readahead
            // (because the next page might not belong on the same node)
-            if (madvise(addr, file->size, MADV_RANDOM)) {
-                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+            if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
                        strerror(errno));
            }
        }
@ -3052,33 +3052,10 @@ static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
 }

-static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
-}
-
-static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
-}
-
 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }

-static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_bos_id;
-}
-
-static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_eos_id;
-}
-
-static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
-    return id == vocab.special_pad_id;
-}
-
 static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
    GGML_ASSERT(llama_is_byte_token(vocab, id));
    const auto& token_data = vocab.id_to_token.at(id);
@ -4800,9 +4777,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    std::vector<std::thread> workers;
    std::mutex mutex;

+#ifdef GGML_USE_K_QUANTS
    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
    };
+#endif

    int idx = 0;

@ -5947,7 +5926,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
        rng_ss.str(std::string(&rng_buf[0], rng_size));
        rng_ss >> ctx->rng;

-        GGML_ASSERT(rng_ss.fail() == false);
+        GGML_ASSERT(!rng_ss.fail());
    }

    // set logits
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@ -76,7 +76,7 @@ void * align_with_offset(void * ptr, int offset) {
    return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
 }

-void benchmark_function(size_t size, size_t q_size, int64_t iterations, std::function<size_t(void)> function) {
+void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
    int64_t min_time_us = INT64_MAX;
    int64_t total_time_us = 0;
    int64_t min_time_cycles = INT64_MAX;