Merge branch 'master' into concedo_experimental

# Conflicts:
#	Makefile
#	README.md
#	common/log.h
Concedo 2023-09-02 11:24:28 +08:00
commit eed651494e
26 changed files with 1143 additions and 658 deletions

@@ -173,6 +173,7 @@ if (LLAMA_ALL_WARNINGS)
 -Wpointer-arith
 -Wmissing-prototypes
 -Werror=implicit-int
+-Wno-unused-function
 )
 set(cxx_flags
 -Wall
@@ -182,6 +183,10 @@ if (LLAMA_ALL_WARNINGS)
 -Wno-unused-function
 -Wno-multichar
 )
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+# g++ only
+set(cxx_flags ${cxx_flags} -Wno-format-truncation)
+endif()
 else()
 # todo : msvc
 endif()

@@ -24,7 +24,9 @@
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
 # define NOMINMAX
+#endif
 #include <codecvt>
 #include <locale>
 #include <windows.h>
@@ -1027,7 +1029,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
 fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
 fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
-fprintf(stream, "hellaswag_tasks: %ld # default: 400\n", params.hellaswag_tasks);
+fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
 const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
 const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
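
(Editorial note: the `%ld` to `%zu` change here, like the `%lu` to `%zu` changes in the beam-search and server examples further down, switches to the standard `printf` length modifier for `size_t`, which avoids format warnings on platforms where `size_t` is not `long`.)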

@@ -235,6 +235,7 @@ namespace console {
 int estimateWidth(char32_t codepoint) {
 #if defined(_WIN32)
+(void)codepoint;
 return 1;
 #else
 return wcwidth(codepoint);

@@ -1 +1,643 @@
-#define LOG(...)
#pragma once
#include <chrono>
#include <cstring>
#include <sstream>
#include <iostream>
#include <thread>
#include <vector>
#include <algorithm>
#include <cinttypes>
// --------------------------------
//
// Basic usage:
//
// --------
//
// The LOG() and LOG_TEE() macros are ready to go by default
// they do not require any initialization.
//
// LOGLN() and LOG_TEELN() are variants which automatically
// include \n character at the end of the log string.
//
// LOG() behaves exactly like printf, by default writing to a logfile.
// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
//
// Default logfile is named
// "llama.<threadID>.log"
// Default LOG_TEE() secondary output target is
// stderr
//
// Logs can be dynamically disabled or enabled using functions:
// log_disable()
// and
// log_enable()
//
// A log target can be changed with:
// log_set_target( string )
// creating and opening, or re-opening a file by string filename
// or
// log_set_target( FILE* )
// allowing to point at stderr, stdout, or any valid FILE* file handler.
//
// --------
//
// End of Basic usage.
//
// --------------------------------
// Specifies a log target.
// default uses log_handler() with "llama.log" log file
// this can be changed, by defining LOG_TARGET
// like so:
//
// #define LOG_TARGET (a valid FILE*)
// #include "log.h"
//
// or it can be simply redirected to stdout or stderr
// like so:
//
// #define LOG_TARGET stderr
// #include "log.h"
//
// The log target can also be redirected to a different function
// like so:
//
// #define LOG_TARGET log_handler_different()
// #include "log.h"
//
// FILE* log_handler_different()
// {
// return stderr;
// }
//
// or:
//
// #define LOG_TARGET log_handler_another_one("somelog.log")
// #include "log.h"
//
// FILE* log_handler_another_one(char*filename)
// {
// static FILE* logfile = nullptr;
// (...)
// if( !logfile )
// {
// fopen(...)
// }
// (...)
// return logfile
// }
//
#ifndef LOG_TARGET
#define LOG_TARGET log_handler()
#endif
#ifndef LOG_TEE_TARGET
#define LOG_TEE_TARGET stderr
#endif
// Utility to obtain "pid" like unique process id and use it when creating log files.
inline std::string log_get_pid()
{
static std::string pid;
if (pid.empty())
{
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
// it's not the same as "pid" but is unique enough to solve multiple instances
// trying to write to the same log.
std::stringstream ss;
ss << std::this_thread::get_id();
pid = ss.str();
}
return pid;
}
// Utility function for generating log file names with unique id based on thread id.
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
// where the number is a runtime id of the current thread.
#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension)
// INTERNAL, DO NOT USE
inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension)
{
std::stringstream buf;
buf << log_file_basename;
buf << ".";
buf << log_get_pid();
buf << ".";
buf << log_file_extension;
return buf.str();
}
#ifndef LOG_DEFAULT_FILE_NAME
#define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
#endif
// Utility for turning #define values into string literals
// so we can have a define for stderr and
// we can print "stderr" instead of literal stderr, etc.
#define LOG_STRINGIZE1(s) #s
#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
// Allows disabling timestamps.
// in order to disable, define LOG_NO_TIMESTAMPS
// like so:
//
// #define LOG_NO_TIMESTAMPS
// #include "log.h"
//
#ifndef LOG_NO_TIMESTAMPS
#ifndef _MSC_VER
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#else
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#endif
#else
#define LOG_TIMESTAMP_FMT "%s"
#define LOG_TIMESTAMP_VAL ,""
#endif
#ifdef LOG_TEE_TIMESTAMPS
#ifndef _MSC_VER
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#else
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#endif
#else
#define LOG_TEE_TIMESTAMP_FMT "%s"
#define LOG_TEE_TIMESTAMP_VAL ,""
#endif
// Allows disabling file/line/function prefix
// in order to disable, define LOG_NO_FILE_LINE_FUNCTION
// like so:
//
// #define LOG_NO_FILE_LINE_FUNCTION
// #include "log.h"
//
#ifndef LOG_NO_FILE_LINE_FUNCTION
#ifndef _MSC_VER
#define LOG_FLF_FMT "[%24s:%5d][%24s] "
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#endif
#else
#define LOG_FLF_FMT "%s"
#define LOG_FLF_VAL ,""
#endif
#ifdef LOG_TEE_FILE_LINE_FUNCTION
#ifndef _MSC_VER
#define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#endif
#else
#define LOG_TEE_FLF_FMT "%s"
#define LOG_TEE_FLF_VAL ,""
#endif
// Utility for synchronizing log configuration state
// since std::optional was introduced only in c++17
enum LogTriState
{
LogTriStateSame,
LogTriStateFalse,
LogTriStateTrue
};
// INTERNAL, DO NOT USE
// USE LOG() INSTEAD
//
#ifndef _MSC_VER
#define LOG_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TARGET); \
} \
}
#else
#define LOG_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TARGET); \
} \
}
#endif
// INTERNAL, DO NOT USE
// USE LOG_TEE() INSTEAD
//
#ifndef _MSC_VER
#define LOG_TEE_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TARGET); \
} \
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
{ \
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TEE_TARGET); \
} \
}
#else
#define LOG_TEE_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TARGET); \
} \
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
{ \
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TEE_TARGET); \
} \
}
#endif
// The '\0' as a last argument, is a trick to bypass the silly
// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
// so we can have a single macro which can be called just like printf.
// Main LOG macro.
// behaves like printf, and supports arguments the exact same way.
//
#ifndef _MSC_VER
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
#else
#define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
#endif
// Main TEE macro.
// does the same as LOG
// and
// simultaneously writes stderr.
//
// Secondary target can be changed just like LOG_TARGET
// by defining LOG_TEE_TARGET
//
#ifndef _MSC_VER
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
#endif
// LOG macro variants with auto endline.
#ifndef _MSC_VER
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else
#define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n")
#define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n")
#endif
// INTERNAL, DO NOT USE
inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
{
static bool _initialized{false};
static bool _disabled{(filename.empty() && target == nullptr)};
static std::string log_current_filename{filename};
static FILE *log_current_target{target};
static FILE *logfile = nullptr;
if (change)
{
if (disable == LogTriStateTrue)
{
// Disable primary target
_disabled = true;
}
// If previously disabled, only enable, and keep previous target
else if (disable == LogTriStateFalse)
{
_disabled = false;
}
// Otherwise, process the arguments
else if (log_current_filename != filename || log_current_target != target)
{
_initialized = false;
}
}
if (_initialized)
{
if (_disabled)
{
// Log is disabled
return nullptr;
}
// with fallback in case something went wrong
return logfile ? logfile : stderr;
}
// do the (re)initialization
if (target != nullptr)
{
if (logfile != nullptr && logfile != stdout && logfile != stderr)
{
fclose(logfile);
}
log_current_filename = LOG_DEFAULT_FILE_NAME;
log_current_target = target;
logfile = target;
}
else
{
if (log_current_filename != filename)
{
if (logfile != nullptr && logfile != stdout && logfile != stderr)
{
fclose(logfile);
}
}
logfile = fopen(filename.c_str(), "w");
}
if (!logfile)
{
// Verify whether the file was opened, otherwise fallback to stderr
logfile = stderr;
fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
fflush(stderr);
// At this point we let the init flag be set to true below, and let the target fall back to stderr
// otherwise we would repeatedly fopen() which was already unsuccessful
}
_initialized = true;
return logfile ? logfile : stderr;
}
// INTERNAL, DO NOT USE
inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
{
return log_handler1_impl(change, disable, filename, target);
}
// Disables logs entirely at runtime.
// Makes LOG() and LOG_TEE() produce no output,
// until enabled again.
#define log_disable() log_disable_impl()
// INTERNAL, DO NOT USE
inline FILE *log_disable_impl()
{
return log_handler1_impl(true, LogTriStateTrue);
}
// Enables logs at runtime.
#define log_enable() log_enable_impl()
// INTERNAL, DO NOT USE
inline FILE *log_enable_impl()
{
return log_handler1_impl(true, LogTriStateFalse);
}
// Sets target for logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
#define log_set_target(target) log_set_target_impl(target)
// INTERNAL, DO NOT USE
inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); }
inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); }
// INTERNAL, DO NOT USE
inline FILE *log_handler() { return log_handler1_impl(); }
inline void log_test()
{
log_disable();
LOG("01 Hello World to nobody, because logs are disabled!\n")
log_enable();
LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET))
LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n")
log_set_target(stderr);
LOG("04 Hello World to stderr!\n")
LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n")
log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("06 Hello World to default log file!\n")
log_set_target(stdout);
LOG("07 Hello World to stdout!\n")
log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("08 Hello World to default log file again!\n")
log_disable();
LOG("09 Hello World _1_ into the void!\n")
log_enable();
LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n")
log_disable();
log_set_target("llama.anotherlog.log");
LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n")
log_enable();
LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n")
log_set_target("llama.yetanotherlog.log");
LOG("13 Hello World this time in yet new file?\n")
log_set_target(log_filename_generator("llama_autonamed", "log"));
LOG("14 Hello World in log with generated filename!\n")
#ifdef _MSC_VER
LOG_TEE("15 Hello msvc TEE without arguments\n")
LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test")
LOG_TEELN("17 Hello msvc TEELN without arguments\n")
LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test")
LOG("19 Hello msvc LOG without arguments\n")
LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test")
LOGLN("21 Hello msvc LOGLN without arguments\n")
LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test")
#endif
}
inline bool log_param_single_parse(const std::string & param)
{
if ( param == "--log-test")
{
log_test();
return true;
}
if ( param == "--log-disable")
{
log_disable();
return true;
}
if ( param == "--log-enable")
{
log_enable();
return true;
}
return false;
}
inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
{
if ( param == "--log-file")
{
if (!check_but_dont_parse)
{
log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
}
return true;
}
return false;
}
inline void log_print_usage()
{
fprintf(stdout, "log options:\n");
/* format
fprintf(stdout, " -h, --help show this help message and exit\n");*/
/* spacing
fprintf(stdout, "__-param----------------Description\n");*/
fprintf(stdout, " --log-test Run simple logging test\n");
fprintf(stdout, " --log-disable Disable trace logs\n");
fprintf(stdout, " --log-enable Enable trace logs\n");
fprintf(stdout, " --log-file Specify a log filename (without extension)\n");
fprintf(stdout, " Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */
}
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
// INTERNAL, DO NOT USE
inline void log_dump_cmdline_impl(int argc, char **argv)
{
std::stringstream buf;
for (int i = 0; i < argc; ++i)
{
if (std::string(argv[i]).find(' ') != std::string::npos)
{
buf << " \"" << argv[i] <<"\"";
}
else
{
buf << " " << argv[i];
}
}
LOGLN("Cmd:%s", buf.str().c_str())
}
#define log_tostr(var) log_var_to_string_impl(var).c_str()
inline std::string log_var_to_string_impl(bool var)
{
return var ? "true" : "false";
}
inline std::string log_var_to_string_impl(std::string var)
{
return var;
}
inline std::string log_var_to_string_impl(const std::vector<int> & var)
{
std::stringstream buf;
buf << "[ ";
bool first = true;
for (auto e : var)
{
if (first)
{
first = false;
}
else
{
buf << ", ";
}
buf << std::to_string(e);
}
buf << " ]";
return buf.str();
}
#define LOG_TOKENS_TOSTR_PRETTY(ctx, tokens) \
[&tokens, &ctx]() \
{ \
std::stringstream buf; \
buf << "[ "; \
\
bool first = true; \
for (const auto &token : tokens) \
{ \
if (!first) \
buf << ", "; \
else \
first = false; \
\
auto detokenized = llama_token_to_piece(ctx, token); \
\
detokenized.erase( \
std::remove_if( \
detokenized.begin(), \
detokenized.end(), \
[](const unsigned char c) { return !std::isprint(c); }), \
detokenized.end()); \
\
buf \
<< "'" << detokenized << "'" \
<< ":" << std::to_string(token); \
} \
buf << " ]"; \
\
return buf.str(); \
}() \
.c_str()
#ifdef LOG_DISABLE_LOGS
#undef LOG
#define LOG(...) // dummy stub
#undef LOGLN
#define LOGLN(...) // dummy stub
#undef LOG_TEE
#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
#undef LOG_TEELN
#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
#undef LOG_DISABLE
#define LOG_DISABLE() // dummy stub
#undef LOG_ENABLE
#define LOG_ENABLE() // dummy stub
#undef LOG_ENABLE
#define LOG_ENABLE() // dummy stub
#undef LOG_SET_TARGET
#define LOG_SET_TARGET(...) // dummy stub
#undef LOG_DUMP_CMDLINE
#define LOG_DUMP_CMDLINE(...) // dummy stub
#endif // LOG_DISABLE_LOGS
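
Editorial note: the header above documents the new logging API, but the commit itself shows no call site for it. The following is a minimal, hypothetical usage sketch (the file name "mylog" and the logged values are illustrative, not part of the commit):

```cpp
// Hypothetical example of driving the macros declared in log.h above.
#include "log.h"

int main(int argc, char ** argv) {
    // Send logs to a file named like "mylog.<thread-id>.log".
    log_set_target(log_filename_generator("mylog", "log"));
    log_dump_cmdline(argc, argv);      // record the command line in the log

    LOG("only in the log file: %d\n", 42);
    LOG_TEE("in the log file and on stderr\n");
    LOGLN("like LOG(), with the newline appended automatically");

    log_disable();
    LOG("dropped while logging is disabled\n");
    log_enable();

    return 0;
}
```

As the comments in the header note, the default target can also be overridden at compile time by defining LOG_TARGET before including the header.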

@@ -11,11 +11,14 @@ import sys
 from pathlib import Path
 from typing import Any
-import gguf
 import numpy as np
 import torch
 from transformers import AutoTokenizer # type: ignore[import]
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
 def bytes_to_unicode():
 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
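
(Editorial note: the pattern introduced here, and mirrored in the other conversion scripts in this commit, makes the scripts prefer the `gguf` package bundled in the repository: unless the `NO_LOCAL_GGUF` environment variable is set, `gguf-py/gguf` is inserted into `sys.path` before `import gguf`, so the in-tree copy takes precedence over any system-installed version.)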

@@ -11,11 +11,14 @@ import sys
 from pathlib import Path
 from typing import Any
-import gguf
 import numpy as np
 import torch
 from transformers import AutoTokenizer # type: ignore[import]
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py

@@ -1,261 +0,0 @@
#!/usr/bin/env python3
# 7b pth llama --> gguf conversion
# Only models with a single datafile are supported, like 7B
# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any
import gguf
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor # type: ignore[import]
if TYPE_CHECKING:
from typing import TypeAlias
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("consolidated."):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a PyTorch 7B LLaMA model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "LlamaForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
if num_parts > 1:
print("gguf: Only models with a single datafile are supported.")
sys.exit()
ARCH=gguf.MODEL_ARCH.LLAMA
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]
if "num_key_value_heads" in hparams:
head_count_kv = hparams["num_key_value_heads"]
else:
head_count_kv = head_count
if "_name_or_path" in hparams:
hf_repo = hparams["_name_or_path"]
else:
hf_repo = ""
if "max_sequence_length" in hparams:
ctx_length = hparams["max_sequence_length"]
elif "max_position_embeddings" in hparams:
ctx_length = hparams["max_position_embeddings"]
else:
print("gguf: can not find ctx length parameter.")
sys.exit()
gguf_writer.add_name(dir_model.name)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
if "type" in hparams["rope_scaling"]:
if hparams["rope_scaling"]["type"] == "linear":
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
tokenizer_model_file = dir_model / 'tokenizer.model'
if not tokenizer_model_file.is_file():
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
sys.exit(1)
# vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab and scores")
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
for i in range(tokenizer.vocab_size()):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1 # defualt to normal token type
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
# toktype = 4 is user-defined = tokens from added_tokens.json
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
added_tokens_file = dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
addtokens_json = json.load(f)
print("gguf: get added tokens")
for key in addtokens_json:
tokens.append( key.encode("utf-8") )
scores.append(-1000.0)
toktypes.append(4) # user-defined token type
gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name == "rope.freqs":
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

@@ -7,9 +7,13 @@ import struct
 import sys
 from pathlib import Path
-import gguf
 import numpy as np
+import os
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
 # Note: Does not support GGML_QKK_64
 QK_K = 256
 # Items here are (block size, type size)

@@ -1,280 +0,0 @@
#!/usr/bin/env python3
# HF llama --> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any
import gguf
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor # type: ignore[import]
if TYPE_CHECKING:
from typing import TypeAlias
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
# reverse HF permute back to original pth layout
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
def count_model_parts(dir_model: str) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "LlamaForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.LLAMA
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]
if "num_key_value_heads" in hparams:
head_count_kv = hparams["num_key_value_heads"]
else:
head_count_kv = head_count
if "_name_or_path" in hparams:
hf_repo = hparams["_name_or_path"]
else:
hf_repo = ""
if "max_sequence_length" in hparams:
ctx_length = hparams["max_sequence_length"]
elif "max_position_embeddings" in hparams:
ctx_length = hparams["max_position_embeddings"]
else:
print("gguf: can not find ctx length parameter.")
sys.exit()
gguf_writer.add_name(dir_model.name)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
if "type" in hparams["rope_scaling"]:
if hparams["rope_scaling"]["type"] == "linear":
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
tokenizer_model_file = dir_model / 'tokenizer.model'
if not tokenizer_model_file.is_file():
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
sys.exit(1)
# vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
for i in range(tokenizer.vocab_size()):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1 # defualt to normal token type
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
# toktype = 4 is user-defined = tokens from added_tokens.json
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
added_tokens_file = dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
addtokens_json = json.load(f)
print("gguf: get added tokens")
for key in addtokens_json:
tokens.append( key.encode("utf-8") )
scores.append(-1000.0)
toktypes.append(4) # user-defined token type
gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name.endswith(".rotary_emb.inv_freq"):
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# reverse permute these
if name.endswith(".q_proj.weight"):
data = reverse_hf_permute(data, head_count)
if name.endswith(".k_proj.weight"):
data = reverse_hf_permute(data, head_count, head_count_kv)
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

@@ -25,10 +25,14 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar
-import gguf
 import numpy as np
 from sentencepiece import SentencePieceProcessor # type: ignore[import]
+import os
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
 if TYPE_CHECKING:
 from typing import TypeAlias
@@ -526,7 +530,7 @@ class LazyTensor:
 raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')
-LazyModel = dict[str, LazyTensor]
+LazyModel: TypeAlias = 'dict[str, LazyTensor]'
 @dataclass

View file

@@ -1617,15 +1617,10 @@ int main(int argc, char ** argv) {
 float error_before_opt = ggml_get_f32_1d(e, 0);
-struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM);
 struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
-opt_params_adam.print_forward_graph = false;
-opt_params_adam.print_backward_graph = false;
 opt_params_lbfgs.print_forward_graph = false;
 opt_params_lbfgs.print_backward_graph = false;
-opt_params_adam.adam.n_iter = 16;
 opt_params_lbfgs.lbfgs.n_iter = 16;
-// ggml_opt(ctx0, opt_params_adam, e);
 ggml_opt(ctx0, opt_params_lbfgs, e);
 //
 ggml_build_forward_expand(&gf, e);

@@ -22,7 +22,9 @@
 #include <unistd.h>
 #elif defined (_WIN32)
 #define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
 # define NOMINMAX
+#endif
 #include <windows.h>
 #include <signal.h>
 #endif
@@ -73,7 +75,7 @@ void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_stat
 assert(0u < beams_state.n_beams);
 const llama_token * tokens = beams_state.beam_views[0].tokens;
 std::copy(tokens, tokens + n, callback_data.response.end() - n);
-printf("%lu", n);
+printf("%zu", n);
 }
 fflush(stdout);
 #if 1 // DEBUG: print current beams for this iteration
@@ -145,7 +147,7 @@ int main(int argc, char ** argv)
 if (tokens_list.size() > max_tokens_list_size)
 {
-fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" ,
+fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
 __func__ , tokens_list.size() , max_tokens_list_size );
 return 1;
 }

@@ -75,7 +75,7 @@ typedef struct {
 int seq_len; // max sequence length
 } Config;
-typedef struct {
+struct TransformerWeights {
 // token embedding table
 float* token_embedding_table; // (vocab_size, dim)
 // weights for rmsnorms
@@ -97,7 +97,22 @@ typedef struct {
 // float* freq_cis_imag; // (seq_len, dim/2)
 // (optional) classifier weights for the logits, on the last layer
 float* wcls;
-} TransformerWeights;
+~TransformerWeights() {
+delete[] token_embedding_table;
+delete[] rms_att_weight;
+delete[] rms_ffn_weight;
+delete[] wq;
+delete[] wk;
+delete[] wv;
+delete[] wo;
+delete[] w1;
+delete[] w2;
+delete[] w3;
+delete[] rms_final_weight;
+delete[] wcls;
+}
+};
 void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
 // we calloc instead of malloc to keep valgrind happy
@@ -173,21 +188,6 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar
 return 0;
 }
-void free_weights(TransformerWeights* w) {
-delete w->token_embedding_table;
-delete w->rms_att_weight;
-delete w->rms_ffn_weight;
-delete w->wq;
-delete w->wk;
-delete w->wv;
-delete w->wo;
-delete w->w1;
-delete w->w2;
-delete w->w3;
-delete w->rms_final_weight;
-if (w->wcls) delete w->wcls;
-}
 void print_sample_weights(TransformerWeights *w){
 printf("----- Quick print of first of the weight vales of all the variables\n");
 printf("%f\n", w->token_embedding_table[0]);
@@ -596,6 +596,10 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
 // assume llama2.c vocabulary
 printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
 llama_file file(filename, "rb");
+if (!file.fp) {
+fprintf(stderr, "error: %s: %s\n", strerror(errno), filename);
+exit(1);
+}
 const int n_vocab = config->vocab_size;
 /* uint32_t max_token_length = */ file.read_u32(); // unused
 vocab->id_to_token.resize(n_vocab);
@@ -633,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
 }
 }
-void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){
+void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
 int ct;
 switch (gg_weights->n_dims){
 case 1:
@@ -670,13 +674,13 @@ void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * kar
 }
 void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
-// stuff AK weights into GG weights one by one.
+// convert AK weights into GG weights one by one.
 // w->token_embedding_table -> model->tok_embeddings
 // float* -> struct ggml_tensor
-stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
-stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
-stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
+convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
+convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
 //print_row(model->norm, 0);
 // for rms-att-weight
@@ -686,18 +690,18 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
 for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
 auto & layer = model->layers[i];
 // 1d
-stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
-stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
+convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+convert_weights_ak_to_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
 // from 3d matrix layer x dim x dim to 2d matrix dim x dim
-stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
-stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
-stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
-stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
-stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
-stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
-stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
+convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
+convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]);
+convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]);
+convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
+convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
+convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
+convert_weights_ak_to_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
 }
 struct gguf_context * ctx = gguf_init_empty();
@@ -898,7 +902,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
 }
 std::string basename(const std::string &path) {
-size_t pos = path.find_last_of("/");
+size_t pos = path.find_last_of("/\\");
 if (pos == std::string::npos) {
 return path;
 }
@@ -911,7 +915,7 @@ int main(int argc, char ** argv) {
 return 1;
 }
 Config config;
-TransformerWeights weights;
+TransformerWeights weights = {};
 {
 FILE *file = fopen(params.fn_llama2c_model, "rb");
 if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
@@ -953,6 +957,5 @@ int main(int argc, char ** argv) {
 printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
 ggml_free(model.ctx);
-free_weights(&weights);
 return 0;
 }
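
(Editorial note: the TransformerWeights changes above move cleanup into a destructor. The arrays are released with `delete[]` where the old `free_weights()` used plain `delete`, and because the destructor runs when `weights` goes out of scope, the explicit `free_weights(&weights)` call at the end of `main` is dropped and early returns can no longer leak the buffers.)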

@@ -34,7 +34,7 @@ For an interactive experience, try this command:
 #### Unix-based systems (Linux, macOS, etc.):
 ```bash
-./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \
+./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
 'User: Hi
 AI: Hello. I am an AI chatbot. Would you like to talk?
 User: Sure!
@@ -45,7 +45,7 @@ User:'
 #### Windows:
 ```powershell
-main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
+main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
 ```
 The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):

@@ -33,6 +33,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
 { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
 { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
 { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
+// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
+{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
 };
@@ -69,12 +71,17 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 void usage(const char * executable) {
-fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
-fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
+printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
-fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
+printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
-fprintf(stderr, "\nAllowed quantization types:\n");
+printf("\nAllowed quantization types:\n");
 for (auto & it : QUANT_OPTIONS) {
-printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
+if (it.name != "COPY") {
+printf(" %2d or ", it.ftype);
+} else {
+printf(" ");
+}
+printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
 }
 exit(1);
 }
@@ -119,6 +126,9 @@ int main(int argc, char ** argv) {
 // export as [inp path]/ggml-model-[ftype].gguf
 fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
 arg_idx++;
+if (ftype_str == "COPY") {
+params.only_copy = true;
+}
 }
 else {
 fname_out = argv[arg_idx];
@@ -131,6 +141,10 @@ int main(int argc, char ** argv) {
 if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
 fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
 return 1;
+} else {
+if (ftype_str == "COPY") {
+params.only_copy = true;
+}
 }
 arg_idx++;
 }
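
(Editorial note: the new COPY type above reuses LLAMA_FTYPE_ALL_F32 but sets `params.only_copy`, so quantize rewrites the tensors unchanged. Judging from the usage text, an invocation along the lines of `./quantize ggml-model-f32.gguf ggml-model-copy.gguf COPY` (file names illustrative) would copy tensors to the output without requantizing.)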

@@ -17,6 +17,8 @@
 #include "completion.js.hpp"
 #include "json-schema-to-grammar.mjs.hpp"
+#include <cstddef>
 #ifndef SERVER_VERBOSE
 #define SERVER_VERBOSE 1
 #endif
@@ -1038,7 +1040,7 @@ static json format_timings(llama_server_context &llama)
 {
 const auto timings = llama_get_timings(llama.ctx);
-assert(timings.n_eval == llama.num_tokens_predicted);
+assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
 return json{
 {"prompt_n", timings.n_p_eval},
@@ -1239,7 +1241,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
 const llama_token * tokens = beams_state.beam_views[0].tokens;
 const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
 std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
-printf("%lu", n);
+printf("%zu", n);
 }
 fflush(stdout);
 #if 0 // DEBUG: print current beams for this iteration
@@ -1377,7 +1379,13 @@ int main(int argc, char **argv)
 }
 }
-const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);
+auto probs = llama.generated_token_probs;
+if (llama.params.n_probs > 0 && llama.stopped_word) {
+const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
+probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
+}
+const json data = format_final_response(llama, llama.generated_text, probs);
 llama_print_timings(llama.ctx);
@@ -1454,7 +1462,11 @@ int main(int argc, char **argv)
 if (!llama.has_next_token) {
 // Generation is done, send extra information.
-const json data = format_final_response(llama, "", llama.generated_token_probs);
+const json data = format_final_response(
+llama,
+"",
+std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.begin() + sent_token_probs_index)
+);
 const std::string str =
 "data: " +
@@ -1548,7 +1560,7 @@ int main(int argc, char **argv)
 svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep)
 {
-const auto * fmt = "500 Internal Server Error\n%s";
+const char fmt[] = "500 Internal Server Error\n%s";
 char buf[BUFSIZ];
 try {
 std::rethrow_exception(std::move(ep));

@@ -2,13 +2,16 @@
 # train-text-from-scratch checkpoint --> gguf conversion
 import argparse
-import gguf
 import os
 import struct
 import sys
 import numpy as np
 from pathlib import Path
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py' / 'gguf'))
+import gguf
 # gguf constants
 LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
 LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"

@@ -81,12 +81,29 @@
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
 const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
 const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
 const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
 return reinterpret_cast<const int&>(c);
+#else
+int8x4_t c;
+int16_t tmp;
+#pragma unroll
+for (int i = 0; i < 4; i++) {
+tmp = va[i] - vb[i];
+if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+c[i] = tmp;
+}
+return reinterpret_cast<int&>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
 }
 static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
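
Editorial note: the fallback loop above emulates saturating byte subtraction when `__builtin_elementwise_sub_sat` is unavailable, clamping each 8-bit lane to the int8 range instead of letting it wrap. A scalar sketch of the same behaviour in plain C++ (names and values here are illustrative, not part of the commit):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Saturating int8 subtraction, mirroring the per-lane fallback above.
static int8_t sub_sat_i8(int8_t a, int8_t b) {
    int16_t tmp = int16_t(a) - int16_t(b);                     // widen to avoid wrap-around
    tmp = std::min<int16_t>(std::max<int16_t>(tmp, -128), 127); // clamp to int8 range
    return static_cast<int8_t>(tmp);
}

int main() {
    std::printf("%d\n", sub_sat_i8(100, -100));  // 200 clamps to 127
    std::printf("%d\n", sub_sat_i8(-100, 100));  // -200 clamps to -128
    return 0;
}
```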

@ -680,6 +680,12 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_ADD: case GGML_OP_ADD:
{ {
GGML_ASSERT(ggml_is_contiguous(src0));
// utilize float4
GGML_ASSERT(ne00 % 4 == 0);
const int64_t nb = ne00/4;
if (ggml_nelements(src1) == ne10) { if (ggml_nelements(src1) == ne10) {
// src1 is a row // src1 is a row
[encoder setComputePipelineState:ctx->pipeline_add_row]; [encoder setComputePipelineState:ctx->pipeline_add_row];
@ -689,14 +695,20 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; [encoder setBytes:&nb length:sizeof(nb) atIndex:3];
const int64_t n = ggml_nelements(dst); const int64_t n = ggml_nelements(dst)/4;
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
case GGML_OP_MUL: case GGML_OP_MUL:
{ {
GGML_ASSERT(ggml_is_contiguous(src0));
// utilize float4
GGML_ASSERT(ne00 % 4 == 0);
const int64_t nb = ne00/4;
if (ggml_nelements(src1) == ne10) { if (ggml_nelements(src1) == ne10) {
// src1 is a row // src1 is a row
[encoder setComputePipelineState:ctx->pipeline_mul_row]; [encoder setComputePipelineState:ctx->pipeline_mul_row];
@ -706,9 +718,9 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; [encoder setBytes:&nb length:sizeof(nb) atIndex:3];
const int64_t n = ggml_nelements(dst); const int64_t n = ggml_nelements(dst)/4;
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
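
The float4 change above packs four floats per thread: the row length passed to the kernel becomes nb = ne00/4 and the grid shrinks to ggml_nelements(dst)/4. A CPU reference of the broadcast-add indexing (a sketch assuming contiguous data and ne00 % 4 == 0, matching the asserts; the function name is hypothetical):

#include <cstdint>

// CPU reference for the float4 "add_row" broadcast: src1 is one row of ne00
// floats, and each "thread" index tpig handles four consecutive floats.
static void add_row_ref(const float * src0, const float * src1, float * dst,
                        int64_t n_elements, int64_t ne00) {
    const int64_t nb = ne00 / 4;               // row length in float4 units
    for (int64_t tpig = 0; tpig < n_elements / 4; ++tpig) {
        const int64_t row4 = tpig % nb;        // which float4 of the broadcast row
        for (int k = 0; k < 4; ++k) {
            dst[tpig*4 + k] = src0[tpig*4 + k] + src1[row4*4 + k];
        }
    }
}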
@ -840,7 +852,7 @@ void ggml_metal_graph_compute(
switch (src0t) { switch (src0t) {
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ {
nth0 = 64; nth0 = 32;
nth1 = 1; nth1 = 1;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
} break; } break;

View file

@ -25,9 +25,9 @@ typedef struct {
} block_q8_0; } block_q8_0;
kernel void kernel_add( kernel void kernel_add(
device const float * src0, device const float4 * src0,
device const float * src1, device const float4 * src1,
device float * dst, device float4 * dst,
uint tpig[[thread_position_in_grid]]) { uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] + src1[tpig]; dst[tpig] = src0[tpig] + src1[tpig];
} }
@ -35,18 +35,18 @@ kernel void kernel_add(
// assumption: src1 is a row // assumption: src1 is a row
// broadcast src1 into src0 // broadcast src1 into src0
kernel void kernel_add_row( kernel void kernel_add_row(
device const float * src0, device const float4 * src0,
device const float * src1, device const float4 * src1,
device float * dst, device float4 * dst,
constant int64_t & ne00, constant int64_t & nb,
uint tpig[[thread_position_in_grid]]) { uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] + src1[tpig % ne00]; dst[tpig] = src0[tpig] + src1[tpig % nb];
} }
kernel void kernel_mul( kernel void kernel_mul(
device const float * src0, device const float4 * src0,
device const float * src1, device const float4 * src1,
device float * dst, device float4 * dst,
uint tpig[[thread_position_in_grid]]) { uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] * src1[tpig]; dst[tpig] = src0[tpig] * src1[tpig];
} }
@ -54,12 +54,12 @@ kernel void kernel_mul(
// assumption: src1 is a row // assumption: src1 is a row
// broadcast src1 into src0 // broadcast src1 into src0
kernel void kernel_mul_row( kernel void kernel_mul_row(
device const float * src0, device const float4 * src0,
device const float * src1, device const float4 * src1,
device float * dst, device float4 * dst,
constant int64_t & ne00, constant int64_t & nb,
uint tpig[[thread_position_in_grid]]) { uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] * src1[tpig % ne00]; dst[tpig] = src0[tpig] * src1[tpig % nb];
} }
kernel void kernel_scale( kernel void kernel_scale(
@ -528,24 +528,42 @@ kernel void kernel_mul_mat_f16_f32(
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
sum[tpitg.x] = 0.0f; uint ith = tpitg.x;
uint nth = tptg.x;
for (int i = tpitg.x; i < ne00; i += tptg.x) { sum[ith] = 0.0f;
sum[tpitg.x] += (float) x[i] * (float) y[i];
for (int i = ith; i < ne00; i += nth) {
sum[ith] += (float) x[i] * (float) y[i];
} }
// accumulate the sum from all threads in the threadgroup // accumulate the sum from all threads in the threadgroup
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint i = tptg.x/2; i > 0; i /= 2) { if (ith%4 == 0) {
if (tpitg.x < i) { for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
sum[tpitg.x] += sum[tpitg.x + i];
} }
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%16 == 0) {
for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
} }
threadgroup_barrier(mem_flags::mem_threadgroup);
if (tpitg.x == 0) { if (ith == 0) {
for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0]; dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
} }
// Original implementation. Left behind commented out for now
//threadgroup_barrier(mem_flags::mem_threadgroup);
//for (uint i = tptg.x/2; i > 0; i /= 2) {
// if (tpitg.x < i) {
// sum[tpitg.x] += sum[tpitg.x + i];
// }
// threadgroup_barrier(mem_flags::mem_threadgroup);
//}
//
//if (tpitg.x == 0) {
// dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0];
//}
} }
kernel void kernel_alibi_f32( kernel void kernel_alibi_f32(
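
The rewritten reduction above replaces the binary tree with three fixed stages: every 4th thread folds its group of four partial sums, every 16th thread folds four such groups, and thread 0 folds the 16-strided partials (which pairs with nth0 dropping from 64 to 32). A sequential sketch of the same arithmetic (plain C++, assumes the thread count is a multiple of 16; barriers become loop boundaries):

#include <vector>

// Staged reduction reference: fold by 4, then by 16, then stride-16 into sum[0].
static float staged_reduce(std::vector<float> sum) {
    const int nth = (int) sum.size();          // assumed to be a multiple of 16
    for (int ith = 0; ith < nth; ith += 4)
        for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
    for (int ith = 0; ith < nth; ith += 16)
        for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
    for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
    return sum[0];
}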

ggml.c
View file

@ -301,6 +301,10 @@ typedef double ggml_float;
#endif #endif
#endif #endif
#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>
#endif
#ifdef __F16C__ #ifdef __F16C__
#ifdef _MSC_VER #ifdef _MSC_VER
@ -2678,6 +2682,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
} }
*s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#elif defined(__riscv_v_intrinsic)
float sumf = 0.0;
size_t vl = __riscv_vsetvl_e8m1(qk/2);
for (int i = 0; i < nb; i++) {
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
}
*s = sumf;
#else #else
// scalar // scalar
float sumf = 0.0; float sumf = 0.0;
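
The RVV block above vectorizes the usual q4_0 x q8_0 inner loop: low nibbles pair with the first half of the q8_0 quants, high nibbles with the second half, and both are recentred by 8 before the widening multiply and reduction. A scalar sketch of one block's contribution (helper name is hypothetical; the deltas are shown as plain floats, the real structs store ggml_fp16_t):

#include <cstdint>

// One q4_0 x q8_0 block: 16 packed bytes of 4-bit quants against 32 int8 quants.
static float dot_q4_0_block_ref(const uint8_t * qs, const int8_t * y0, const int8_t * y1,
                                float dx, float dy) {
    int sumi = 0;
    for (int j = 0; j < 16; ++j) {
        const int v0 = (qs[j] & 0x0F) - 8;   // low nibble,  pairs with y0[j]
        const int v1 = (qs[j] >>   4) - 8;   // high nibble, pairs with y1[j]
        sumi += v0 * y0[j] + v1 * y1[j];
    }
    return sumi * dx * dy;
}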
@ -2804,6 +2843,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
} }
*s = hsum_float_8(acc) + summs; *s = hsum_float_8(acc) + summs;
#elif defined(__riscv_v_intrinsic)
float sumf = 0.0;
size_t vl = __riscv_vsetvl_e8m1(qk/2);
for (int i = 0; i < nb; i++) {
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
}
*s = sumf;
#else #else
// scalar // scalar
float sumf = 0.0; float sumf = 0.0;
@ -3038,6 +3109,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
} }
*s = hsum_float_8(acc); *s = hsum_float_8(acc);
#elif defined(__riscv_v_intrinsic)
float sumf = 0.0;
uint32_t qh;
// These temp values are for masking and shift operations
uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
size_t vl = __riscv_vsetvl_e8m1(qk/2);
for (int i = 0; i < nb; i++) {
memcpy(&qh, x[i].qh, sizeof(uint32_t));
// temporary registers
vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
// ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
// ((qh & (1u << (j + 16))) >> (j + 12));
vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
// narrowing
vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
// load
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
}
*s = sumf;
#else #else
// scalar // scalar
float sumf = 0.0; float sumf = 0.0;
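
The extra work in the q5_0 path above is reassembling the 5th bit: qh packs one high bit per quant, with bit j belonging to the low-nibble quant j and bit j+16 to the high-nibble quant j; the combined value is then recentred by 16. A scalar sketch of that unpacking (hypothetical helper, not from the patch):

#include <cstdint>

// Rebuild the 16 low-nibble and 16 high-nibble q5_0 quants from qs and qh.
static void unpack_q5_0_ref(const uint8_t * qs, uint32_t qh, int8_t * v0, int8_t * v1) {
    for (int j = 0; j < 16; ++j) {
        const uint8_t xh0 = ((qh >> (j + 0 )) & 1u) << 4;   // 5th bit of low-nibble quant j
        const uint8_t xh1 = ((qh >> (j + 16)) & 1u) << 4;   // 5th bit of high-nibble quant j
        v0[j] = (int8_t) (((qs[j] & 0x0F) | xh0) - 16);
        v1[j] = (int8_t) (((qs[j] >>   4) | xh1) - 16);
    }
}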
@ -3294,6 +3435,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
} }
*s = hsum_float_8(acc) + summs; *s = hsum_float_8(acc) + summs;
#elif defined(__riscv_v_intrinsic)
float sumf = 0.0;
uint32_t qh;
// These temp values are for shift operations
uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
size_t vl = __riscv_vsetvl_e8m1(qk/2);
for (int i = 0; i < nb; i++) {
memcpy(&qh, x[i].qh, sizeof(uint32_t));
// temporary registers
vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
// load qh
vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
// ((qh >> (j + 0)) << 4) & 0x10;
vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
// ((qh >> (j + 12)) ) & 0x10;
vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
// narrowing
vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
// load
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
}
*s = sumf;
#else #else
// scalar // scalar
float sumf = 0.0; float sumf = 0.0;
@ -3405,6 +3612,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
} }
*s = hsum_float_8(acc); *s = hsum_float_8(acc);
#elif defined(__riscv_v_intrinsic)
float sumf = 0.0;
size_t vl = __riscv_vsetvl_e8m1(qk);
for (int i = 0; i < nb; i++) {
// load elements
vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
}
*s = sumf;
#else #else
// scalar // scalar
float sumf = 0.0; float sumf = 0.0;
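
The q8_0 x q8_0 case is the simplest of the RVV paths above: a widening int8 multiply, a widening reduction to int32, then a scale by both block deltas. A scalar reference (a sketch: the block layout is simplified to a plain float delta, where the real struct stores ggml_fp16_t):

#include <cstdint>

// Simplified q8_0 block and the corresponding dot product over nb blocks.
struct block_q8_0_ref { float d; int8_t qs[32]; };

static float dot_q8_0_ref(const block_q8_0_ref * x, const block_q8_0_ref * y, int nb) {
    float sumf = 0.0f;
    for (int i = 0; i < nb; ++i) {
        int sumi = 0;
        for (int j = 0; j < 32; ++j) sumi += x[i].qs[j] * y[i].qs[j];
        sumf += sumi * (x[i].d * y[i].d);
    }
    return sumf;
}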

grammars/c.gbnf Normal file
View file

@ -0,0 +1,42 @@
root ::= (declaration)*
declaration ::= dataType identifier "(" parameter? ")" "{" statement* "}"
dataType ::= "int" ws | "float" ws | "char" ws
identifier ::= [a-zA-Z_] [a-zA-Z_0-9]*
parameter ::= dataType identifier
statement ::=
( dataType identifier ws "=" ws expression ";" ) |
( identifier ws "=" ws expression ";" ) |
( identifier ws "(" argList? ")" ";" ) |
( "return" ws expression ";" ) |
( "while" "(" condition ")" "{" statement* "}" ) |
( "for" "(" forInit ";" ws condition ";" ws forUpdate ")" "{" statement* "}" ) |
( "if" "(" condition ")" "{" statement* "}" ("else" "{" statement* "}")? ) |
( singleLineComment ) |
( multiLineComment )
forInit ::= dataType identifier ws "=" ws expression | identifier ws "=" ws expression
forUpdate ::= identifier ws "=" ws expression
condition ::= expression relationOperator expression
relationOperator ::= ("<=" | "<" | "==" | "!=" | ">=" | ">")
expression ::= term (("+" | "-") term)*
term ::= factor(("*" | "/") factor)*
factor ::= identifier | number | unaryTerm | funcCall | parenExpression
unaryTerm ::= "-" factor
funcCall ::= identifier "(" argList? ")"
parenExpression ::= "(" ws expression ws ")"
argList ::= expression ("," ws expression)*
number ::= [0-9]+
singleLineComment ::= "//" [^\n]* "\n"
multiLineComment ::= "/*" ( [^*] | ("*" [^/]) )* "*/"
ws ::= ([ \t\n]+)
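
As a sanity check, the grammar above derives strings such as the following (note that ws is mandatory after each dataType keyword and after return, and a parameter list may hold at most one parameter):

int square(int x){return x*x;}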

View file

@ -183,13 +183,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
int ntry, float alpha) { int ntry, float alpha) {
float min = x[0]; float min = x[0];
float max = x[0]; float max = x[0];
float sum_x = 0;
float sum_x2 = 0;
for (int i = 1; i < n; ++i) { for (int i = 1; i < n; ++i) {
if (x[i] < min) min = x[i]; if (x[i] < min) min = x[i];
if (x[i] > max) max = x[i]; if (x[i] > max) max = x[i];
sum_x += x[i];
sum_x2 += x[i]*x[i];
} }
if (max == min) { if (max == min) {
for (int i = 0; i < n; ++i) L[i] = 0; for (int i = 0; i < n; ++i) L[i] = 0;
@ -2060,7 +2056,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
__m256 acc = _mm256_setzero_ps(); __m256 acc = _mm256_setzero_ps();
uint32_t *aux; const uint32_t *aux;
for (int i = 0; i < nb; ++i) { for (int i = 0; i < nb; ++i) {
@ -2070,7 +2066,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
const int8_t * restrict q8 = y[i].qs; const int8_t * restrict q8 = y[i].qs;
// Set up scales // Set up scales
aux = (uint32_t *)x[i].scales; aux = (const uint32_t *)x[i].scales;
__m128i scales128 = _mm_set_epi32( __m128i scales128 = _mm_set_epi32(
((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),

View file

@ -223,13 +223,24 @@ def load_model(model_filename):
else: else:
inputs.tensor_split[n] = 0 inputs.tensor_split[n] = 0
# we must force an explicit tensor split
# otherwise the default will divide equally and multigpu crap will slow it down badly
inputs.cublas_info = 0 inputs.cublas_info = 0
if not args.tensor_split:
if (args.usecublas and "0" in args.usecublas): if (args.usecublas and "0" in args.usecublas):
os.environ["CUDA_VISIBLE_DEVICES"] = "0" os.environ["CUDA_VISIBLE_DEVICES"] = "0"
elif (args.usecublas and "1" in args.usecublas): elif (args.usecublas and "1" in args.usecublas):
os.environ["CUDA_VISIBLE_DEVICES"] = "1" os.environ["CUDA_VISIBLE_DEVICES"] = "1"
elif (args.usecublas and "2" in args.usecublas): elif (args.usecublas and "2" in args.usecublas):
os.environ["CUDA_VISIBLE_DEVICES"] = "2" os.environ["CUDA_VISIBLE_DEVICES"] = "2"
else:
if (args.usecublas and "0" in args.usecublas):
inputs.cublas_info = 0
elif (args.usecublas and "1" in args.usecublas):
inputs.cublas_info = 1
elif (args.usecublas and "2" in args.usecublas):
inputs.cublas_info = 2
inputs.executable_path = (getdirpath()+"/").encode("UTF-8") inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
inputs.debugmode = args.debugmode inputs.debugmode = args.debugmode
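
When no explicit tensor split is given, the branch above pins a single GPU by exporting CUDA_VISIBLE_DEVICES; the same effect from C++ would look roughly like this (a hedged sketch using POSIX setenv, hypothetical helper, and it only works if it runs before the CUDA runtime initializes):

#include <cstdlib>

// Restrict the process to one CUDA device by index, e.g. "0".
static void pin_single_gpu(const char * device_index) {
    setenv("CUDA_VISIBLE_DEVICES", device_index, /*overwrite=*/ 1);
}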

View file

@ -3610,7 +3610,7 @@ static void llama_grammar_advance_stack(
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) { std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
if (stack.empty()) { if (stack.empty()) {
new_stacks.push_back(stack); new_stacks.emplace_back(stack);
return; return;
} }
@ -3647,7 +3647,7 @@ static void llama_grammar_advance_stack(
} }
case LLAMA_GRETYPE_CHAR: case LLAMA_GRETYPE_CHAR:
case LLAMA_GRETYPE_CHAR_NOT: case LLAMA_GRETYPE_CHAR_NOT:
new_stacks.push_back(stack); new_stacks.emplace_back(stack);
break; break;
default: default:
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@ -4406,7 +4406,7 @@ struct llama_logit_info {
} }
return min_heap; return min_heap;
} }
float probability_from_logit(float logit) { float probability_from_logit(float logit) const {
return normalizer * std::exp(logit - max_l); return normalizer * std::exp(logit - max_l);
} }
}; };
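
probability_from_logit above is the usual max-subtracted softmax: normalizer is 1 / sum(exp(logit - max_l)), so subtracting max_l keeps exp() in a safe range. A standalone sketch of the same normalisation (plain C++, assumes a non-empty logit vector, names chosen to mirror the struct):

#include <algorithm>
#include <cmath>
#include <vector>

// Convert logits to probabilities with the max-subtraction trick.
static std::vector<float> softmax_ref(const std::vector<float> & logits) {
    const float max_l = *std::max_element(logits.begin(), logits.end());
    double sum = 0.0;
    for (const float l : logits) sum += std::exp(l - max_l);
    const float normalizer = (float) (1.0 / sum);
    std::vector<float> probs;
    probs.reserve(logits.size());
    for (const float l : logits) probs.push_back(normalizer * std::exp(l - max_l));
    return probs;
}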
@ -4696,6 +4696,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
llm_load_arch(*ml, model); llm_load_arch(*ml, model);
llm_load_hparams(*ml, model, 0, 0, 0); llm_load_hparams(*ml, model, 0, 0, 0);
if (params->only_copy) {
ftype = model.ftype;
}
const size_t align = GGUF_DEFAULT_ALIGNMENT; const size_t align = GGUF_DEFAULT_ALIGNMENT;
struct gguf_context * ctx_out = gguf_init_empty(); struct gguf_context * ctx_out = gguf_init_empty();
@ -4782,18 +4786,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// quantize only 2D tensors // quantize only 2D tensors
quantize &= (tensor->n_dims == 2); quantize &= (tensor->n_dims == 2);
quantize &= params->quantize_output_tensor || name != "output.weight"; quantize &= params->quantize_output_tensor || name != "output.weight";
quantize &= quantized_type != tensor->type; quantize &= !params->only_copy;
enum ggml_type new_type; enum ggml_type new_type;
void * new_data; void * new_data;
size_t new_size; size_t new_size;
if (!quantize) { if (quantize) {
new_type = tensor->type;
new_data = tensor->data;
new_size = ggml_nbytes(tensor);
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
} else {
new_type = quantized_type; new_type = quantized_type;
#ifdef GGML_USE_K_QUANTS #ifdef GGML_USE_K_QUANTS
// TODO: avoid hardcoded tensor names - use the TN_* constants // TODO: avoid hardcoded tensor names - use the TN_* constants
@ -4892,7 +4891,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} }
} }
#endif #endif
// If we've decided to quantize to the same type the tensor is already
// in then there's nothing to do.
quantize = tensor->type != new_type;
}
if (!quantize) {
new_type = tensor->type;
new_data = tensor->data;
new_size = ggml_nbytes(tensor);
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
} else {
const size_t nelements = ggml_nelements(tensor); const size_t nelements = ggml_nelements(tensor);
float * f32_data; float * f32_data;
@ -5323,6 +5331,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
/*.allow_requantize =*/ false, /*.allow_requantize =*/ false,
/*.quantize_output_tensor =*/ true, /*.quantize_output_tensor =*/ true,
/*.only_copy =*/ false,
}; };
return result; return result;

View file

@ -164,6 +164,7 @@ extern "C" {
enum llama_ftype ftype; // quantize to this llama_ftype enum llama_ftype ftype; // quantize to this llama_ftype
bool allow_requantize; // allow quantizing non-f32/f16 tensors bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
} llama_model_quantize_params; } llama_model_quantize_params;
// grammar types // grammar types
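
A hedged usage sketch for the new only_copy flag (hypothetical helper; as documented above, ftype, allow_requantize and quantize_output_tensor are ignored when it is set and the output keeps the input model's ftype):

#include "llama.h"

// Copy tensors into a new GGUF file without requantizing them.
static int copy_model(const char * fname_inp, const char * fname_out) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.only_copy = true;   // skip requantization, keep the source ftype
    return llama_model_quantize(fname_inp, fname_out, &params);
}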