diff --git a/CMakeLists.txt b/CMakeLists.txt index b56af440f..82ec82e0d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,6 +173,7 @@ if (LLAMA_ALL_WARNINGS) -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int + -Wno-unused-function ) set(cxx_flags -Wall @@ -182,6 +183,10 @@ if (LLAMA_ALL_WARNINGS) -Wno-unused-function -Wno-multichar ) + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # g++ only + set(cxx_flags ${cxx_flags} -Wno-format-truncation) + endif() else() # todo : msvc endif() diff --git a/common/common.cpp b/common/common.cpp index ed09fc27d..41fc59ced 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -24,7 +24,9 @@ #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN -#define NOMINMAX +#ifndef NOMINMAX +# define NOMINMAX +#endif #include #include #include @@ -1027,7 +1029,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str()); fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); - fprintf(stream, "hellaswag_tasks: %ld # default: 400\n", params.hellaswag_tasks); + fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx)); const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY; diff --git a/common/console.cpp b/common/console.cpp index 8efa2a674..23545e5be 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -235,6 +235,7 @@ namespace console { int estimateWidth(char32_t codepoint) { #if defined(_WIN32) + (void)codepoint; return 1; #else return wcwidth(codepoint); diff --git a/common/log.h b/common/log.h index 120a3ff9f..bf9fafd68 100644 --- a/common/log.h +++ b/common/log.h @@ -1 +1,643 @@ -#define LOG(...) \ No newline at end of file +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +// -------------------------------- +// +// Basic usage: +// +// -------- +// +// The LOG() and LOG_TEE() macros are ready to go by default +// they do not require any initialization. +// +// LOGLN() and LOG_TEELN() are variants which automatically +// include \n character at the end of the log string. +// +// LOG() behaves exactly like printf, by default writing to a logfile. +// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ). +// +// Default logfile is named +// "llama..log" +// Default LOG_TEE() secondary output target is +// stderr +// +// Logs can be dynamically disabled or enabled using functions: +// log_disable() +// and +// log_enable() +// +// A log target can be changed with: +// log_set_target( string ) +// creating and opening, or re-opening a file by string filename +// or +// log_set_target( FILE* ) +// allowing to point at stderr, stdout, or any valid FILE* file handler. +// +// -------- +// +// End of Basic usage. +// +// -------------------------------- + +// Specifies a log target. +// default uses log_handler() with "llama.log" log file +// this can be changed, by defining LOG_TARGET +// like so: +// +// #define LOG_TARGET (a valid FILE*) +// #include "log.h" +// +// or it can be simply redirected to stdout or stderr +// like so: +// +// #define LOG_TARGET stderr +// #include "log.h" +// +// The log target can also be redirected to a diffrent function +// like so: +// +// #define LOG_TARGET log_handler_diffrent() +// #include "log.h" +// +// FILE* log_handler_diffrent() +// { +// return stderr; +// } +// +// or: +// +// #define LOG_TARGET log_handler_another_one("somelog.log") +// #include "log.h" +// +// FILE* log_handler_another_one(char*filename) +// { +// static FILE* logfile = nullptr; +// (...) +// if( !logfile ) +// { +// fopen(...) +// } +// (...) +// return logfile +// } +// +#ifndef LOG_TARGET + #define LOG_TARGET log_handler() +#endif + +#ifndef LOG_TEE_TARGET + #define LOG_TEE_TARGET stderr +#endif + +// Utility to obtain "pid" like unique process id and use it when creating log files. +inline std::string log_get_pid() +{ + static std::string pid; + if (pid.empty()) + { + // std::this_thread::get_id() is the most portable way of obtaining a "process id" + // it's not the same as "pid" but is unique enough to solve multiple instances + // trying to write to the same log. + std::stringstream ss; + ss << std::this_thread::get_id(); + pid = ss.str(); + } + + return pid; +} + +// Utility function for generating log file names with unique id based on thread id. +// invocation with log_filename_generator( "llama", "log" ) creates a string "llama..log" +// where the number is a runtime id of the current thread. + +#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension) + +// INTERNAL, DO NOT USE +inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension) +{ + std::stringstream buf; + + buf << log_file_basename; + buf << "."; + buf << log_get_pid(); + buf << "."; + buf << log_file_extension; + + return buf.str(); +} + +#ifndef LOG_DEFAULT_FILE_NAME + #define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log") +#endif + +// Utility for turning #define values into string literals +// so we can have a define for stderr and +// we can print "stderr" instead of literal stderr, etc. +#define LOG_STRINGIZE1(s) #s +#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s) + +#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET) + +// Allows disabling timestamps. +// in order to disable, define LOG_NO_TIMESTAMPS +// like so: +// +// #define LOG_NO_TIMESTAMPS +// #include "log.h" +// +#ifndef LOG_NO_TIMESTAMPS + #ifndef _MSC_VER + #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] " + #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() + #else + #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] " + #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() + #endif +#else + #define LOG_TIMESTAMP_FMT "%s" + #define LOG_TIMESTAMP_VAL ,"" +#endif + +#ifdef LOG_TEE_TIMESTAMPS + #ifndef _MSC_VER + #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] " + #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() + #else + #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] " + #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() + #endif +#else + #define LOG_TEE_TIMESTAMP_FMT "%s" + #define LOG_TEE_TIMESTAMP_VAL ,"" +#endif + +// Allows disabling file/line/function prefix +// in order to disable, define LOG_NO_FILE_LINE_FUNCTION +// like so: +// +// #define LOG_NO_FILE_LINE_FUNCTION +// #include "log.h" +// +#ifndef LOG_NO_FILE_LINE_FUNCTION + #ifndef _MSC_VER + #define LOG_FLF_FMT "[%24s:%5d][%24s] " + #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ + #else + #define LOG_FLF_FMT "[%24s:%5ld][%24s] " + #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ + #endif +#else + #define LOG_FLF_FMT "%s" + #define LOG_FLF_VAL ,"" +#endif + +#ifdef LOG_TEE_FILE_LINE_FUNCTION + #ifndef _MSC_VER + #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] " + #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ + #else + #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] " + #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ + #endif +#else + #define LOG_TEE_FLF_FMT "%s" + #define LOG_TEE_FLF_VAL ,"" +#endif + +// Utility for synchronizing log configuration state +// since std::optional was introduced only in c++17 +enum LogTriState +{ + LogTriStateSame, + LogTriStateFalse, + LogTriStateTrue +}; + +// INTERNAL, DO NOT USE +// USE LOG() INSTEAD +// +#ifndef _MSC_VER + #define LOG_IMPL(str, ...) \ + { \ + if (LOG_TARGET != nullptr) \ + { \ + fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ + fflush(LOG_TARGET); \ + } \ + } +#else + #define LOG_IMPL(str, ...) \ + { \ + if (LOG_TARGET != nullptr) \ + { \ + fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ + fflush(LOG_TARGET); \ + } \ + } +#endif + +// INTERNAL, DO NOT USE +// USE LOG_TEE() INSTEAD +// +#ifndef _MSC_VER + #define LOG_TEE_IMPL(str, ...) \ + { \ + if (LOG_TARGET != nullptr) \ + { \ + fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ + fflush(LOG_TARGET); \ + } \ + if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \ + { \ + fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \ + fflush(LOG_TEE_TARGET); \ + } \ + } +#else + #define LOG_TEE_IMPL(str, ...) \ + { \ + if (LOG_TARGET != nullptr) \ + { \ + fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ + fflush(LOG_TARGET); \ + } \ + if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \ + { \ + fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \ + fflush(LOG_TEE_TARGET); \ + } \ + } +#endif + +// The '\0' as a last argument, is a trick to bypass the silly +// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro" +// so we can have a single macro which can be called just like printf. + +// Main LOG macro. +// behaves like printf, and supports arguments the exact same way. +// +#ifndef _MSC_VER + #define LOG(...) LOG_IMPL(__VA_ARGS__, "") +#else + #define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "") +#endif + +// Main TEE macro. +// does the same as LOG +// and +// simultaneously writes stderr. +// +// Secondary target can be changed just like LOG_TARGET +// by defining LOG_TEE_TARGET +// +#ifndef _MSC_VER + #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "") +#else + #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "") +#endif + +// LOG macro variants with auto endline. +#ifndef _MSC_VER + #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n") + #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n") +#else + #define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n") + #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n") +#endif + +// INTERNAL, DO NOT USE +inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) +{ + static bool _initialized{false}; + static bool _disabled{(filename.empty() && target == nullptr)}; + static std::string log_current_filename{filename}; + static FILE *log_current_target{target}; + static FILE *logfile = nullptr; + + if (change) + { + if (disable == LogTriStateTrue) + { + // Disable primary target + _disabled = true; + } + // If previously disabled, only enable, and keep previous target + else if (disable == LogTriStateFalse) + { + _disabled = false; + } + // Otherwise, process the arguments + else if (log_current_filename != filename || log_current_target != target) + { + _initialized = false; + } + } + + if (_initialized) + { + if (_disabled) + { + // Log is disabled + return nullptr; + } + + // with fallback in case something went wrong + return logfile ? logfile : stderr; + } + + // do the (re)initialization + if (target != nullptr) + { + if (logfile != nullptr && logfile != stdout && logfile != stderr) + { + fclose(logfile); + } + + log_current_filename = LOG_DEFAULT_FILE_NAME; + log_current_target = target; + + logfile = target; + } + else + { + if (log_current_filename != filename) + { + if (logfile != nullptr && logfile != stdout && logfile != stderr) + { + fclose(logfile); + } + } + + logfile = fopen(filename.c_str(), "w"); + } + + if (!logfile) + { + // Verify whether the file was opened, otherwise fallback to stderr + logfile = stderr; + + fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno)); + fflush(stderr); + + // At this point we let the init flag be to true below, and let the target fallback to stderr + // otherwise we would repeatedly fopen() which was already unsuccessful + } + + _initialized = true; + + return logfile ? logfile : stderr; +} + +// INTERNAL, DO NOT USE +inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME) +{ + return log_handler1_impl(change, disable, filename, target); +} + +// Disables logs entirely at runtime. +// Makes LOG() and LOG_TEE() produce no output, +// untill enabled back. +#define log_disable() log_disable_impl() + +// INTERNAL, DO NOT USE +inline FILE *log_disable_impl() +{ + return log_handler1_impl(true, LogTriStateTrue); +} + +// Enables logs at runtime. +#define log_enable() log_enable_impl() + +// INTERNAL, DO NOT USE +inline FILE *log_enable_impl() +{ + return log_handler1_impl(true, LogTriStateFalse); +} + +// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*) +#define log_set_target(target) log_set_target_impl(target) + +// INTERNAL, DO NOT USE +inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); } +inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); } + +// INTERNAL, DO NOT USE +inline FILE *log_handler() { return log_handler1_impl(); } + +inline void log_test() +{ + log_disable(); + LOG("01 Hello World to nobody, because logs are disabled!\n") + log_enable(); + LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET)) + LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n") + log_set_target(stderr); + LOG("04 Hello World to stderr!\n") + LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n") + log_set_target(LOG_DEFAULT_FILE_NAME); + LOG("06 Hello World to default log file!\n") + log_set_target(stdout); + LOG("07 Hello World to stdout!\n") + log_set_target(LOG_DEFAULT_FILE_NAME); + LOG("08 Hello World to default log file again!\n") + log_disable(); + LOG("09 Hello World _1_ into the void!\n") + log_enable(); + LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n") + log_disable(); + log_set_target("llama.anotherlog.log"); + LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n") + log_enable(); + LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n") + log_set_target("llama.yetanotherlog.log"); + LOG("13 Hello World this time in yet new file?\n") + log_set_target(log_filename_generator("llama_autonamed", "log")); + LOG("14 Hello World in log with generated filename!\n") +#ifdef _MSC_VER + LOG_TEE("15 Hello msvc TEE without arguments\n") + LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test") + LOG_TEELN("17 Hello msvc TEELN without arguments\n") + LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test") + LOG("19 Hello msvc LOG without arguments\n") + LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test") + LOGLN("21 Hello msvc LOGLN without arguments\n") + LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test") +#endif +} + +inline bool log_param_single_parse(const std::string & param) +{ + if ( param == "--log-test") + { + log_test(); + return true; + } + + if ( param == "--log-disable") + { + log_disable(); + return true; + } + + if ( param == "--log-enable") + { + log_enable(); + return true; + } + + return false; +} + +inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string()) +{ + if ( param == "--log-file") + { + if (!check_but_dont_parse) + { + log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log")); + } + + return true; + } + + return false; +} + +inline void log_print_usage() +{ + fprintf(stdout, "log options:\n"); + /* format + fprintf(stdout, " -h, --help show this help message and exit\n");*/ + /* spacing + fprintf(stdout, "__-param----------------Description\n");*/ + fprintf(stdout, " --log-test Run simple logging test\n"); + fprintf(stdout, " --log-disable Disable trace logs\n"); + fprintf(stdout, " --log-enable Enable trace logs\n"); + fprintf(stdout, " --log-file Specify a log filename (without extension)\n"); + fprintf(stdout, " Log file will be tagged with unique ID and written as \"..log\"\n"); /* */ +} + +#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv) + +// INTERNAL, DO NOT USE +inline void log_dump_cmdline_impl(int argc, char **argv) +{ + std::stringstream buf; + for (int i = 0; i < argc; ++i) + { + if (std::string(argv[i]).find(' ') != std::string::npos) + { + buf << " \"" << argv[i] <<"\""; + } + else + { + buf << " " << argv[i]; + } + } + LOGLN("Cmd:%s", buf.str().c_str()) +} + +#define log_tostr(var) log_var_to_string_impl(var).c_str() + +inline std::string log_var_to_string_impl(bool var) +{ + return var ? "true" : "false"; +} + +inline std::string log_var_to_string_impl(std::string var) +{ + return var; +} + +inline std::string log_var_to_string_impl(const std::vector & var) +{ + std::stringstream buf; + buf << "[ "; + bool first = true; + for (auto e : var) + { + if (first) + { + first = false; + } + else + { + buf << ", "; + } + buf << std::to_string(e); + } + buf << " ]"; + + return buf.str(); +} + +#define LOG_TOKENS_TOSTR_PRETTY(ctx, tokens) \ + [&tokens, &ctx]() \ + { \ + std::stringstream buf; \ + buf << "[ "; \ + \ + bool first = true; \ + for (const auto &token : tokens) \ + { \ + if (!first) \ + buf << ", "; \ + else \ + first = false; \ + \ + auto detokenized = llama_token_to_piece(ctx, token); \ + \ + detokenized.erase( \ + std::remove_if( \ + detokenized.begin(), \ + detokenized.end(), \ + [](const unsigned char c) { return !std::isprint(c); }), \ + detokenized.end()); \ + \ + buf \ + << "'" << detokenized << "'" \ + << ":" << std::to_string(token); \ + } \ + buf << " ]"; \ + \ + return buf.str(); \ + }() \ + .c_str() + +#ifdef LOG_DISABLE_LOGS + +#undef LOG +#define LOG(...) // dummy stub +#undef LOGLN +#define LOGLN(...) // dummy stub + +#undef LOG_TEE +#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf + +#undef LOG_TEELN +#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf + +#undef LOG_DISABLE +#define LOG_DISABLE() // dummy stub + +#undef LOG_ENABLE +#define LOG_ENABLE() // dummy stub + +#undef LOG_ENABLE +#define LOG_ENABLE() // dummy stub + +#undef LOG_SET_TARGET +#define LOG_SET_TARGET(...) // dummy stub + +#undef LOG_DUMP_CMDLINE +#define LOG_DUMP_CMDLINE(...) // dummy stub + +#endif // LOG_DISABLE_LOGS diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py index ec786ff67..271e58972 100755 --- a/convert-falcon-hf-to-gguf.py +++ b/convert-falcon-hf-to-gguf.py @@ -11,11 +11,14 @@ import sys from pathlib import Path from typing import Any -import gguf import numpy as np import torch from transformers import AutoTokenizer # type: ignore[import] +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + def bytes_to_unicode(): # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py diff --git a/convert-gptneox-hf-to-gguf.py b/convert-gptneox-hf-to-gguf.py index 852123d99..b9c8b4607 100755 --- a/convert-gptneox-hf-to-gguf.py +++ b/convert-gptneox-hf-to-gguf.py @@ -11,11 +11,14 @@ import sys from pathlib import Path from typing import Any -import gguf import numpy as np import torch from transformers import AutoTokenizer # type: ignore[import] +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py diff --git a/convert-llama-7b-pth-to-gguf.py b/convert-llama-7b-pth-to-gguf.py deleted file mode 100755 index 6574c11dd..000000000 --- a/convert-llama-7b-pth-to-gguf.py +++ /dev/null @@ -1,261 +0,0 @@ -#!/usr/bin/env python3 -# 7b pth llama --> gguf conversion -# Only models with a single datafile are supported, like 7B -# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import TYPE_CHECKING, Any - -import gguf -import numpy as np -import torch -from sentencepiece import SentencePieceProcessor # type: ignore[import] - -if TYPE_CHECKING: - from typing import TypeAlias - -NDArray: TypeAlias = 'np.ndarray[Any, Any]' - - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("consolidated."): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a PyTorch 7B LLaMA model to a GGML compatible file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") - parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "LlamaForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - sys.exit() - -# get number of model parts -num_parts = count_model_parts(dir_model) - -if num_parts > 1: - print("gguf: Only models with a single datafile are supported.") - - sys.exit() - -ARCH=gguf.MODEL_ARCH.LLAMA -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - - -print("gguf: get model metadata") - -block_count = hparams["num_hidden_layers"] -head_count = hparams["num_attention_heads"] - -if "num_key_value_heads" in hparams: - head_count_kv = hparams["num_key_value_heads"] -else: - head_count_kv = head_count - -if "_name_or_path" in hparams: - hf_repo = hparams["_name_or_path"] -else: - hf_repo = "" - -if "max_sequence_length" in hparams: - ctx_length = hparams["max_sequence_length"] -elif "max_position_embeddings" in hparams: - ctx_length = hparams["max_position_embeddings"] -else: - print("gguf: can not find ctx length parameter.") - - sys.exit() - - -gguf_writer.add_name(dir_model.name) -gguf_writer.add_source_hf_repo(hf_repo) -gguf_writer.add_tensor_data_layout("Meta AI original pth") -gguf_writer.add_context_length(ctx_length) -gguf_writer.add_embedding_length(hparams["hidden_size"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) -gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) -gguf_writer.add_head_count(head_count) -gguf_writer.add_head_count_kv(head_count_kv) -gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - -if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: - if "type" in hparams["rope_scaling"]: - if hparams["rope_scaling"]["type"] == "linear": - gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"]) - - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytes] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -tokenizer_model_file = dir_model / 'tokenizer.model' -if not tokenizer_model_file.is_file(): - print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr) - sys.exit(1) - -# vocab type sentencepiece -print("gguf: get sentencepiece tokenizer vocab and scores") - -tokenizer = SentencePieceProcessor(str(tokenizer_model_file)) - -for i in range(tokenizer.vocab_size()): - text: bytes - score: float - - piece = tokenizer.id_to_piece(i) - text = piece.encode("utf-8") - score = tokenizer.get_score(i) - - toktype = 1 # defualt to normal token type - if tokenizer.is_unknown(i): - toktype = 2 - if tokenizer.is_control(i): - toktype = 3 - - # toktype = 4 is user-defined = tokens from added_tokens.json - - if tokenizer.is_unused(i): - toktype = 5 - if tokenizer.is_byte(i): - toktype = 6 - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - -added_tokens_file = dir_model / 'added_tokens.json' -if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - addtokens_json = json.load(f) - - print("gguf: get added tokens") - - for key in addtokens_json: - tokens.append( key.encode("utf-8") ) - scores.append(-1000.0) - toktypes.append(4) # user-defined token type - -gguf_writer.add_tokenizer_model("llama") -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# tensor info -print("gguf: get tensor metadata") - -part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts)) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - for name in model_part.keys(): - data = model_part[name] - - # we don't need these - if name == "rope.freqs": - continue - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-llama-ggmlv3-to-gguf.py b/convert-llama-ggmlv3-to-gguf.py index 3f39bc39e..08ba0c490 100755 --- a/convert-llama-ggmlv3-to-gguf.py +++ b/convert-llama-ggmlv3-to-gguf.py @@ -7,9 +7,13 @@ import struct import sys from pathlib import Path -import gguf import numpy as np +import os +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + # Note: Does not support GGML_QKK_64 QK_K = 256 # Items here are (block size, type size) diff --git a/convert-llama-hf-to-gguf.py b/convert-llama-hf-to-gguf.py deleted file mode 100755 index c453c83c3..000000000 --- a/convert-llama-hf-to-gguf.py +++ /dev/null @@ -1,280 +0,0 @@ -#!/usr/bin/env python3 -# HF llama --> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import TYPE_CHECKING, Any - -import gguf -import numpy as np -import torch -from sentencepiece import SentencePieceProcessor # type: ignore[import] - -if TYPE_CHECKING: - from typing import TypeAlias - -NDArray: TypeAlias = 'np.ndarray[Any, Any]' - -# reverse HF permute back to original pth layout -# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py - - -def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - -def count_model_parts(dir_model: str) -> int: - num_parts = 0 - - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") - parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "LlamaForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit() - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH=gguf.MODEL_ARCH.LLAMA -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams["num_hidden_layers"] -head_count = hparams["num_attention_heads"] - -if "num_key_value_heads" in hparams: - head_count_kv = hparams["num_key_value_heads"] -else: - head_count_kv = head_count - -if "_name_or_path" in hparams: - hf_repo = hparams["_name_or_path"] -else: - hf_repo = "" - -if "max_sequence_length" in hparams: - ctx_length = hparams["max_sequence_length"] -elif "max_position_embeddings" in hparams: - ctx_length = hparams["max_position_embeddings"] -else: - print("gguf: can not find ctx length parameter.") - - sys.exit() - - -gguf_writer.add_name(dir_model.name) -gguf_writer.add_source_hf_repo(hf_repo) -gguf_writer.add_tensor_data_layout("Meta AI original pth") -gguf_writer.add_context_length(ctx_length) -gguf_writer.add_embedding_length(hparams["hidden_size"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) -gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) -gguf_writer.add_head_count(head_count) -gguf_writer.add_head_count_kv(head_count_kv) -gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - -if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: - if "type" in hparams["rope_scaling"]: - if hparams["rope_scaling"]["type"] == "linear": - gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"]) - - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytes] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -tokenizer_model_file = dir_model / 'tokenizer.model' -if not tokenizer_model_file.is_file(): - print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr) - sys.exit(1) - -# vocab type sentencepiece -print("gguf: get sentencepiece tokenizer vocab, scores and token types") - -tokenizer = SentencePieceProcessor(str(tokenizer_model_file)) - -for i in range(tokenizer.vocab_size()): - text: bytes - score: float - - piece = tokenizer.id_to_piece(i) - text = piece.encode("utf-8") - score = tokenizer.get_score(i) - - toktype = 1 # defualt to normal token type - if tokenizer.is_unknown(i): - toktype = 2 - if tokenizer.is_control(i): - toktype = 3 - - # toktype = 4 is user-defined = tokens from added_tokens.json - - if tokenizer.is_unused(i): - toktype = 5 - if tokenizer.is_byte(i): - toktype = 6 - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - -added_tokens_file = dir_model / 'added_tokens.json' -if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - addtokens_json = json.load(f) - - print("gguf: get added tokens") - - for key in addtokens_json: - tokens.append( key.encode("utf-8") ) - scores.append(-1000.0) - toktypes.append(4) # user-defined token type - - -gguf_writer.add_tokenizer_model("llama") -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - for name in model_part.keys(): - data = model_part[name] - - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # reverse permute these - if name.endswith(".q_proj.weight"): - data = reverse_hf_permute(data, head_count) - if name.endswith(".k_proj.weight"): - data = reverse_hf_permute(data, head_count, head_count_kv) - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert.py b/convert.py index 9a39edb99..6c89b5ecc 100755 --- a/convert.py +++ b/convert.py @@ -25,10 +25,14 @@ from dataclasses import dataclass from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar -import gguf import numpy as np from sentencepiece import SentencePieceProcessor # type: ignore[import] +import os +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + if TYPE_CHECKING: from typing import TypeAlias @@ -526,7 +530,7 @@ class LazyTensor: raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.') -LazyModel = dict[str, LazyTensor] +LazyModel: TypeAlias = 'dict[str, LazyTensor]' @dataclass diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 6fa55b319..a99ece9a6 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1617,15 +1617,10 @@ int main(int argc, char ** argv) { float error_before_opt = ggml_get_f32_1d(e, 0); - struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); - opt_params_adam.print_forward_graph = false; - opt_params_adam.print_backward_graph = false; opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; - opt_params_adam.adam.n_iter = 16; opt_params_lbfgs.lbfgs.n_iter = 16; - // ggml_opt(ctx0, opt_params_adam, e); ggml_opt(ctx0, opt_params_lbfgs, e); // ggml_build_forward_expand(&gf, e); diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp index 42c7c7254..4d021434b 100644 --- a/examples/beam-search/beam-search.cpp +++ b/examples/beam-search/beam-search.cpp @@ -22,7 +22,9 @@ #include #elif defined (_WIN32) #define WIN32_LEAN_AND_MEAN -#define NOMINMAX +#ifndef NOMINMAX +# define NOMINMAX +#endif #include #include #endif @@ -73,7 +75,7 @@ void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_stat assert(0u < beams_state.n_beams); const llama_token * tokens = beams_state.beam_views[0].tokens; std::copy(tokens, tokens + n, callback_data.response.end() - n); - printf("%lu", n); + printf("%zu", n); } fflush(stdout); #if 1 // DEBUG: print current beams for this iteration @@ -145,7 +147,7 @@ int main(int argc, char ** argv) if (tokens_list.size() > max_tokens_list_size) { - fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" , + fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" , __func__ , tokens_list.size() , max_tokens_list_size ); return 1; } diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index e9e070b1f..9e856c21a 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -75,7 +75,7 @@ typedef struct { int seq_len; // max sequence length } Config; -typedef struct { +struct TransformerWeights { // token embedding table float* token_embedding_table; // (vocab_size, dim) // weights for rmsnorms @@ -97,7 +97,22 @@ typedef struct { // float* freq_cis_imag; // (seq_len, dim/2) // (optional) classifier weights for the logits, on the last layer float* wcls; -} TransformerWeights; + + ~TransformerWeights() { + delete[] token_embedding_table; + delete[] rms_att_weight; + delete[] rms_ffn_weight; + delete[] wq; + delete[] wk; + delete[] wv; + delete[] wo; + delete[] w1; + delete[] w2; + delete[] w3; + delete[] rms_final_weight; + delete[] wcls; + } +}; void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) { // we calloc instead of malloc to keep valgrind happy @@ -173,21 +188,6 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar return 0; } -void free_weights(TransformerWeights* w) { - delete w->token_embedding_table; - delete w->rms_att_weight; - delete w->rms_ffn_weight; - delete w->wq; - delete w->wk; - delete w->wv; - delete w->wo; - delete w->w1; - delete w->w2; - delete w->w3; - delete w->rms_final_weight; - if (w->wcls) delete w->wcls; -} - void print_sample_weights(TransformerWeights *w){ printf("----- Quick print of first of the weight vales of all the variables\n"); printf("%f\n", w->token_embedding_table[0]); @@ -596,6 +596,10 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) // assume llama2.c vocabulary printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename); llama_file file(filename, "rb"); + if (!file.fp) { + fprintf(stderr, "error: %s: %s\n", strerror(errno), filename); + exit(1); + } const int n_vocab = config->vocab_size; /* uint32_t max_token_length = */ file.read_u32(); // unused vocab->id_to_token.resize(n_vocab); @@ -633,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) } } -void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){ +void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { int ct; switch (gg_weights->n_dims){ case 1: @@ -670,13 +674,13 @@ void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * kar } void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) { - // stuff AK weights into GG weights one by one. + // convert AK weights into GG weights one by one. // w->token_embedding_table -> model->tok_embeddings // float* -> struct ggml_tensor - stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table); - stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table); + convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table); + convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table); - stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight); + convert_weights_ak_to_gg(model->norm, w->rms_final_weight); //print_row(model->norm, 0); // for rms-att-weight @@ -686,18 +690,18 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ auto & layer = model->layers[i]; // 1d - stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]); - stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]); + convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]); + convert_weights_ak_to_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]); // from 3d matrix layer x dim x dim to 2d matrix dim x dim - stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]); - stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]); - stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]); - stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]); + convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]); + convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]); + convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]); + convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]); - stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]); - stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]); - stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]); + convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]); + convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]); + convert_weights_ak_to_gg(layer.w3 , &w->w3[i*row_length*n_ff]); } struct gguf_context * ctx = gguf_init_empty(); @@ -898,7 +902,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) { } std::string basename(const std::string &path) { - size_t pos = path.find_last_of("/"); + size_t pos = path.find_last_of("/\\"); if (pos == std::string::npos) { return path; } @@ -911,7 +915,7 @@ int main(int argc, char ** argv) { return 1; } Config config; - TransformerWeights weights; + TransformerWeights weights = {}; { FILE *file = fopen(params.fn_llama2c_model, "rb"); if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; } @@ -953,6 +957,5 @@ int main(int argc, char ** argv) { printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model); ggml_free(model.ctx); - free_weights(&weights); return 0; } diff --git a/examples/main/README.md b/examples/main/README.md index d555afdcc..2773fe976 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -34,7 +34,7 @@ For an interactive experience, try this command: #### Unix-based systems (Linux, macOS, etc.): ```bash -./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \ +./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \ 'User: Hi AI: Hello. I am an AI chatbot. Would you like to talk? User: Sure! @@ -45,7 +45,7 @@ User:' #### Windows: ```powershell -main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:" +main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:" ``` The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it): diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 1598a89bf..6b13b281d 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -33,6 +33,8 @@ static const std::vector QUANT_OPTIONS = { { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, + // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching. + { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; @@ -69,12 +71,17 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std: // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // void usage(const char * executable) { - fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); - fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); - fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); - fprintf(stderr, "\nAllowed quantization types:\n"); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); + printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); + printf("\nAllowed quantization types:\n"); for (auto & it : QUANT_OPTIONS) { - printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str()); + if (it.name != "COPY") { + printf(" %2d or ", it.ftype); + } else { + printf(" "); + } + printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str()); } exit(1); } @@ -119,6 +126,9 @@ int main(int argc, char ** argv) { // export as [inp path]/ggml-model-[ftype].gguf fname_out = fpath + "ggml-model-" + ftype_str + ".gguf"; arg_idx++; + if (ftype_str == "COPY") { + params.only_copy = true; + } } else { fname_out = argv[arg_idx]; @@ -131,6 +141,10 @@ int main(int argc, char ** argv) { if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]); return 1; + } else { + if (ftype_str == "COPY") { + params.only_copy = true; + } } arg_idx++; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b485a5ead..94def943b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -17,6 +17,8 @@ #include "completion.js.hpp" #include "json-schema-to-grammar.mjs.hpp" +#include + #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 #endif @@ -1038,7 +1040,7 @@ static json format_timings(llama_server_context &llama) { const auto timings = llama_get_timings(llama.ctx); - assert(timings.n_eval == llama.num_tokens_predicted); + assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted)); return json{ {"prompt_n", timings.n_p_eval}, @@ -1239,7 +1241,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) { const llama_token * tokens = beams_state.beam_views[0].tokens; const auto map = [](llama_token tok) { return completion_token_output{{},tok}; }; std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map); - printf("%lu", n); + printf("%zu", n); } fflush(stdout); #if 0 // DEBUG: print current beams for this iteration @@ -1377,7 +1379,13 @@ int main(int argc, char **argv) } } - const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs); + auto probs = llama.generated_token_probs; + if (llama.params.n_probs > 0 && llama.stopped_word) { + const std::vector stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false); + probs = std::vector(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size()); + } + + const json data = format_final_response(llama, llama.generated_text, probs); llama_print_timings(llama.ctx); @@ -1454,7 +1462,11 @@ int main(int argc, char **argv) if (!llama.has_next_token) { // Generation is done, send extra information. - const json data = format_final_response(llama, "", llama.generated_token_probs); + const json data = format_final_response( + llama, + "", + std::vector(llama.generated_token_probs.begin(), llama.generated_token_probs.begin() + sent_token_probs_index) + ); const std::string str = "data: " + @@ -1548,7 +1560,7 @@ int main(int argc, char **argv) svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) { - const auto * fmt = "500 Internal Server Error\n%s"; + const char fmt[] = "500 Internal Server Error\n%s"; char buf[BUFSIZ]; try { std::rethrow_exception(std::move(ep)); diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py index 01b3ee92a..a527d6153 100644 --- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py @@ -2,13 +2,16 @@ # train-text-from-scratch checkpoint --> gguf conversion import argparse -import gguf import os import struct import sys import numpy as np from pathlib import Path +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py' / 'gguf')) +import gguf + # gguf constants LLM_KV_OPTIMIZER_TYPE = "optimizer.type" LLM_KV_OPTIMIZER_TYPE_ADAM = "adam" diff --git a/ggml-cuda.cu b/ggml-cuda.cu index ba25b1539..a4801e806 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -81,12 +81,29 @@ #if defined(GGML_USE_HIPBLAS) #define __CUDA_ARCH__ 1300 +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); static __device__ __forceinline__ int __vsubss4(const int a, const int b) { const int8x4_t va = reinterpret_cast(a); const int8x4_t vb = reinterpret_cast(b); +#if __has_builtin(__builtin_elementwise_sub_sat) const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); return reinterpret_cast(c); +#else + int8x4_t c; + int16_t tmp; +#pragma unroll + for (int i = 0; i < 4; i++) { + tmp = va[i] - vb[i]; + if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); + if(tmp < std::numeric_limits::min()) tmp = std::numeric_limits::min(); + c[i] = tmp; + } + return reinterpret_cast(c); +#endif // __has_builtin(__builtin_elementwise_sub_sat) } static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { diff --git a/ggml-metal.m b/ggml-metal.m index e929c4b07..4267db9be 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -680,6 +680,12 @@ void ggml_metal_graph_compute( } break; case GGML_OP_ADD: { + GGML_ASSERT(ggml_is_contiguous(src0)); + + // utilize float4 + GGML_ASSERT(ne00 % 4 == 0); + const int64_t nb = ne00/4; + if (ggml_nelements(src1) == ne10) { // src1 is a row [encoder setComputePipelineState:ctx->pipeline_add_row]; @@ -689,14 +695,20 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:3]; - const int64_t n = ggml_nelements(dst); + const int64_t n = ggml_nelements(dst)/4; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_MUL: { + GGML_ASSERT(ggml_is_contiguous(src0)); + + // utilize float4 + GGML_ASSERT(ne00 % 4 == 0); + const int64_t nb = ne00/4; + if (ggml_nelements(src1) == ne10) { // src1 is a row [encoder setComputePipelineState:ctx->pipeline_mul_row]; @@ -706,9 +718,9 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:3]; - const int64_t n = ggml_nelements(dst); + const int64_t n = ggml_nelements(dst)/4; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; @@ -840,7 +852,7 @@ void ggml_metal_graph_compute( switch (src0t) { case GGML_TYPE_F16: { - nth0 = 64; + nth0 = 32; nth1 = 1; [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index 82e1a0c7a..8cdf0b9d2 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -25,9 +25,9 @@ typedef struct { } block_q8_0; kernel void kernel_add( - device const float * src0, - device const float * src1, - device float * dst, + device const float4 * src0, + device const float4 * src1, + device float4 * dst, uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] + src1[tpig]; } @@ -35,18 +35,18 @@ kernel void kernel_add( // assumption: src1 is a row // broadcast src1 into src0 kernel void kernel_add_row( - device const float * src0, - device const float * src1, - device float * dst, - constant int64_t & ne00, + device const float4 * src0, + device const float4 * src1, + device float4 * dst, + constant int64_t & nb, uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] + src1[tpig % ne00]; + dst[tpig] = src0[tpig] + src1[tpig % nb]; } kernel void kernel_mul( - device const float * src0, - device const float * src1, - device float * dst, + device const float4 * src0, + device const float4 * src1, + device float4 * dst, uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] * src1[tpig]; } @@ -54,12 +54,12 @@ kernel void kernel_mul( // assumption: src1 is a row // broadcast src1 into src0 kernel void kernel_mul_row( - device const float * src0, - device const float * src1, - device float * dst, - constant int64_t & ne00, + device const float4 * src0, + device const float4 * src1, + device float4 * dst, + constant int64_t & nb, uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] * src1[tpig % ne00]; + dst[tpig] = src0[tpig] * src1[tpig % nb]; } kernel void kernel_scale( @@ -528,24 +528,42 @@ kernel void kernel_mul_mat_f16_f32( device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); - sum[tpitg.x] = 0.0f; + uint ith = tpitg.x; + uint nth = tptg.x; - for (int i = tpitg.x; i < ne00; i += tptg.x) { - sum[tpitg.x] += (float) x[i] * (float) y[i]; + sum[ith] = 0.0f; + + for (int i = ith; i < ne00; i += nth) { + sum[ith] += (float) x[i] * (float) y[i]; } // accumulate the sum from all threads in the threadgroup threadgroup_barrier(mem_flags::mem_threadgroup); - for (uint i = tptg.x/2; i > 0; i /= 2) { - if (tpitg.x < i) { - sum[tpitg.x] += sum[tpitg.x + i]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); + if (ith%4 == 0) { + for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i]; } - - if (tpitg.x == 0) { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (ith%16 == 0) { + for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (ith == 0) { + for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0]; } + + // Original implementation. Left behind commented out for now + //threadgroup_barrier(mem_flags::mem_threadgroup); + //for (uint i = tptg.x/2; i > 0; i /= 2) { + // if (tpitg.x < i) { + // sum[tpitg.x] += sum[tpitg.x + i]; + // } + // threadgroup_barrier(mem_flags::mem_threadgroup); + //} + // + //if (tpitg.x == 0) { + // dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0]; + //} } kernel void kernel_alibi_f32( diff --git a/ggml.c b/ggml.c index 336e90b4e..b2427562e 100644 --- a/ggml.c +++ b/ggml.c @@ -301,6 +301,10 @@ typedef double ggml_float; #endif #endif +#ifdef __riscv_v_intrinsic +#include +#endif + #ifdef __F16C__ #ifdef _MSC_VER @@ -2678,6 +2682,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * } *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl); + vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -2804,6 +2843,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * } *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -3038,6 +3109,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * } *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + // These temp values are for masking and shift operations + uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000}; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // temporary registers + vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl); + vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl); + vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl); + vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl); + + // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl); + vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl); + vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl); + + // ((qh & (1u << (j + 16))) >> (j + 12)); + vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl); + vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl); + + // narrowing + vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl); + vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl); + + vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl); + vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl); + + // load + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl); + vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl); + + vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl); + vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -3294,6 +3435,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * } *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + // These temp values are for shift operations + uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // temporary registers + vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl); + vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl); + + // load qh + vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl); + + // ((qh >> (j + 0)) << 4) & 0x10; + vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl); + vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl); + vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl); + + // ((qh >> (j + 12)) ) & 0x10; + vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl); + vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl); + + // narrowing + vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl); + vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl); + + vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl); + vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl); + + // load + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl); + vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl); + + vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -3405,6 +3612,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * } *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + size_t vl = __riscv_vsetvl_e8m1(qk); + + for (int i = 0; i < nb; i++) { + // load elements + vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); + vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); + + vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); + + vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); + + sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); + } + + *s = sumf; #else // scalar float sumf = 0.0; diff --git a/grammars/c.gbnf b/grammars/c.gbnf new file mode 100644 index 000000000..4a0331dd2 --- /dev/null +++ b/grammars/c.gbnf @@ -0,0 +1,42 @@ +root ::= (declaration)* + +declaration ::= dataType identifier "(" parameter? ")" "{" statement* "}" + +dataType ::= "int" ws | "float" ws | "char" ws +identifier ::= [a-zA-Z_] [a-zA-Z_0-9]* + +parameter ::= dataType identifier + +statement ::= + ( dataType identifier ws "=" ws expression ";" ) | + ( identifier ws "=" ws expression ";" ) | + ( identifier ws "(" argList? ")" ";" ) | + ( "return" ws expression ";" ) | + ( "while" "(" condition ")" "{" statement* "}" ) | + ( "for" "(" forInit ";" ws condition ";" ws forUpdate ")" "{" statement* "}" ) | + ( "if" "(" condition ")" "{" statement* "}" ("else" "{" statement* "}")? ) | + ( singleLineComment ) | + ( multiLineComment ) + +forInit ::= dataType identifier ws "=" ws expression | identifier ws "=" ws expression +forUpdate ::= identifier ws "=" ws expression + +condition ::= expression relationOperator expression +relationOperator ::= ("<=" | "<" | "==" | "!=" | ">=" | ">") + +expression ::= term (("+" | "-") term)* +term ::= factor(("*" | "/") factor)* + +factor ::= identifier | number | unaryTerm | funcCall | parenExpression +unaryTerm ::= "-" factor +funcCall ::= identifier "(" argList? ")" +parenExpression ::= "(" ws expression ws ")" + +argList ::= expression ("," ws expression)* + +number ::= [0-9]+ + +singleLineComment ::= "//" [^\n]* "\n" +multiLineComment ::= "/*" ( [^*] | ("*" [^/]) )* "*/" + +ws ::= ([ \t\n]+) diff --git a/k_quants.c b/k_quants.c index 3a9b1dafd..3deeaedf7 100644 --- a/k_quants.c +++ b/k_quants.c @@ -183,13 +183,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t int ntry, float alpha) { float min = x[0]; float max = x[0]; - float sum_x = 0; - float sum_x2 = 0; for (int i = 1; i < n; ++i) { if (x[i] < min) min = x[i]; if (x[i] > max) max = x[i]; - sum_x += x[i]; - sum_x2 += x[i]*x[i]; } if (max == min) { for (int i = 0; i < n; ++i) L[i] = 0; @@ -2060,7 +2056,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri __m256 acc = _mm256_setzero_ps(); - uint32_t *aux; + const uint32_t *aux; for (int i = 0; i < nb; ++i) { @@ -2070,7 +2066,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const int8_t * restrict q8 = y[i].qs; // Set up scales - aux = (uint32_t *)x[i].scales; + aux = (const uint32_t *)x[i].scales; __m128i scales128 = _mm_set_epi32( ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), diff --git a/koboldcpp.py b/koboldcpp.py index 73275b1e0..25de22872 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -223,13 +223,24 @@ def load_model(model_filename): else: inputs.tensor_split[n] = 0 + # we must force an explicit tensor split + # otherwise the default will divide equally and multigpu crap will slow it down badly inputs.cublas_info = 0 - if (args.usecublas and "0" in args.usecublas): - os.environ["CUDA_VISIBLE_DEVICES"] = "0" - elif (args.usecublas and "1" in args.usecublas): - os.environ["CUDA_VISIBLE_DEVICES"] = "1" - elif (args.usecublas and "2" in args.usecublas): - os.environ["CUDA_VISIBLE_DEVICES"] = "2" + + if not args.tensor_split: + if (args.usecublas and "0" in args.usecublas): + os.environ["CUDA_VISIBLE_DEVICES"] = "0" + elif (args.usecublas and "1" in args.usecublas): + os.environ["CUDA_VISIBLE_DEVICES"] = "1" + elif (args.usecublas and "2" in args.usecublas): + os.environ["CUDA_VISIBLE_DEVICES"] = "2" + else: + if (args.usecublas and "0" in args.usecublas): + inputs.cublas_info = 0 + elif (args.usecublas and "1" in args.usecublas): + inputs.cublas_info = 1 + elif (args.usecublas and "2" in args.usecublas): + inputs.cublas_info = 2 inputs.executable_path = (getdirpath()+"/").encode("UTF-8") inputs.debugmode = args.debugmode diff --git a/llama.cpp b/llama.cpp index 74ce6d8ad..af0cf7d60 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3610,7 +3610,7 @@ static void llama_grammar_advance_stack( std::vector> & new_stacks) { if (stack.empty()) { - new_stacks.push_back(stack); + new_stacks.emplace_back(stack); return; } @@ -3647,7 +3647,7 @@ static void llama_grammar_advance_stack( } case LLAMA_GRETYPE_CHAR: case LLAMA_GRETYPE_CHAR_NOT: - new_stacks.push_back(stack); + new_stacks.emplace_back(stack); break; default: // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range @@ -4406,7 +4406,7 @@ struct llama_logit_info { } return min_heap; } - float probability_from_logit(float logit) { + float probability_from_logit(float logit) const { return normalizer * std::exp(logit - max_l); } }; @@ -4696,6 +4696,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s llm_load_arch(*ml, model); llm_load_hparams(*ml, model, 0, 0, 0); + if (params->only_copy) { + ftype = model.ftype; + } + const size_t align = GGUF_DEFAULT_ALIGNMENT; struct gguf_context * ctx_out = gguf_init_empty(); @@ -4782,18 +4786,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // quantize only 2D tensors quantize &= (tensor->n_dims == 2); quantize &= params->quantize_output_tensor || name != "output.weight"; - quantize &= quantized_type != tensor->type; + quantize &= !params->only_copy; enum ggml_type new_type; void * new_data; size_t new_size; - if (!quantize) { - new_type = tensor->type; - new_data = tensor->data; - new_size = ggml_nbytes(tensor); - LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0); - } else { + if (quantize) { new_type = quantized_type; #ifdef GGML_USE_K_QUANTS // TODO: avoid hardcoded tensor names - use the TN_* constants @@ -4892,7 +4891,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } #endif - + // If we've decided to quantize to the same type the tensor is already + // in then there's nothing to do. + quantize = tensor->type != new_type; + } + if (!quantize) { + new_type = tensor->type; + new_data = tensor->data; + new_size = ggml_nbytes(tensor); + LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0); + } else { const size_t nelements = ggml_nelements(tensor); float * f32_data; @@ -5323,6 +5331,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, /*.allow_requantize =*/ false, /*.quantize_output_tensor =*/ true, + /*.only_copy =*/ false, }; return result; diff --git a/llama.h b/llama.h index 6e5e1df63..422f28527 100644 --- a/llama.h +++ b/llama.h @@ -164,6 +164,7 @@ extern "C" { enum llama_ftype ftype; // quantize to this llama_ftype bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight + bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored } llama_model_quantize_params; // grammar types