Merge branch 'ggerganov:master' into t5-clean-2

fairydreaming 2024-06-25 17:37:59 +02:00 committed by GitHub
commit 21d36842e8
36 changed files with 1681 additions and 940 deletions

View file

@ -30,8 +30,10 @@ RUN make -j$(nproc) llama-server
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/llama-server /llama-server
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

View file

@ -20,10 +20,12 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
apt-get install -y libcurl4-openssl-dev curl
COPY --from=build /app/build/bin/llama-server /llama-server
ENV LC_ALL=C.utf8
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

View file

@ -43,8 +43,10 @@ ENV CXX=/opt/rocm/llvm/bin/clang++
# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
apt-get install -y libcurl4-openssl-dev curl
RUN make -j$(nproc) llama-server
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

View file

@ -5,15 +5,11 @@ FROM ubuntu:$UBUNTU_VERSION as build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK
# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk
# Install cURL
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
# Build it
WORKDIR /app
@ -28,4 +24,6 @@ RUN cp /app/build/bin/llama-server /llama-server && \
ENV LC_ALL=C.utf8
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

View file

@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential git libcurl4-openssl-dev
apt-get install -y build-essential git libcurl4-openssl-dev curl
WORKDIR /app
@ -22,4 +22,6 @@ COPY --from=build /app/llama-server /llama-server
ENV LC_ALL=C.utf8
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

View file

@ -102,7 +102,8 @@ option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM"
option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
option(LLAMA_CUDA_FORCE_MMQ "llama: always use mmq kernels instead of cuBLAS" OFF)
option(LLAMA_CUDA_FORCE_CUBLAS "llama: always use cuBLAS instead of mmq kernels" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
@ -416,13 +417,14 @@ if (LLAMA_CUDA)
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
# 60 == FP16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
# 70 == FP16 tensor cores
# 75 == int8 tensor cores
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
endif()
endif()
@ -447,6 +449,9 @@ if (LLAMA_CUDA)
if (LLAMA_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()
if (LLAMA_CUDA_FORCE_CUBLAS)
add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
endif()
if (LLAMA_CUDA_NO_VMM)
add_compile_definitions(GGML_CUDA_NO_VMM)
endif()

View file

@ -537,6 +537,9 @@ endif # LLAMA_CUDA_FORCE_DMMV
ifdef LLAMA_CUDA_FORCE_MMQ
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
endif # LLAMA_CUDA_FORCE_MMQ
ifdef LLAMA_CUDA_FORCE_CUBLAS
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS
endif # LLAMA_CUDA_FORCE_CUBLAS
ifdef LLAMA_CUDA_DMMV_X
MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
else

View file

@ -511,7 +511,8 @@ Building the program with BLAS support may lead to some performance improvements
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. | |
| LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). Speed for large batch sizes will be worse but VRAM consumption will be lower. |
| LLAMA_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
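To make the precedence of the two force flags concrete, here is a hypothetical sketch (not the actual ggml source; the real decision lives in `ggml_cuda_should_use_mmq()` and additionally gates on the quantization type, `__dp4a` availability, and batch size):

```cpp
// Editorial sketch of the force-flag precedence described in the table above.
// GGML_CUDA_FORCE_CUBLAS wins outright, GGML_CUDA_FORCE_MMQ wins next,
// otherwise a runtime heuristic (int8 tensor cores, batch size) decides.
static bool use_custom_mmq(bool int8_tensor_cores_available, bool heuristic_prefers_mmq) {
#if defined(GGML_CUDA_FORCE_CUBLAS)
    return false;                 // always dequantize and use FP16 cuBLAS
#elif defined(GGML_CUDA_FORCE_MMQ)
    return true;                  // always use the custom quantized kernels
#else
    return int8_tensor_cores_available || heuristic_prefers_mmq;
#endif
}
```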

View file

@ -1263,11 +1263,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
// cvector params
if (arg == "--completions-file") {
CHECK_ARG
params.cvector_completions_file = argv[i];
return true;
}
if (arg == "--positive-file") {
CHECK_ARG
params.cvector_positive_file = argv[i];
@ -1278,11 +1273,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.cvector_negative_file = argv[i];
return true;
}
if (arg == "--completions") {
CHECK_ARG
params.n_completions = std::stoi(argv[i]);
return true;
}
if (arg == "--pca-batch") {
CHECK_ARG
params.n_pca_batch = std::stoi(argv[i]);
@ -1293,6 +1283,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.n_pca_iterations = std::stoi(argv[i]);
return true;
}
if (arg == "--method") {
CHECK_ARG
std::string value(argv[i]);
/**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
else { invalid_param = true; }
return true;
}
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
if (log_param_single_parse(argv[i])) {
@ -1444,7 +1442,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
"negative prompt file to use for guidance" });
options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
"set custom jinja chat template (default: template taken from model's metadata)\n"
"only commonly used templates are accepted:\n"
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
options.push_back({ "grammar" });
options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
@ -1538,9 +1539,11 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
options.push_back({ "*", " --control-vector FNAME", "add a control vector" });
options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
"note: this argument can be repeated to add multiple control vectors" });
options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
"add a control vector with user defined scaling SCALE" });
"add a control vector with user defined scaling SCALE\n"
"note: this argument can be repeated to add multiple scaled control vectors" });
options.push_back({ "*", " --control-vector-layer-range START END",
"layer range to apply the control vector(s) to, start and end inclusive" });
options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
@ -1621,11 +1624,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
options.push_back({ "cvector", " --completions-file FNAME",
"completions file (default: '%s')", params.cvector_completions_file.c_str() });
options.push_back({ "cvector", " --completions N", "number of lines of completions file to use (default: %d)", params.n_completions });
options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
printf("usage: %s [options]\n", argv[0]);
@ -2602,12 +2603,67 @@ bool llama_should_add_bos_token(const llama_model * model) {
return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
}
//
// Chat template utils
//
bool llama_chat_verify_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
return res >= 0;
}
std::string llama_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
const std::vector<llama_chat_msg> & msgs,
bool add_ass) {
int alloc_size = 0;
std::vector<llama_chat_message> chat;
for (auto & msg : msgs) {
chat.push_back({msg.role.c_str(), msg.content.c_str()});
alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
}
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
std::vector<char> buf(alloc_size);
// run the first time to get the total output length
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
// if it turns out that our buffer is too small, we resize it
if ((size_t) res > buf.size()) {
buf.resize(res);
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
}
std::string formatted_chat(buf.data(), res);
return formatted_chat;
}
std::string llama_chat_format_single(const struct llama_model * model,
const std::string & tmpl,
const std::vector<llama_chat_msg> & past_msg,
const llama_chat_msg & new_msg,
bool add_ass) {
auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
std::vector<llama_chat_msg> chat_new(past_msg);
chat_new.push_back(new_msg);
auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
return formatted;
}
std::string llama_chat_format_example(const struct llama_model * model,
const std::string & tmpl) {
std::vector<llama_chat_msg> msgs = {
{"system", "You are a helpful assistant"},
{"user", "Hello"},
{"assistant", "Hi there"},
{"user", "How are you?"},
};
return llama_chat_apply_template(model, tmpl, msgs, true);
}
//
// KV cache utils
//

View file

@ -52,6 +52,12 @@ int32_t cpu_get_num_math();
// CLI argument parsing
//
// dimensionality reduction methods, used by cvector-generator
enum dimre_method {
DIMRE_METHOD_PCA,
DIMRE_METHOD_MEAN,
};
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
@ -238,11 +244,10 @@ struct gpt_params {
bool compute_ppl = true; // whether to compute perplexity
// cvector-generator params
int n_completions = 64;
int n_pca_batch = 20;
int n_pca_batch = 100;
int n_pca_iterations = 1000;
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
std::string cvector_outfile = "control_vector.gguf";
std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
};
@ -365,9 +370,32 @@ bool llama_should_add_bos_token(const llama_model * model);
// Chat template utils
//
// same as llama_chat_message, but uses std::string
struct llama_chat_msg {
std::string role;
std::string content;
};
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool llama_chat_verify_template(const std::string & tmpl);
// CPP wrapper for llama_chat_apply_template
std::string llama_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
const std::vector<llama_chat_msg> & chat,
bool add_ass);
// Format single message, while taking into account the position of that message in chat history
std::string llama_chat_format_single(const struct llama_model * model,
const std::string & tmpl,
const std::vector<llama_chat_msg> & past_msg,
const llama_chat_msg & new_msg,
bool add_ass);
// Returns an example of formatted chat
std::string llama_chat_format_example(const struct llama_model * model,
const std::string & tmpl);
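As an illustration (editorial, not part of the original header), a minimal caller-side sketch of these helpers, assuming `common.h` is included, `model` is a loaded model, and an empty template string selects the model's built-in template:

```cpp
// Editorial sketch: format one user turn against the accumulated history,
// then record it in the history.
std::string format_user_turn(llama_model * model,
                             std::vector<llama_chat_msg> & history,
                             const std::string & text) {
    llama_chat_msg msg{"user", text};
    // llama_chat_format_single returns only the delta for this turn, so the
    // caller can tokenize and append it without re-evaluating the history.
    std::string delta = llama_chat_format_single(model, "", history, msg, /*add_ass=*/true);
    history.push_back(msg);
    return delta;
}
```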
//
// KV cache utils
//

View file

@ -65,7 +65,8 @@ class Model:
# subclasses should define this!
model_arch: gguf.MODEL_ARCH
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool,
model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
if type(self) is Model:
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
self.dir_model = dir_model
@ -96,7 +97,8 @@ class Model:
ftype_lw: str = ftype_up.lower()
# allow templating the file name with the output ftype, useful with the "auto" ftype
self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
@classmethod
def __init_subclass__(cls):
@ -332,6 +334,8 @@ class Model:
self.gguf_writer.close()
def write_vocab(self):
if len(self.gguf_writer.tensors) != 1:
raise ValueError('Splitting the vocabulary is not supported')
self.gguf_writer.write_header_to_file(self.fname_out)
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.close()
@ -2974,10 +2978,44 @@ def parse_args() -> argparse.Namespace:
"--verbose", action="store_true",
help="increase output verbosity",
)
parser.add_argument(
"--split-max-tensors", type=int, default=0,
help="max tensors in each split",
)
parser.add_argument(
"--split-max-size", type=str, default="0",
help="max size per split N(M|G)",
)
parser.add_argument(
"--dry-run", action="store_true",
help="only print out a split plan and exit, without writing any new files",
)
parser.add_argument(
"--no-tensor-first-split", action="store_true",
help="do not add tensors to the first split (disabled by default)"
)
return parser.parse_args()
def split_str_to_n_bytes(split_str: str) -> int:
if split_str.endswith("K"):
n = int(split_str[:-1]) * 1000
elif split_str.endswith("M"):
n = int(split_str[:-1]) * 1000 * 1000
elif split_str.endswith("G"):
n = int(split_str[:-1]) * 1000 * 1000 * 1000
elif split_str.isnumeric():
n = int(split_str)
else:
raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
if n < 0:
raise ValueError(f"Invalid split size: {split_str}, must be positive")
return n
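# NOTE (editorial): the suffix multipliers above are decimal (powers of 1000),
# not binary (1024); e.g. "2G" -> 2 * 1000**3 = 2_000_000_000 bytes (~1.86 GiB).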
def main() -> None:
args = parse_args()
@ -3010,6 +3048,10 @@ def main() -> None:
"auto": gguf.LlamaFileType.GUESSED,
}
if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"):
logger.error("Error: Cannot use temp file when splitting")
sys.exit(1)
if args.outfile is not None:
fname_out = args.outfile
else:
@ -3027,7 +3069,10 @@ def main() -> None:
logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1)
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file,
args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors,
split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
small_first_shard=args.no_tensor_first_split)
logger.info("Set model parameters")
model_instance.set_gguf_parameters()
@ -3038,13 +3083,13 @@ def main() -> None:
model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
if args.vocab_only:
logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
logger.info("Exporting model vocab...")
model_instance.write_vocab()
logger.info("Model vocab successfully exported.")
else:
logger.info(f"Exporting model to '{model_instance.fname_out}'")
logger.info("Exporting model...")
model_instance.write()
logger.info(f"Model successfully exported to '{model_instance.fname_out}'")
logger.info("Model successfully exported.")
if __name__ == '__main__':

View file

@ -11,13 +11,16 @@ Related PRs:
```sh
# CPU only
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf
./cvector-generator -m ./llama-3.Q4_K_M.gguf
# With GPU
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99
# With advanced options
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100
./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100
# Using mean value instead of PCA
./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean
# To see help message
./cvector-generator -h
@ -32,3 +35,11 @@ If you have multiple lines per prompt, you can escape the newline character (cha
<|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
<|im_start|>system\nYou are in a very good mood today<|im_end|>
```
Example of using the output file with `llama-cli`:
(Tip: the control vector works better when applied to layers higher than 10.)
```sh
./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31
```
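For intuition (an editorial note, not from the original README): applying a control vector with scale $s$ amounts to shifting each selected layer's hidden state along the learned direction $\hat v_\ell$,

$$h_\ell \leftarrow h_\ell + s\,\hat v_\ell, \qquad \ell \in [\text{START},\,\text{END}]$$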

View file

@ -2,6 +2,7 @@
#include "llama.h"
#include "ggml.h"
#include "pca.hpp"
#include "mean.hpp"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
@ -38,9 +39,10 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
printf("\nexample usage:\n");
printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100\n", argv[0]);
printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
printf("\n");
}
@ -223,17 +225,20 @@ struct train_context {
// build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
// TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
void build_v_diff() {
void build_v_diff(bool transpose) {
printf("build_v_diff\n");
for (int il = 0; il < n_layers - 1; il++) {
auto & diff_tmp = v_diff_tmp[il];
int n_elem = diff_tmp.size() / sizeof(float);
GGML_ASSERT(n_elem % n_embd == 0);
int n_rows = n_elem / n_embd;
struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
struct ggml_tensor * diff = transpose
? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
: ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
// copy data & transpose
diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
if (transpose) {
// copy data & transpose
float * arr = (float *) diff_tmp.data();
for (int ir = 0; ir < n_rows; ++ir) {
for (int ic = 0; ic < n_embd; ++ic) {
@ -241,6 +246,10 @@ struct train_context {
ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
}
}
} else {
// only copy
memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
}
v_diff.push_back(diff);
print_debug_tensor(diff);
// free memory of diff_tmp
@ -263,8 +272,8 @@ struct tokenized_prompt {
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
padding_seq(ctx, tokens_pos, max_seq_len);
padding_seq(ctx, tokens_neg, max_seq_len);
@ -373,20 +382,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
fprintf(stderr, "must provide at least one prompt pair\n");
return 1;
}
// create templated prompts
std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
auto format_template = [](std::string persona, std::string suffix) {
// entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST] "
return persona + suffix;
};
for (size_t i = 0; i < positive_prompts.size(); ++i) {
for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
// TODO replicate the truncations done by the python implementation
ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
}
}
ctx_train.positive_entries = positive_prompts;
ctx_train.negative_entries = negative_prompts;
return 0;
}
@ -480,15 +477,22 @@ int main(int argc, char ** argv) {
llama_free(ctx);
llama_free_model(model);
// prepare ctx_train for PCA
ctx_train.build_v_diff();
bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
// prepare ctx_train for PCA
ctx_train.build_v_diff(use_pca);
if (use_pca) {
// run PCA
PCA::pca_params pca_params;
pca_params.n_threads = params.n_threads;
pca_params.n_batch = params.n_pca_batch;
pca_params.n_iterations = params.n_pca_iterations;
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
} else {
// run mean
mean::run(ctx_train.v_diff, ctx_train.v_final);
}
// write output vectors to gguf
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);

View file

@ -0,0 +1,48 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"
#include <string>
#include <vector>
#include <math.h>
namespace mean {
static void run(
const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_embd, n_samples]
const std::vector<struct ggml_tensor *> & v_output) {
printf("%s: Running mean...\n", __func__);
for (size_t il = 0; il < v_input.size(); ++il) {
// prepare output vector
struct ggml_tensor * ctrl_out = v_output[il];
ggml_format_name(ctrl_out, "direction.%ld", il+1);
// calculate mean vector
struct ggml_tensor * t_layer = v_input[il];
GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd
for (int ic = 0; ic < t_layer->ne[0]; ic++) {
float f = 0.0;
for (int ir = 0; ir < t_layer->ne[1]; ir++) {
f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0);
}
f /= t_layer->ne[1];
ggml_set_f32_1d(ctrl_out, ic, f);
}
// normalize output vector
float norm = 0.0;
for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
float f = ggml_get_f32_1d(ctrl_out, i);
norm += f*f;
}
norm = sqrt(norm);
for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
float f = ggml_get_f32_1d(ctrl_out, i);
ggml_set_f32_1d(ctrl_out, i, f / norm);
}
printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
}
}
}
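In math terms (an editorial summary of the loops above, where each `t_layer` holds $N =$ `ne[1]` rows of positive-minus-negative hidden-state differences of width `n_embd`), the mean method emits one normalized direction per layer:

$$v_\ell = \frac{1}{N}\sum_{i=1}^{N}\left(h^{+}_{\ell,i} - h^{-}_{\ell,i}\right), \qquad \hat v_\ell = \frac{v_\ell}{\lVert v_\ell\rVert_2}$$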

View file

@ -1 +1,4 @@
[INST] Act like a person who is extremely sad. [/INST]
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me
<|start_header_id|>system<|end_header_id|>\n\nYou are the saddest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow

View file

@ -290,7 +290,7 @@ static void power_iteration(
}
printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
__func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
__func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
}
// get output tensor
@ -298,6 +298,9 @@ static void power_iteration(
ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
//print_debug_tensor(output);
ggml_gallocr_free(allocr);
// TODO @ngxson : The output vector is randomly inverted
// Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
}
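For context on the TODO above (editorial note): power iteration updates

$$x_{k+1} = \frac{A\,x_k}{\lVert A\,x_k\rVert_2}$$

and, for a symmetric $A$ (here, per layer, effectively the covariance of the hidden-state differences), converges to the dominant eigenvector only up to sign: both $v_1$ and $-v_1$ are fixed points, which is why the output vector can come out randomly inverted.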
static void run_pca(

View file

@ -1 +1,4 @@
[INST] Act like a person who is extremely happy. [/INST]
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever!
<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you
<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now!

View file

@ -39,12 +39,12 @@ static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;
static bool file_exists(const std::string &path) {
static bool file_exists(const std::string & path) {
std::ifstream f(path.c_str());
return f.good();
}
static bool file_is_empty(const std::string &path) {
static bool file_is_empty(const std::string & path) {
std::ifstream f;
f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
@ -117,6 +117,14 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
LOG_TEE("%s", text);
}
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
llama_chat_msg new_msg{role, content};
auto formatted = llama_chat_format_single(
model, g_params->chat_template, chat_msgs, new_msg, role == "user");
chat_msgs.push_back({role, content});
return formatted;
}
int main(int argc, char ** argv) {
gpt_params params;
g_params = &params;
@ -190,6 +198,7 @@ int main(int argc, char ** argv) {
llama_model * model;
llama_context * ctx;
llama_context * ctx_guidance = NULL;
std::vector<llama_chat_msg> chat_msgs;
g_model = &model;
g_ctx = &ctx;
@ -215,6 +224,8 @@ int main(int argc, char ** argv) {
__func__, n_ctx_train, n_ctx);
}
LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
// print system information
{
LOG_TEE("\n");
@ -249,16 +260,21 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd_inp;
{
auto prompt = params.conversation
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
: params.prompt;
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
embd_inp = ::llama_tokenize(ctx, prompt, true, true);
} else {
LOG("use session tokens\n");
embd_inp = session_tokens;
}
LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
LOG("prompt: \"%s\"\n", log_tostr(prompt));
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
}
// Should not run without any tokens
if (embd_inp.empty()) {
@ -478,6 +494,7 @@ int main(int argc, char ** argv) {
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
std::ostringstream output_ss; g_output_ss = &output_ss;
std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode
// the first thing we will do is to output the prompt, so set color accordingly
console::set_display(console::prompt);
@ -793,11 +810,18 @@ int main(int argc, char ** argv) {
is_antiprompt = true;
}
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
is_interacting = true;
printf("\n");
}
}
// if current token is not EOG, we add it to current assistant message
if (params.conversation) {
auto id = llama_sampling_last(ctx_sampling);
assistant_ss << llama_token_to_piece(ctx, id, false);
}
if (n_past > 0 && is_interacting) {
LOG("waiting for user input\n");
@ -848,8 +872,12 @@ int main(int argc, char ** argv) {
string_process_escapes(buffer);
}
std::string user_inp = params.conversation
? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
: std::move(buffer);
// TODO: one inconvenience of the current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, params.conversation);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
@ -864,6 +892,9 @@ int main(int argc, char ** argv) {
output_ss << llama_token_to_piece(ctx, token);
}
// reset assistant message
assistant_ss.str("");
n_remain -= line_inp.size();
LOG("n_remain: %d\n", n_remain);
} else {

View file

@ -3,6 +3,13 @@
by Humans for All.
## quickstart
To run from the build dir
bin/llama-server -m path/model.gguf --path ../examples/server/public_simplechat
Continue reading for the details.
## overview
@ -14,6 +21,8 @@ own system prompts.
This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated,
or potentially as it is being generated, in a streamed manner from the server/ai-model.
![Chat and Settings screens](./simplechat_screens.webp "Chat and Settings screens")
Auto saves the chat session locally as and when the chat is progressing, and in turn at a later time when you
open SimpleChat, an option is provided to restore the old chat session, if a matching one exists.
@ -170,17 +179,23 @@ It is attached to the document object. Some of these can also be updated using t
The histogram/freq based trimming logic is currently tuned for the English language wrt its
is-it-an-alphabetic|numeral-char regex match logic.
chatRequestOptions - maintains the list of options/fields to send along with chat request,
apiRequestOptions - maintains the list of options/fields to send along with api request,
irrespective of whether /chat/completions or /completions endpoint.
If you want to add additional options/fields to send to the server/ai-model, and/or
modify the existing options' values or remove them, for now you can update this global var
using the browser's development-tools/console.
For string and numeric fields in chatRequestOptions, including even those added by a user
at runtime by directly modifying gMe.chatRequestOptions, setting ui entries will be auto
For string, numeric and boolean fields in apiRequestOptions, including even those added by a
user at runtime by directly modifying gMe.apiRequestOptions, setting ui entries will be auto
created.
cache_prompt option supported by examples/server is allowed to be controlled by the user, so that
any caching supported wrt system-prompt and chat history, if usable, can get used. When the chat
history sliding window is enabled, cache_prompt logic may or may not kick in at the backend
wrt the same, based on aspects related to the model, positional encoding, attention mechanism, et al.
However, the system prompt should ideally get the benefit of caching.
headers - maintains the list of http headers sent when request is made to the server. By default
Content-Type is set to application/json. Additionally Authorization entry is provided, which can
be set if needed using the settings ui.
@ -197,10 +212,10 @@ It is attached to the document object. Some of these can also be updated using t
>0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
implications of loading of the ai-model's context window by chat history, wrt chat response to
some extent in a simple crude way. You may also want to control the context size enabled when
the server loads ai-model, on the server end.
By using gMe's iRecentUserMsgCnt and apiRequestOptions.max_tokens/n_predict one can try to control
the implications of loading of the ai-model's context window by chat history, wrt chat response to
some extent in a simple crude way. You may also want to control the context size enabled when the
server loads ai-model, on the server end.
Sometimes the browser may be stubborn with caching of the file, so your updates to html/css/js
@ -237,12 +252,12 @@ also be started with a model context size of 1k or more, to be on safe side.
internal n_predict, for now add the same here on the client side, maybe later add max_tokens
to /completions endpoint handling code on server side.
NOTE: One may want to experiment with frequency/presence penalty fields in chatRequestOptions
wrt the set of fields sent to server along with the user query. To check how the model behaves
NOTE: One may want to experiment with frequency/presence penalty fields in apiRequestOptions
wrt the set of fields sent to server along with the user query, to check how the model behaves
wrt repetitions in general in the generated text response.
An end-user can change this behaviour by editing gMe from the browser's devel-tool/console or by
using the providing settings ui.
using the provided settings ui (for settings exposed through the ui).
### OpenAi / Equivalent API WebService
@ -253,7 +268,7 @@ for a minimal chatting experimentation by setting the below.
* the baseUrl in settings ui
* https://api.openai.com/v1 or similar
* Wrt request body - gMe.chatRequestOptions
* Wrt request body - gMe.apiRequestOptions
* model (settings ui)
* any additional fields if required in future

View file

@ -222,8 +222,8 @@ class SimpleChat {
* @param {Object} obj
*/
request_jsonstr_extend(obj) {
for(let k in gMe.chatRequestOptions) {
obj[k] = gMe.chatRequestOptions[k];
for(let k in gMe.apiRequestOptions) {
obj[k] = gMe.apiRequestOptions[k];
}
if (gMe.bStream) {
obj["stream"] = true;
@ -740,11 +740,12 @@ class Me {
"Authorization": "", // Authorization: Bearer OPENAI_API_KEY
}
// Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
this.chatRequestOptions = {
this.apiRequestOptions = {
"model": "gpt-3.5-turbo",
"temperature": 0.7,
"max_tokens": 1024,
"n_predict": 1024,
"cache_prompt": false,
//"frequency_penalty": 1.2,
//"presence_penalty": 1.2,
};
@ -800,51 +801,55 @@ class Me {
ui.el_create_append_p(`bStream:${this.bStream}`, elDiv);
ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv);
ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv);
ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
}
ui.el_create_append_p(`chatRequestOptions:${JSON.stringify(this.chatRequestOptions, null, " - ")}`, elDiv);
ui.el_create_append_p(`apiRequestOptions:${JSON.stringify(this.apiRequestOptions, null, " - ")}`, elDiv);
ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv);
}
/**
* Auto create ui input elements for fields in ChatRequestOptions
* Auto create ui input elements for fields in apiRequestOptions
* Currently supports text, number and boolean field types.
* @param {HTMLDivElement} elDiv
*/
show_settings_chatrequestoptions(elDiv) {
show_settings_apirequestoptions(elDiv) {
let typeDict = {
"string": "text",
"number": "number",
};
let fs = document.createElement("fieldset");
let legend = document.createElement("legend");
legend.innerText = "ChatRequestOptions";
legend.innerText = "ApiRequestOptions";
fs.appendChild(legend);
elDiv.appendChild(fs);
for(const k in this.chatRequestOptions) {
let val = this.chatRequestOptions[k];
for(const k in this.apiRequestOptions) {
let val = this.apiRequestOptions[k];
let type = typeof(val);
if (!((type == "string") || (type == "number"))) {
continue;
}
let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.chatRequestOptions[k], (val)=>{
if (((type == "string") || (type == "number"))) {
let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.apiRequestOptions[k], (val)=>{
if (type == "number") {
val = Number(val);
}
this.chatRequestOptions[k] = val;
this.apiRequestOptions[k] = val;
});
fs.appendChild(inp.div);
} else if (type == "boolean") {
let bbtn = ui.el_creatediv_boolbutton(`Set${k}`, k, {true: "true", false: "false"}, val, (userVal)=>{
this.apiRequestOptions[k] = userVal;
});
fs.appendChild(bbtn.div);
}
}
}
@ -870,6 +875,23 @@ class Me {
});
elDiv.appendChild(bb.div);
bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
this.bTrimGarbage = val;
});
elDiv.appendChild(bb.div);
this.show_settings_apirequestoptions(elDiv);
let sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
this.apiEP = ApiEP.Type[val];
});
elDiv.appendChild(sel.div);
sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
});
elDiv.appendChild(sel.div);
bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{
this.bCompletionFreshChatAlways = val;
});
@ -880,23 +902,6 @@ class Me {
});
elDiv.appendChild(bb.div);
bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
this.bTrimGarbage = val;
});
elDiv.appendChild(bb.div);
let sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
});
elDiv.appendChild(sel.div);
sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
this.apiEP = ApiEP.Type[val];
});
elDiv.appendChild(sel.div);
this.show_settings_chatrequestoptions(elDiv);
}
}

Binary file not shown (new image added, 21 KiB).

View file

@ -2606,16 +2606,8 @@ int main(int argc, char ** argv) {
// print sample chat example to make it clear which template is used
{
json chat;
chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}});
chat.push_back({{"role", "user"}, {"content", "Hello"}});
chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
chat.push_back({{"role", "user"}, {"content", "How are you?"}});
const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat);
LOG_INFO("chat template", {
{"chat_example", chat_example},
{"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
{"built_in", params.chat_template.empty()},
});
}

View file

@ -118,36 +118,17 @@ static inline void server_log(const char * level, const char * function, int lin
// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
size_t alloc_size = 0;
// vector holding all allocated string to be passed to llama_chat_apply_template
std::vector<std::string> str(messages.size() * 2);
std::vector<llama_chat_message> chat(messages.size());
std::vector<llama_chat_msg> chat;
for (size_t i = 0; i < messages.size(); ++i) {
const auto & curr_msg = messages[i];
str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
alloc_size += str[i*2 + 1].length();
chat[i].role = str[i*2 + 0].c_str();
chat[i].content = str[i*2 + 1].c_str();
std::string role = json_value(curr_msg, "role", std::string(""));
std::string content = json_value(curr_msg, "content", std::string(""));
chat.push_back({role, content});
}
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
std::vector<char> buf(alloc_size * 2);
// run the first time to get the total output length
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
// if it turns out that our buffer is too small, we resize it
if ((size_t) res > buf.size()) {
buf.resize(res);
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
}
const std::string formatted_chat(buf.data(), res);
auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
return formatted_chat;
}

View file

@ -152,16 +152,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;
#if defined(GGML_CUDA_FORCE_MMQ)
#ifdef GGML_CUDA_FORCE_MMQ
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
#else
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
#endif
#if defined(CUDA_USE_TENSOR_CORES)
GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
#endif // GGML_CUDA_FORCE_MMQ
#ifdef GGML_CUDA_FORCE_CUBLAS
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
#else
GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
#endif
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
#endif // GGML_CUDA_FORCE_CUBLAS
GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0;
@ -1873,9 +1873,17 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
int64_t min_compute_capability = INT_MAX;
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
bool use_mul_mat_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
bool any_gpus_with_slow_fp16 = false;
bool any_pascal_with_slow_fp16 = false;
if (split) {
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
auto & tensor_split = buft_ctx->tensor_split;
@ -1885,55 +1893,18 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
continue;
}
if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
min_compute_capability = ggml_cuda_info().devices[id].cc;
}
if (ggml_cuda_info().devices[id].cc == 610) {
any_pascal_with_slow_fp16 = true;
}
const int cc = ggml_cuda_info().devices[id].cc;
use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
}
} else {
min_compute_capability = ggml_cuda_info().devices[ctx.device].cc;
any_pascal_with_slow_fp16 = ggml_cuda_info().devices[ctx.device].cc == 610;
const int cc = ggml_cuda_info().devices[ctx.device].cc;
use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
}
// check data types and tensor shapes for custom matrix multiplication kernels:
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
#ifdef CUDA_USE_TENSOR_CORES
use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
#endif // CUDA_USE_TENSOR_CORES
#else
// fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
// mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
use_mul_mat_q = use_mul_mat_q && min_compute_capability >= MIN_CC_DP4A;
#ifdef CUDA_USE_TENSOR_CORES
// when tensor cores are available, use them for large batch size
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
#endif // CUDA_USE_TENSOR_CORES
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
// if mmvq is available it's a better choice than dmmv:
#ifndef GGML_CUDA_FORCE_DMMV
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
@ -1947,14 +1918,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// KQ single-batch
if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// FP32 precision KQ single-batch for batch size 1 without FlashAttention
ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
} else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch
} else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// FP32 precision KQV single-batch for batch size 1 without FlashAttention
ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// KQ + KQV multi-batch
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// KQ + KQV multi-batch without FlashAttention
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
} else if (use_dequantize_mul_mat_vec) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);

View file

@ -146,23 +146,6 @@
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
// - 7B quantum model: +100-200 MB
// - 13B quantum model: +200-400 MB
//
//#define GGML_CUDA_FORCE_MMQ
// TODO: improve this to be correct for more hardware
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
#if !defined(GGML_CUDA_FORCE_MMQ)
#define CUDA_USE_TENSOR_CORES
#endif
#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
#define MMQ_MAX_BATCH_SIZE 64 // max batch size to use MMQ kernels when tensor cores are available
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
#if defined(_MSC_VER)
@ -343,15 +326,15 @@ static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int
#define INT8_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
static bool fast_fp16_available(const int cc) {
static constexpr bool fast_fp16_available(const int cc) {
return cc >= CC_PASCAL && cc != 610;
}
static bool fp16_mma_available(const int cc) {
static constexpr bool fp16_mma_available(const int cc) {
return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
}
static bool int8_mma_available(const int cc) {
static constexpr bool int8_mma_available(const int cc) {
return cc < CC_OFFSET_AMD && cc >= CC_TURING;
}
@ -643,19 +626,6 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
static constexpr int qi = QI3_S;
};
static int get_mmq_x_max_host(const int cc) {
#ifdef CUDA_USE_TENSOR_CORES
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
#else
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
#endif // CUDA_USE_TENSOR_CORES
}
// Round rows to this value for --split-mode row:
static int get_mmq_y_host(const int cc) {
return cc >= CC_VOLTA ? 128 : 64;
}
//////////////////////
struct ggml_cuda_device_info {

View file

@ -20,6 +20,20 @@ struct mma_int_A_I16K4 {
GGML_CUDA_ASSUME(ret < K);
return ret;
}
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
#if defined(INT8_MMA_AVAILABLE)
const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
: "+r"(x[0]), "+r"(x[1])
: "l"(xs));
#else
#pragma unroll
for (int l = 0; l < ne; ++l) {
x[l] = xs0[get_i(l)*stride + get_k(l)];
}
#endif // defined(INT8_MMA_AVAILABLE)
}
};
struct mma_int_A_I16K8 {
@ -42,6 +56,20 @@ struct mma_int_A_I16K8 {
GGML_CUDA_ASSUME(ret < K);
return ret;
}
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
#if defined(INT8_MMA_AVAILABLE)
const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
asm("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
: "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
: "l"(xs));
#else
#pragma unroll
for (int l = 0; l < ne; ++l) {
x[l] = xs0[get_i(l)*stride + get_k(l)];
}
#endif // defined(INT8_MMA_AVAILABLE)
}
};
struct mma_int_B_J8K4 {
@ -64,6 +92,20 @@ struct mma_int_B_J8K4 {
GGML_CUDA_ASSUME(ret < K);
return ret;
}
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
#if defined(INT8_MMA_AVAILABLE) && false // Loading as 4 byte values is faster
const int * xs = xs0 + (threadIdx.x%J)*stride;
asm("ldmatrix.sync.aligned.m8n8.x1.b16 {%0}, [%1];"
: "+r"(x[0])
: "l"(xs));
#else
#pragma unroll
for (int l = 0; l < ne; ++l) {
x[l] = xs0[get_j(l)*stride + get_k(l)];
}
#endif // defined(INT8_MMA_AVAILABLE)
}
};
struct mma_int_B_J8K8 {
@ -86,6 +128,20 @@ struct mma_int_B_J8K8 {
GGML_CUDA_ASSUME(ret < K);
return ret;
}
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
#if defined(INT8_MMA_AVAILABLE) && false // ldmatrix path disabled: loading as 4 byte values (the #else path) is faster
const int * xs = xs0 + (threadIdx.x%J)*stride + ((threadIdx.x/J)*(K/2)) % K;
asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
: "+r"(x[0]), "+r"(x[1])
: "l"(xs));
#else
#pragma unroll
for (int l = 0; l < ne; ++l) {
x[l] = xs0[get_j(l)*stride + get_k(l)];
}
#endif // defined(INT8_MMA_AVAILABLE)
}
};
struct mma_int_C_I16J8 {


@ -69,7 +69,13 @@ void ggml_cuda_op_mul_mat_q(
GGML_UNUSED(src1_ddf_i);
}
bool ggml_cuda_supports_mmq(enum ggml_type type) {
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
#ifdef GGML_CUDA_FORCE_CUBLAS
return false;
#endif // GGML_CUDA_FORCE_CUBLAS
bool mmq_supported;
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
@ -81,8 +87,32 @@ bool ggml_cuda_supports_mmq(enum ggml_type type) {
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
return true;
mmq_supported = true;
break;
default:
mmq_supported = false;
break;
}
if (!mmq_supported) {
return false;
}
if (int8_mma_available(cc)) {
return true;
}
if (cc < MIN_CC_DP4A) {
return false;
}
#ifdef GGML_CUDA_FORCE_MMQ
return true;
#endif // GGML_CUDA_FORCE_MMQ
if (cc < CC_OFFSET_AMD) {
return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
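The renamed routine turns a pure type check into a policy decision, and the order of the early returns matters: compile-time overrides first, then hardware capability, then the batch-size heuristic. An editor's Python restatement of the flow; the constant values are assumptions, only the ordering mirrors the code above:

MIN_CC_DP4A, CC_VOLTA, CC_OFFSET_AMD, CC_RDNA3 = 610, 700, 1000000, 1001100  # assumed
MMQ_DP4A_MAX_BATCH_SIZE = 64  # assumed
FORCE_CUBLAS, FORCE_MMQ = False, False  # compile-time switches in the real code

def should_use_mmq(mmq_supported: bool, cc: int, ne11: int, int8_mma: bool) -> bool:
    if FORCE_CUBLAS:       # 1. hard override towards cuBLAS
        return False
    if not mmq_supported:  # 2. the quantization type needs an MMQ kernel at all
        return False
    if int8_mma:           # 3. int8 tensor cores available: MMQ always wins
        return True
    if cc < MIN_CC_DP4A:   # 4. no dp4a instruction: MMQ is not usable
        return False
    if FORCE_MMQ:          # 5. hard override towards MMQ
        return True
    if cc < CC_OFFSET_AMD: # 6. NVIDIA: MMQ for pre-Volta or small batches
        return cc < CC_VOLTA or ne11 < MMQ_DP4A_MAX_BATCH_SIZE
    return cc < CC_RDNA3 or ne11 < MMQ_DP4A_MAX_BATCH_SIZE  # 7. AMD equivalent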

File diff suppressed because it is too large.


@ -1,5 +1,7 @@
#include "common.cuh"
#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
void ggml_cuda_op_mul_mat_vec_q(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,


@ -4620,7 +4620,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch
ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// KQ + KQV multi-batch
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
} else if (use_dequantize_mul_mat_vec) {


@ -75,6 +75,11 @@ class Keys:
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"
class Split:
LLM_KV_SPLIT_NO = "split.no"
LLM_KV_SPLIT_COUNT = "split.count"
LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"
class SSM:
CONV_KERNEL = "{arch}.ssm.conv_kernel"
INNER_SIZE = "{arch}.ssm.inner_size"
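The three new keys let any shard describe its position in the series. A hedged Python sketch of reading them back; the ReaderField access pattern is the editor's assumption about the gguf-py API, not shown in this hunk:

from gguf import GGUFReader

reader = GGUFReader("model-00001-of-00003.gguf")  # hypothetical shard name
for key in ("split.no", "split.count", "split.tensors.count"):
    field = reader.fields.get(key)
    if field is not None:
        print(key, "=", field.parts[field.data[-1]][0])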


@ -69,6 +69,7 @@ class GGUFReader:
# I - same as host, S - swapped
byte_order: Literal['I'] | Literal['S'] = 'I'
alignment: int = GGUF_DEFAULT_ALIGNMENT
data_offset: int
# Note: Internal helper, API may change.
gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = {
@ -88,9 +89,13 @@ class GGUFReader:
def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'):
self.data = np.memmap(path, mode = mode)
offs = 0
# Check for GGUF magic
if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
raise ValueError('GGUF magic invalid')
offs += 4
# Check GGUF version
temp_version = self._get(offs, np.uint32)
if temp_version[0] & 65535 == 0:
# If we get 0 here that means it's (probably) a GGUF file created for
@ -103,12 +108,16 @@ class GGUFReader:
self.fields: OrderedDict[str, ReaderField] = OrderedDict()
self.tensors: list[ReaderTensor] = []
offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32]))
# Check tensor count and kv count
temp_counts = self._get(offs, np.uint64, 2)
offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64]))
offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64]))
tensor_count, kv_count = temp_counts
offs = self._build_fields(offs, kv_count)
offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
# Build Tensor Info Fields
offs, tensors_fields = self._build_tensor_info(offs, tensor_count)
new_align = self.fields.get('general.alignment')
if new_align is not None:
if new_align.types != [GGUFValueType.UINT32]:
@ -117,6 +126,7 @@ class GGUFReader:
padding = offs % self.alignment
if padding != 0:
offs += self.alignment - padding
self.data_offset = offs
self._build_tensors(offs, tensors_fields)
_DT = TypeVar('_DT', bound = npt.DTypeLike)
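With data_offset recorded during parsing, callers can locate the aligned tensor data region without re-walking the header. A minimal usage sketch (editor's illustration):

from gguf import GGUFReader

reader = GGUFReader("model.gguf")
print("alignment:  ", reader.alignment)
print("data offset:", reader.data_offset)  # first byte of the aligned tensor data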
@ -193,18 +203,29 @@ class GGUFReader:
# We can't deal with this one.
raise ValueError(f'Unknown/unhandled field type {gtype}')
def _get_tensor(self, orig_offs: int) -> ReaderField:
def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
offs = orig_offs
# Get Tensor Name
name_len, name_data = self._get_str(offs)
offs += int(name_len.nbytes + name_data.nbytes)
# Get Tensor Dimensions Count
n_dims = self._get(offs, np.uint32)
offs += int(n_dims.nbytes)
# Get Tensor Dimension Array
dims = self._get(offs, np.uint64, n_dims[0])
offs += int(dims.nbytes)
# Get Tensor Encoding Scheme Type
raw_dtype = self._get(offs, np.uint32)
offs += int(raw_dtype.nbytes)
# Get Tensor Offset
offset_tensor = self._get(offs, np.uint64)
offs += int(offset_tensor.nbytes)
return ReaderField(
orig_offs,
str(bytes(name_data), encoding = 'utf-8'),
@ -233,10 +254,10 @@ class GGUFReader:
offs += field_size
return offs
def _build_tensors_fields(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
def _build_tensor_info(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
tensor_fields = []
for _ in range(count):
field = self._get_tensor(offs)
field = self._get_tensor_info_field(offs)
offs += sum(int(part.nbytes) for part in field.parts)
tensor_fields.append(field)
return offs, tensor_fields


@ -7,6 +7,7 @@ import struct
import tempfile
from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path
from io import BufferedWriter
from typing import IO, Any, Sequence, Mapping
from string import ascii_letters, digits
@ -31,6 +32,9 @@ from .quants import quant_shape_from_byte_shape
logger = logging.getLogger(__name__)
SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"
@dataclass
class TensorInfo:
shape: Sequence[int]
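For reference, SHARD_NAME_FORMAT yields the multi-file naming familiar from other ecosystems (editor's example):

SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"
print(SHARD_NAME_FORMAT.format("model", 1, 3))  # -> model-00001-of-00003.gguf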
@ -55,11 +59,11 @@ class WriterState(Enum):
class GGUFWriter:
fout: BufferedWriter | None
path: os.PathLike[str] | str | None
fout: list[BufferedWriter] | None
path: Path | None
temp_file: tempfile.SpooledTemporaryFile[bytes] | None
tensors: dict[str, TensorInfo]
kv_data: dict[str, GGUFValue]
tensors: list[dict[str, TensorInfo]]
kv_data: list[dict[str, GGUFValue]]
state: WriterState
_simple_value_packing = {
GGUFValueType.UINT8: "B",
@ -76,26 +80,38 @@ class GGUFWriter:
}
def __init__(
self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False,
endianess: GGUFEndian = GGUFEndian.LITTLE,
self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE,
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False
):
self.fout = None
self.path = path
self.path = Path(path) if path else None
self.arch = arch
self.endianess = endianess
self.data_alignment = GGUF_DEFAULT_ALIGNMENT
self.use_temp_file = use_temp_file
self.temp_file = None
self.tensors = dict()
self.kv_data = dict()
self.tensors = [{}]
self.kv_data = [{}]
self.split_max_tensors = split_max_tensors
self.split_max_size = split_max_size
self.dry_run = dry_run
self.small_first_shard = small_first_shard
logger.info("gguf: This GGUF file is for {0} Endian only".format(
"Big" if self.endianess == GGUFEndian.BIG else "Little",
))
self.state = WriterState.NO_FILE
if self.small_first_shard:
self.tensors.append({})
self.add_architecture()
def open_output_file(self, path: os.PathLike[str] | str | None = None) -> None:
def format_shard_names(self, path: Path) -> list[Path]:
if len(self.tensors) == 1:
return [path]
return [path.with_name(SHARD_NAME_FORMAT.format(path.stem, i + 1, len(self.tensors))) for i in range(len(self.tensors))]
def open_output_file(self, path: Path | None = None) -> None:
if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path):
# allow calling this multiple times as long as the path is the same
return
@ -106,22 +122,58 @@ class GGUFWriter:
self.path = path
if self.path is not None:
if self.fout is not None:
self.fout.close()
self.fout = open(self.path, "wb")
filenames = self.print_plan()
self.fout = [open(filename, "wb") for filename in filenames]
self.state = WriterState.EMPTY
def write_header_to_file(self, path: os.PathLike[str] | str | None = None) -> None:
def print_plan(self) -> list[Path]:
logger.info("Writing the following files:")
assert self.path is not None
filenames = self.format_shard_names(self.path)
assert len(filenames) == len(self.tensors)
for name, tensors in zip(filenames, self.tensors):
logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}")
if self.dry_run:
logger.info("Dry run, not writing files")
exit()
return filenames
def add_shard_kv_data(self) -> None:
if len(self.tensors) == 1:
return
total_tensors = sum(len(t) for t in self.tensors)
assert self.fout is not None
total_splits = len(self.fout)
self.kv_data.extend({} for _ in range(len(self.kv_data), total_splits))
for i, kv_data in enumerate(self.kv_data):
kv_data[Keys.Split.LLM_KV_SPLIT_NO] = GGUFValue(i, GGUFValueType.UINT16)
kv_data[Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(total_splits, GGUFValueType.UINT16)
kv_data[Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(total_tensors, GGUFValueType.INT32)
def write_header_to_file(self, path: Path | None = None) -> None:
if len(self.tensors) == 1 and (self.split_max_tensors != 0 or self.split_max_size != 0):
logger.warning("Model fails split requirements, not splitting")
self.open_output_file(path)
if self.state is not WriterState.EMPTY:
raise ValueError(f'Expected output file to be empty, got {self.state}')
self._write_packed("<I", GGUF_MAGIC, skip_pack_prefix = True)
self._write_packed("I", GGUF_VERSION)
self._write_packed("Q", len(self.tensors))
self._write_packed("Q", len(self.kv_data))
self.flush()
assert self.fout is not None
assert len(self.fout) == len(self.tensors)
assert len(self.kv_data) == 1
self.add_shard_kv_data()
for fout, tensors, kv_data in zip(self.fout, self.tensors, self.kv_data):
fout.write(self._pack("<I", GGUF_MAGIC, skip_pack_prefix = True))
fout.write(self._pack("I", GGUF_VERSION))
fout.write(self._pack("Q", len(tensors)))
fout.write(self._pack("Q", len(kv_data)))
fout.flush()
self.state = WriterState.HEADER
def write_kv_data_to_file(self) -> None:
@ -129,13 +181,15 @@ class GGUFWriter:
raise ValueError(f'Expected output file to contain the header, got {self.state}')
assert self.fout is not None
kv_data = bytearray()
for fout, kv_data in zip(self.fout, self.kv_data):
kv_bytes = bytearray()
for key, val in self.kv_data.items():
kv_data += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
kv_data += self._pack_val(val.value, val.type, add_vtype=True)
for key, val in kv_data.items():
kv_bytes += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
kv_bytes += self._pack_val(val.value, val.type, add_vtype=True)
fout.write(kv_bytes)
self.fout.write(kv_data)
self.flush()
self.state = WriterState.KV_DATA
@ -144,28 +198,29 @@ class GGUFWriter:
raise ValueError(f'Expected output file to contain KV data, got {self.state}')
assert self.fout is not None
for fout, tensors in zip(self.fout, self.tensors):
ti_data = bytearray()
offset_tensor = 0
for name, ti in self.tensors.items():
for name, ti in tensors.items():
ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False)
n_dims = len(ti.shape)
ti_data += self._pack("I", n_dims)
for i in range(n_dims):
ti_data += self._pack("Q", ti.shape[n_dims - 1 - i])
for j in range(n_dims):
ti_data += self._pack("Q", ti.shape[n_dims - 1 - j])
ti_data += self._pack("I", ti.dtype)
ti_data += self._pack("Q", offset_tensor)
offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment)
self.fout.write(ti_data)
self.flush()
fout.write(ti_data)
fout.flush()
self.state = WriterState.TI_DATA
def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None:
if key in self.kv_data:
if any(key in kv_data for kv_data in self.kv_data):
raise ValueError(f'Duplicated key name {key!r}')
self.kv_data[key] = GGUFValue(value=val, type=vtype)
self.kv_data[0][key] = GGUFValue(value=val, type=vtype)
def add_uint8(self, key: str, val: int) -> None:
self.add_key_value(key,val, GGUFValueType.UINT8)
@ -206,9 +261,6 @@ class GGUFWriter:
self.add_key_value(key, val, GGUFValueType.STRING)
def add_array(self, key: str, val: Sequence[Any]) -> None:
if not isinstance(val, Sequence):
raise ValueError("Value must be a sequence for array type")
self.add_key_value(key, val, GGUFValueType.ARRAY)
@staticmethod
@ -222,7 +274,7 @@ class GGUFWriter:
if self.state is not WriterState.NO_FILE:
raise ValueError(f'Expected output file to be not yet opened, got {self.state}')
if name in self.tensors:
if any(name in tensors for tensors in self.tensors):
raise ValueError(f'Duplicated tensor name {name!r}')
if raw_dtype is None:
@ -247,7 +299,18 @@ class GGUFWriter:
if tensor_dtype == np.uint8:
tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
self.tensors[name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes)
# make sure there is at least one tensor before splitting
if len(self.tensors[-1]) > 0:
if ( # split when over tensor limit
self.split_max_tensors != 0
and len(self.tensors[-1]) >= self.split_max_tensors
) or ( # split when over size limit
self.split_max_size != 0
and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size
):
self.tensors.append({})
self.tensors[-1][name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes)
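Putting the new constructor options and the splitting logic together, a hedged usage sketch; the call order mirrors the methods in this file, while the file name, KV pair and tensor are illustrative:

import numpy as np
from gguf import GGUFWriter

writer = GGUFWriter("model.gguf", arch="llama",
                    split_max_tensors=128,    # start a new shard every 128 tensors
                    split_max_size=0,         # alternatively: a byte limit per shard
                    small_first_shard=False,  # True keeps shard 1 metadata-only
                    dry_run=False)            # True prints the plan and exits
writer.add_uint32("example.count", 1)  # illustrative KV pair
writer.add_tensor("token_embd.weight", np.zeros((8, 4), dtype=np.float32))
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file(progress=True)
writer.close()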
def add_tensor(
self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
@ -264,7 +327,7 @@ class GGUFWriter:
self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype)
if self.temp_file is None:
self.tensors[name].tensor = tensor
self.tensors[-1][name].tensor = tensor
return
tensor.tofile(self.temp_file)
@ -282,9 +345,24 @@ class GGUFWriter:
if self.endianess == GGUFEndian.BIG:
tensor.byteswap(inplace=True)
self.write_padding(self.fout, self.fout.tell())
tensor.tofile(self.fout)
self.write_padding(self.fout, tensor.nbytes)
file_id = -1
for i, tensors in enumerate(self.tensors):
if len(tensors) > 0:
file_id = i
break
fout = self.fout[file_id]
# pop the first tensor info
first_tensor_name = next(iter(self.tensors[file_id]))
ti = self.tensors[file_id].pop(first_tensor_name)
assert ti.nbytes == tensor.nbytes
self.write_padding(fout, fout.tell())
tensor.tofile(fout)
self.write_padding(fout, tensor.nbytes)
self.state = WriterState.WEIGHTS
@ -293,31 +371,43 @@ class GGUFWriter:
assert self.fout is not None
self.write_padding(self.fout, self.fout.tell())
for fout in self.fout:
self.write_padding(fout, fout.tell())
if self.temp_file is None:
shard_bar = None
bar = None
if progress:
from tqdm import tqdm
total_bytes = sum(t.nbytes for t in self.tensors.values())
total_bytes = sum(ti.nbytes for t in self.tensors for ti in t.values())
if len(self.fout) > 1:
shard_bar = tqdm(desc=f"Shard (0/{len(self.fout)})", total=None, unit="byte", unit_scale=True)
bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
for i, (fout, tensors) in enumerate(zip(self.fout, self.tensors)):
if shard_bar is not None:
shard_bar.set_description(f"Shard ({i + 1}/{len(self.fout)})")
total = sum(ti.nbytes for ti in tensors.values())
shard_bar.reset(total=(total if total > 0 else None))
# relying on the fact that Python dicts preserve insertion order (since 3.7)
for ti in self.tensors.values():
for ti in tensors.values():
assert ti.tensor is not None # can only iterate once over the tensors
assert ti.tensor.nbytes == ti.nbytes
ti.tensor.tofile(self.fout)
ti.tensor.tofile(fout)
if shard_bar is not None:
shard_bar.update(ti.nbytes)
if bar is not None:
bar.update(ti.nbytes)
self.write_padding(self.fout, ti.nbytes)
self.write_padding(fout, ti.nbytes)
ti.tensor = None
else:
self.temp_file.seek(0)
shutil.copyfileobj(self.temp_file, self.fout)
shutil.copyfileobj(self.temp_file, self.fout[0 if not self.small_first_shard else 1])
self.flush()
self.temp_file.close()
@ -325,11 +415,13 @@ class GGUFWriter:
def flush(self) -> None:
assert self.fout is not None
self.fout.flush()
for fout in self.fout:
fout.flush()
def close(self) -> None:
if self.fout is not None:
self.fout.close()
for fout in self.fout:
fout.close()
self.fout = None
def add_architecture(self) -> None:
@ -626,6 +718,13 @@ class GGUFWriter:
return kv_data
def _write_packed(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None:
assert self.fout is not None
self.fout.write(self._pack(fmt, value, skip_pack_prefix))
@staticmethod
def format_n_bytes_to_str(num: int) -> str:
if num == 0:
return "negligible - metadata only"
fnum = float(num)
for unit in ("", "K", "M", "G"):
if abs(fnum) < 1000.0:
return f"{fnum:3.1f}{unit}"
fnum /= 1000.0
return f"{fnum:.1f}T - over 1TB, split recommended"


@ -208,7 +208,9 @@ def translate_tensor_name(name):
'ssm_d': 'State space model skip connection',
'ssm_dt': 'State space model time step',
'ssm_out': 'State space model output projection',
'blk': 'Block'
'blk': 'Block',
'enc': 'Encoder',
'dec': 'Decoder',
}
expanded_words = []
@ -291,6 +293,10 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
tensor_group_name = "base"
if tensor_components[0] == 'blk':
tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}"
elif tensor_components[0] in ['enc', 'dec'] and tensor_components[1] == 'blk':
tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}.{tensor_components[2]}"
elif tensor_components[0] in ['enc', 'dec']:
tensor_group_name = f"{tensor_components[0]}"
# Check if new Tensor Group
if tensor_group_name not in tensor_groups:
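The extended grouping handles encoder/decoder models as follows; the tensor names are the editor's illustrative T5-style examples:

# "blk.3.ffn_up.weight"            -> group "blk.3"
# "enc.blk.0.attn_q.weight"        -> group "enc.blk.0"
# "dec.blk.5.cross_attn_k.weight"  -> group "dec.blk.5"
# "enc.output_norm.weight"         -> group "enc"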
@ -313,6 +319,27 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
markdown_content += "\n"
markdown_content += "### Tensor Data Offset\n"
markdown_content += '\n'
markdown_content += 'This table contains the offset and size of each tensor data segment, relative to the start of the file\n'
markdown_content += '\n'
tensor_mapping_table: list[dict[str, str | int]] = []
for key, tensor in enumerate(reader.tensors):
data_offset_pretty = '{0:#16x}'.format(tensor.data_offset)
data_size_pretty = '{0:#16x}'.format(tensor.n_bytes)
tensor_mapping_table.append({"t_id":key, "layer_name":tensor.name, "data_offset":data_offset_pretty, "data_size":data_size_pretty})
tensors_mapping_table_header_map = [
{'key_name':'t_id', 'header_name':'T_ID', 'align':'right'},
{'key_name':'layer_name', 'header_name':'Tensor Layer Name', 'align':'left'},
{'key_name':'data_offset', 'header_name':'Data Offset (B)', 'align':'right'},
{'key_name':'data_size', 'header_name':'Data Size (B)', 'align':'right'},
]
markdown_content += markdown_table_with_alignment_support(tensors_mapping_table_header_map, tensor_mapping_table)
markdown_content += "\n"
for group in tensor_prefix_order:
tensors = tensor_groups[group]
group_elements = sum(tensor.n_elements for tensor in tensors)
@ -364,6 +391,8 @@ def main() -> None:
parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
parser.add_argument("--json", action="store_true", help="Produce JSON output")
parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
parser.add_argument("--data-offset", action="store_true", help="Start of data offset")
parser.add_argument("--data-alignment", action="store_true", help="Data alignment applied globally to data field")
parser.add_argument("--markdown", action="store_true", help="Produce markdown output")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
@ -371,7 +400,7 @@ def main() -> None:
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
if not args.json and not args.markdown:
if not args.json and not args.markdown and not args.data_offset and not args.data_alignment:
logger.info(f'* Loading: {args.model}')
reader = GGUFReader(args.model, 'r')
@ -380,6 +409,10 @@ def main() -> None:
dump_metadata_json(reader, args)
elif args.markdown:
dump_markdown_metadata(reader, args)
elif args.data_offset:
print(reader.data_offset) # noqa: NP100
elif args.data_alignment:
print(reader.alignment) # noqa: NP100
else:
dump_metadata(reader, args)
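Because each of the new flags prints a single integer, the script composes well with other tooling. A hedged sketch; the script path is the editor's assumption:

import subprocess

def query(flag: str, model: str) -> int:
    # hypothetical path to this script; adjust to your checkout
    out = subprocess.check_output(
        ["python3", "gguf-py/scripts/gguf_dump.py", flag, model])
    return int(out)

print(query("--data-offset", "model.gguf"), query("--data-alignment", "model.gguf"))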


@ -19363,10 +19363,10 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<|im_start|>assistant\n";
}
} else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
} else if (tmpl == "llama2" || tmpl == "mistral" || tmpl.find("[INST]") != std::string::npos) {
// llama2 template and its variants
// [variant] support system message
bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos || tmpl == "mistral";
// [variant] space before + after response
bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
// [variant] add BOS inside history


@ -7,6 +7,7 @@
#include <cassert>
#include "llama.h"
#include "common.h"
int main(void) {
llama_chat_message conversation[] = {
@ -119,5 +120,24 @@ int main(void) {
std::cout << output << "\n-------------------------\n";
assert(output == expected);
}
// test llama_chat_format_single
std::cout << "\n\n=== llama_chat_format_single ===\n\n";
std::vector<llama_chat_msg> chat2;
chat2.push_back({"system", "You are a helpful assistant"});
chat2.push_back({"user", "Hello"});
chat2.push_back({"assistant", "I am assistant"});
llama_chat_msg new_msg{"user", "How are you"};
auto fmt_single = [&](std::string tmpl) {
auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n";
return output;
};
assert(fmt_single("chatml") == "<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
assert(fmt_single("llama2") == "[INST] How are you [/INST]");
assert(fmt_single("gemma") == "<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
return 0;
}