Merge branch 'ggerganov:master' into jukofyork-command_r-control-vector-fix
Commit ed90e43c70: 51 changed files with 24,218 additions and 23,412 deletions.
.github/workflows/docker.yml (vendored, 6 lines changed)

@@ -33,15 +33,13 @@ jobs:
 - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
 - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
 - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
-# have disabled them for now until the reason why
-# is understood.
 - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
 - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
 - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
 - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
 - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+# Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
+#- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
 - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
 - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
 steps:
.github/workflows/server.yml (vendored, 2 lines changed)

@@ -30,7 +30,7 @@ jobs:
 strategy:
   matrix:
-    sanitizer: [ADDRESS, THREAD, UNDEFINED]
+    sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
     build_type: [RelWithDebInfo]
     include:
       - build_type: Release
CMakeLists.txt

@@ -102,7 +102,8 @@ option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM"
 option(LLAMA_CUDA "llama: use CUDA" OFF)
 option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
-option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
+option(LLAMA_CUDA_FORCE_MMQ "llama: always use mmq kernels instead of cuBLAS" OFF)
+option(LLAMA_CUDA_FORCE_CUBLAS "llama: always use cuBLAS instead of mmq kernels" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)

@@ -144,9 +145,6 @@ option(LLAMA_BUILD_SERVER "llama: build server example"
 option(LLAMA_LASX "llama: enable lasx" ON)
 option(LLAMA_LSX "llama: enable lsx" ON)

-# add perf arguments
-option(LLAMA_PERF "llama: enable perf" OFF)
-
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)

@@ -419,13 +417,14 @@ if (LLAMA_CUDA)
 if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
     # 52 == lowest CUDA 12 standard
-    # 60 == f16 CUDA intrinsics
+    # 60 == FP16 CUDA intrinsics
     # 61 == integer CUDA intrinsics
-    # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
+    # 70 == FP16 tensor cores
+    # 75 == int8 tensor cores
     if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
-        set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
+        set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
     else()
-        set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+        set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
         #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
     endif()
 endif()

@@ -450,6 +449,9 @@ if (LLAMA_CUDA)
 if (LLAMA_CUDA_FORCE_MMQ)
     add_compile_definitions(GGML_CUDA_FORCE_MMQ)
 endif()
+if (LLAMA_CUDA_FORCE_CUBLAS)
+    add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
+endif()
 if (LLAMA_CUDA_NO_VMM)
     add_compile_definitions(GGML_CUDA_NO_VMM)
 endif()

@@ -870,10 +872,6 @@ if (LLAMA_CPU_HBM)
     target_link_libraries(ggml PUBLIC memkind)
 endif()

-if (LLAMA_PERF)
-    add_compile_definitions(GGML_PERF)
-endif()
-
 function(get_flags CCID CCVER)
     set(C_FLAGS "")
     set(CXX_FLAGS "")
Makefile (6 lines changed)

@@ -344,9 +344,6 @@ ifdef LLAMA_GPROF
 MK_CFLAGS += -pg
 MK_CXXFLAGS += -pg
 endif
-ifdef LLAMA_PERF
-MK_CPPFLAGS += -DGGML_PERF
-endif

 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures

@@ -540,6 +537,9 @@ endif # LLAMA_CUDA_FORCE_DMMV
 ifdef LLAMA_CUDA_FORCE_MMQ
 MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
 endif # LLAMA_CUDA_FORCE_MMQ
+ifdef LLAMA_CUDA_FORCE_CUBLAS
+MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS
+endif # LLAMA_CUDA_FORCE_CUBLAS
 ifdef LLAMA_CUDA_DMMV_X
 MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
@@ -511,7 +511,8 @@ Building the program with BLAS support may lead to some performance improvements
 | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
 | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
-| LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. | |
+| LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). Speed for large batch sizes will be worse but VRAM consumption will be lower. |
+| LLAMA_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
 | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
 | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
(One file's diff is suppressed because it is too large.)
common/common.h

@@ -52,6 +52,12 @@ int32_t cpu_get_num_math();
 // CLI argument parsing
 //

+// dimensionality reduction methods, used by cvector-generator
+enum dimre_method {
+    DIMRE_METHOD_PCA,
+    DIMRE_METHOD_MEAN,
+};
+
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

@@ -152,7 +158,6 @@ struct gpt_params {
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

-    bool embedding = false; // get only sentence embedding
     bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles

@@ -179,6 +184,12 @@ struct gpt_params {
     std::string mmproj = ""; // path to multimodal projector
     std::vector<std::string> image; // path to image file(s)

+    // embedding
+    bool embedding = false; // get only sentence embedding
+    int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
+    std::string embd_sep = "\n"; // separator of embendings
+
     // server params
     int32_t port = 8080; // server listens on this network port
     int32_t timeout_read = 600; // http read timeout in seconds

@@ -233,11 +244,10 @@ struct gpt_params {
     bool compute_ppl = true; // whether to compute perplexity

     // cvector-generator params
-    int n_completions = 64;
-    int n_pca_batch = 20;
+    int n_pca_batch = 100;
     int n_pca_iterations = 1000;
+    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
     std::string cvector_outfile = "control_vector.gguf";
-    std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
     std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
     std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
 };

@@ -360,9 +370,32 @@ bool llama_should_add_bos_token(const llama_model * model);
 //
 // Chat template utils
 //

+// same with llama_chat_message, but uses std::string
+struct llama_chat_msg {
+    std::string role;
+    std::string content;
+};
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool llama_chat_verify_template(const std::string & tmpl);

+// CPP wrapper for llama_chat_apply_template
+std::string llama_chat_apply_template(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & chat,
+        bool add_ass);
+
+// Format single message, while taking into account the position of that message in chat history
+std::string llama_chat_format_single(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & past_msg,
+        const llama_chat_msg & new_msg,
+        bool add_ass);
+
+// Returns an example of formatted chat
+std::string llama_chat_format_example(const struct llama_model * model,
+        const std::string & tmpl);
+
 //
 // KV cache utils
 //

@@ -377,7 +410,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size
 //
 // Embedding utils
 //

-void llama_embd_normalize(const float * inp, float * out, int n);
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

 float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
convert-hf-to-gguf.py

@@ -65,7 +65,8 @@ class Model:
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH

-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool,
+                 model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
         self.dir_model = dir_model

@@ -80,7 +81,7 @@ class Model:
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
         if self.ftype == gguf.LlamaFileType.GUESSED:

@@ -96,7 +97,8 @@ class Model:
             ftype_lw: str = ftype_up.lower()
             # allow templating the file name with the output ftype, useful with the "auto" ftype
             self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
-        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
+                                           split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)

     @classmethod
     def __init_subclass__(cls):

@@ -332,6 +334,8 @@ class Model:
         self.gguf_writer.close()

     def write_vocab(self):
+        if len(self.gguf_writer.tensors) != 1:
+            raise ValueError('Splitting the vocabulary is not supported')
         self.gguf_writer.write_header_to_file(self.fname_out)
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.close()

@@ -973,8 +977,6 @@ class XverseModel(Model):
         if max_vocab_index >= vocab_size:
             raise ValueError("Vocabulary size exceeds expected maximum size.")

-
-
         reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()
|
||||||
raise ValueError(f"Unprocessed experts: {experts}")
|
raise ValueError(f"Unprocessed experts: {experts}")
|
||||||
|
|
||||||
|
|
||||||
|
@Model.register("BitnetForCausalLM")
|
||||||
|
class BitnetModel(Model):
|
||||||
|
model_arch = gguf.MODEL_ARCH.BITNET
|
||||||
|
|
||||||
|
def set_vocab(self):
|
||||||
|
self._set_vocab_sentencepiece()
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
super().set_gguf_parameters()
|
||||||
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||||
|
self.gguf_writer.add_rope_scaling_factor(1.0)
|
||||||
|
|
||||||
|
def weight_quant(self, weight):
|
||||||
|
dtype = weight.dtype
|
||||||
|
weight = weight.float()
|
||||||
|
s = 1 / weight.abs().mean().clamp(min=1e-5)
|
||||||
|
weight = (weight * s).round().clamp(-1, 1) / s
|
||||||
|
scale = weight.abs().max().unsqueeze(0)
|
||||||
|
weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
|
||||||
|
weight = torch.sign(weight).type(dtype)
|
||||||
|
return weight.type(dtype), scale.type(torch.float32)
|
||||||
|
|
||||||
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
new_name = self.map_tensor_name(name)
|
||||||
|
|
||||||
|
if any(self.match_model_tensor_name(new_name, key, bid) for key in [
|
||||||
|
gguf.MODEL_TENSOR.ATTN_Q,
|
||||||
|
gguf.MODEL_TENSOR.ATTN_K,
|
||||||
|
gguf.MODEL_TENSOR.ATTN_V,
|
||||||
|
gguf.MODEL_TENSOR.ATTN_OUT,
|
||||||
|
gguf.MODEL_TENSOR.FFN_UP,
|
||||||
|
gguf.MODEL_TENSOR.FFN_DOWN,
|
||||||
|
gguf.MODEL_TENSOR.FFN_GATE,
|
||||||
|
]):
|
||||||
|
# transform weight into 1/0/-1 (in fp32)
|
||||||
|
weight_torch, scale_torch = self.weight_quant(data_torch)
|
||||||
|
yield (new_name, weight_torch)
|
||||||
|
yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
|
||||||
|
else:
|
||||||
|
yield (new_name, data_torch)
|
||||||
|
|
||||||
|
|
||||||
@Model.register("GrokForCausalLM")
|
@Model.register("GrokForCausalLM")
|
||||||
class GrokModel(Model):
|
class GrokModel(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.GROK
|
model_arch = gguf.MODEL_ARCH.GROK
|
||||||
|
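The `weight_quant` method above snaps each weight to the ternary set {-1, 0, +1} after scaling by the inverse of its mean absolute value, and emits a single fp32 scale per tensor. The following is a standalone copy of that routine run on a small made-up tensor, just to show what it produces (PyTorch assumed available, as it already is for the convert script):

```python
import torch

# Copied from BitnetModel.weight_quant in the diff above, as a free function.
def weight_quant(weight: torch.Tensor):
    dtype = weight.dtype
    weight = weight.float()
    s = 1 / weight.abs().mean().clamp(min=1e-5)     # inverse of the mean |w|
    weight = (weight * s).round().clamp(-1, 1) / s  # snap to {-1/s, 0, +1/s}
    scale = weight.abs().max().unsqueeze(0)         # one fp32 scale per tensor
    weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
    weight = torch.sign(weight).type(dtype)         # keep only the ternary sign
    return weight.type(dtype), scale.type(torch.float32)

w = torch.tensor([[0.40, -0.03, -0.90],
                  [0.20,  0.70, -0.10]])  # invented example values
ternary, scale = weight_quant(w)
print(ternary)  # entries are -1, 0 or +1
print(scale)    # single-element scale tensor
```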
@@ -2731,6 +2775,124 @@ class DeepseekV2Model(Model):
             raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("T5ForConditionalGeneration")
+@Model.register("T5WithLMHeadModel")
+class T5Model(Model):
+    model_arch = gguf.MODEL_ARCH.T5
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if (token_id >= vocab_size):
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("T5")
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # Sometimes T5 and Flan-T5 based models contain "encoder.embed_tokens.weight" tensor or
+        # "decoder.embed_tokens.weight" tensors that are duplicates of "shared.weight" tensor
+        # To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight".
+        if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight":
+            logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 ###### CONVERSION LOGIC ######
@@ -2816,10 +2978,44 @@ def parse_args() -> argparse.Namespace:
         "--verbose", action="store_true",
         help="increase output verbosity",
     )
+    parser.add_argument(
+        "--split-max-tensors", type=int, default=0,
+        help="max tensors in each split",
+    )
+    parser.add_argument(
+        "--split-max-size", type=str, default="0",
+        help="max size per split N(M|G)",
+    )
+    parser.add_argument(
+        "--dry-run", action="store_true",
+        help="only print out a split plan and exit, without writing any new files",
+    )
+    parser.add_argument(
+        "--no-tensor-first-split", action="store_true",
+        help="do not add tensors to the first split (disabled by default)"
+    )

     return parser.parse_args()


+def split_str_to_n_bytes(split_str: str) -> int:
+    if split_str.endswith("K"):
+        n = int(split_str[:-1]) * 1000
+    elif split_str.endswith("M"):
+        n = int(split_str[:-1]) * 1000 * 1000
+    elif split_str.endswith("G"):
+        n = int(split_str[:-1]) * 1000 * 1000 * 1000
+    elif split_str.isnumeric():
+        n = int(split_str)
+    else:
+        raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
+
+    if n < 0:
+        raise ValueError(f"Invalid split size: {split_str}, must be positive")
+
+    return n
+
+
 def main() -> None:
     args = parse_args()

@@ -2852,6 +3048,10 @@ def main() -> None:
         "auto": gguf.LlamaFileType.GUESSED,
     }

+    if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"):
+        logger.error("Error: Cannot use temp file when splitting")
+        sys.exit(1)
+
     if args.outfile is not None:
         fname_out = args.outfile
     else:

@@ -2869,7 +3069,10 @@ def main() -> None:
         logger.error(f"Model {hparams['architectures'][0]} is not supported")
         sys.exit(1)

-    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
+    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file,
+                                 args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors,
+                                 split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
+                                 small_first_shard=args.no_tensor_first_split)

     logger.info("Set model parameters")
     model_instance.set_gguf_parameters()

@@ -2880,13 +3083,13 @@ def main() -> None:
     model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)

     if args.vocab_only:
-        logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
+        logger.info("Exporting model vocab...")
         model_instance.write_vocab()
+        logger.info("Model vocab successfully exported.")
     else:
-        logger.info(f"Exporting model to '{model_instance.fname_out}'")
+        logger.info("Exporting model...")
         model_instance.write()
-
-    logger.info(f"Model successfully exported to '{model_instance.fname_out}'")
+        logger.info("Model successfully exported.")


 if __name__ == '__main__':
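Note that `split_str_to_n_bytes` uses decimal multipliers (K = 1000 bytes, not 1024). A standalone copy of the parser with a few example inputs shows the accepted forms:

```python
def split_str_to_n_bytes(split_str: str) -> int:
    # same parsing rules as the function added in the diff above
    if split_str.endswith("K"):
        n = int(split_str[:-1]) * 1000
    elif split_str.endswith("M"):
        n = int(split_str[:-1]) * 1000 * 1000
    elif split_str.endswith("G"):
        n = int(split_str[:-1]) * 1000 * 1000 * 1000
    elif split_str.isnumeric():
        n = int(split_str)
    else:
        raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
    if n < 0:
        raise ValueError(f"Invalid split size: {split_str}, must be positive")
    return n

print(split_str_to_n_bytes("250K"))   # 250000
print(split_str_to_n_bytes("4G"))     # 4000000000
print(split_str_to_n_bytes("12345"))  # 12345
```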
examples/cvector-generator/README.md

@@ -11,13 +11,16 @@ Related PRs:

 ```sh
 # CPU only
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf
+./cvector-generator -m ./llama-3.Q4_K_M.gguf

 # With GPU
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
+./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99

 # With advanced options
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100
+./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100

+# Using mean value instead of PCA
+./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean
+
 # To see help message
 ./cvector-generator -h

@@ -32,3 +35,11 @@ If you have multiple lines per prompt, you can escape the newline character (cha
 <|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
 <|im_start|>system\nYou are in a very good mood today<|im_end|>
 ```

+Example to use output file with `llama-cli`:
+
+(Tip: the control vector works better when applied to layers higher than 10)
+
+```sh
+./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31
+```
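For reference, the new `--method mean` option reduces each layer's collected positive-minus-negative hidden-state differences to a single direction. As I read the `mean.hpp` code later in this diff, the per-layer computation amounts to:

$$ v_\ell = \frac{1}{N}\sum_{i=1}^{N}\bigl(h_\ell^{+}(i) - h_\ell^{-}(i)\bigr), \qquad \hat v_\ell = \frac{v_\ell}{\lVert v_\ell \rVert_2} $$

where $h_\ell^{\pm}(i)$ are the hidden states of the $i$-th positive/negative prompt pair at layer $\ell$, and the unit-length $\hat v_\ell$ is written out as that layer's direction tensor in the control-vector GGUF.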
examples/cvector-generator/cvector-generator.cpp

@@ -2,6 +2,7 @@
 #include "llama.h"
 #include "ggml.h"
 #include "pca.hpp"
+#include "mean.hpp"

 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"

@@ -38,9 +39,10 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
     gpt_params_print_usage(argc, argv, params);

     printf("\nexample usage:\n");
-    printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
-    printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
-    printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100\n", argv[0]);
+    printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
+    printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
+    printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
+    printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
     printf("\n");
 }

@@ -223,17 +225,20 @@ struct train_context {

     // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
     // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
-    void build_v_diff() {
+    void build_v_diff(bool transpose) {
         printf("build_v_diff\n");
         for (int il = 0; il < n_layers - 1; il++) {
             auto & diff_tmp = v_diff_tmp[il];
             int n_elem = diff_tmp.size() / sizeof(float);
             GGML_ASSERT(n_elem % n_embd == 0);
             int n_rows = n_elem / n_embd;
-            struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
+            struct ggml_tensor * diff = transpose
+                ? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
+                : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
             ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
-            // copy data & transpose
             diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
+            if (transpose) {
+                // copy data & transpose
                 float * arr = (float *) diff_tmp.data();
                 for (int ir = 0; ir < n_rows; ++ir) {
                     for (int ic = 0; ic < n_embd; ++ic) {

@@ -241,6 +246,10 @@ struct train_context {
                         ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
                     }
                 }
+            } else {
+                // only copy
+                memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
+            }
             v_diff.push_back(diff);
             print_debug_tensor(diff);
             // free memory of diff_tmp

@@ -263,8 +272,8 @@ struct tokenized_prompt {

     tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
         const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-        tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
-        tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
+        tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
+        tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
         max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
         padding_seq(ctx, tokens_pos, max_seq_len);
         padding_seq(ctx, tokens_neg, max_seq_len);

@@ -373,20 +382,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
         fprintf(stderr, "must provide at least one prompt pair\n");
         return 1;
     }
-
-    // create templated prompts
-    std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
-    auto format_template = [](std::string persona, std::string suffix) {
-        // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST] "
-        return persona + suffix;
-    };
-    for (size_t i = 0; i < positive_prompts.size(); ++i) {
-        for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
-            // TODO replicate the truncations done by the python implementation
-            ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
-            ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
-        }
-    }
+    ctx_train.positive_entries = positive_prompts;
+    ctx_train.negative_entries = negative_prompts;
     return 0;
 }

@@ -480,15 +477,22 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
     llama_free_model(model);

-    // prepare ctx_train for PCA
-    ctx_train.build_v_diff();
+    bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
+
+    // prepare ctx_train for PCA
+    ctx_train.build_v_diff(use_pca);
+
+    if (use_pca) {
         // run PCA
         PCA::pca_params pca_params;
         pca_params.n_threads = params.n_threads;
         pca_params.n_batch = params.n_pca_batch;
         pca_params.n_iterations = params.n_pca_iterations;
         PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
+    } else {
+        // run mean
+        mean::run(ctx_train.v_diff, ctx_train.v_final);
+    }

     // write output vectors to gguf
     export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
examples/cvector-generator/mean.hpp (new file, 48 lines)

@@ -0,0 +1,48 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <string>
+#include <vector>
+#include <math.h>
+
+namespace mean {
+
+static void run(
+        const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_embd, n_samples]
+        const std::vector<struct ggml_tensor *> & v_output) {
+    printf("%s: Running mean...\n", __func__);
+    for (size_t il = 0; il < v_input.size(); ++il) {
+        // prepare output vector
+        struct ggml_tensor * ctrl_out = v_output[il];
+        ggml_format_name(ctrl_out, "direction.%ld", il+1);
+
+        // calculate mean vector
+        struct ggml_tensor * t_layer = v_input[il];
+        GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd
+        for (int ic = 0; ic < t_layer->ne[0]; ic++) {
+            float f = 0.0;
+            for (int ir = 0; ir < t_layer->ne[1]; ir++) {
+                f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0);
+            }
+            f /= t_layer->ne[1];
+            ggml_set_f32_1d(ctrl_out, ic, f);
+        }
+
+        // normalize output vector
+        float norm = 0.0;
+        for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
+            float f = ggml_get_f32_1d(ctrl_out, i);
+            norm += f*f;
+        }
+        norm = sqrt(norm);
+        for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
+            float f = ggml_get_f32_1d(ctrl_out, i);
+            ggml_set_f32_1d(ctrl_out, i, f / norm);
+        }
+
+        printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
+    }
+}
+
+}
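A compact numpy rendering of what `mean::run` does for one layer, with invented sample data (shapes follow the `[n_embd, n_samples]` comment in the header):

```python
import numpy as np

def mean_direction(layer_diffs: np.ndarray) -> np.ndarray:
    """Mirror of mean::run for one layer; layer_diffs has shape (n_embd, n_samples)."""
    direction = layer_diffs.mean(axis=1)           # average over samples, as the first loop does
    return direction / np.linalg.norm(direction)   # L2-normalize, as the second loop does

# toy data: 4-dimensional embeddings, 3 difference vectors
diffs = np.array([[ 0.9,  1.1,  1.0],
                  [ 0.1, -0.1,  0.0],
                  [ 0.5,  0.4,  0.6],
                  [-0.2, -0.3, -0.1]])
print(mean_direction(diffs))  # a unit-length per-layer control-vector direction
```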
examples/cvector-generator/negative.txt

@@ -1 +1,4 @@
-[INST] Act like a person who is extremely sad. [/INST]
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
+<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me
+<|start_header_id|>system<|end_header_id|>\n\nYou are the sadest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
examples/cvector-generator/pca.hpp

@@ -290,7 +290,7 @@ static void power_iteration(
         }

         printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
-            __func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
+            __func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
     }

     // get output tensor

@@ -298,6 +298,9 @@ static void power_iteration(
     ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
     //print_debug_tensor(output);
     ggml_gallocr_free(allocr);
+
+    // TODO @ngxson : The output vector is randomly inverted
+    // Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
 }

 static void run_pca(
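The new TODO points at an inherent property of this approach: if v is a dominant eigenvector then so is -v, and power iteration converges to one or the other depending on the random starting vector. A small numpy illustration of the idea (not the pca.hpp implementation):

```python
import numpy as np

A = np.array([[4.0, 1.0],
              [1.0, 3.0]])  # symmetric toy matrix with a dominant eigenvector

def power_iteration(A: np.ndarray, n_iters: int = 100, seed: int = 0) -> np.ndarray:
    v = np.random.default_rng(seed).normal(size=A.shape[0])  # random start
    for _ in range(n_iters):
        v = A @ v
        v /= np.linalg.norm(v)
    return v

v1 = power_iteration(A, seed=0)
v2 = power_iteration(A, seed=3)
print(v1, v2)                         # may differ by an overall sign
print(np.isclose(abs(v1 @ v2), 1.0))  # but they span the same direction
```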
examples/cvector-generator/positive.txt

@@ -1 +1,4 @@
-[INST] Act like a person who is extremely happy. [/INST]
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever!
+<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you
+<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now!
examples/embedding/README.md

@@ -19,3 +19,43 @@ llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
 ```

 The above command will output space-separated float values.
+
+## extra parameters
+### --embd-normalize $integer$
+
+| $integer$ | description         | formula |
+|-----------|---------------------|---------|
+| $-1$      | none                |
+| $0$       | max absolute int16  | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$
+| $1$       | taxicab             | $\Large{x_i \over\sum \lvert x_i\rvert}$
+| $2$       | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$
+| $>2$      | p-norm              | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$
+
+### --embd-output-format $'string'$
+
+| $'string'$ | description                  | |
+|------------|------------------------------|--|
+| ''         | same as before               | (default)
+| 'array'    | single embeddings            | $[[x_1,...,x_n]]$
+|            | multiple embeddings          | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
+| 'json'     | openai style                 |
+| 'json+'    | add cosine similarity matrix |
+
+### --embd-separator $"string"$
+
+| $"string"$   | |
+|--------------|-|
+| "\n"         | (default)
+| "<#embSep#>" | for example
+| "<#sep#>"    | another example
+
+## examples
+### Unix-based systems (Linux, macOS, etc.):
+
+```bash
+./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
+```
+
+### Windows:
+
+```powershell
+embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
+```
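The `--embd-normalize` formulas in the new table map directly onto simple vector operations; a short numpy sketch of the same options (sample vector invented):

```python
import numpy as np

def embd_normalize(x: np.ndarray, norm: int) -> np.ndarray:
    # mirrors the table: -1 none, 0 max-absolute int16, 1 taxicab, 2 euclidean, >2 p-norm
    if norm == -1:
        return x
    if norm == 0:
        return 32760 * x / np.max(np.abs(x))
    if norm == 1:
        return x / np.sum(np.abs(x))
    if norm == 2:
        return x / np.sqrt(np.sum(x * x))
    return x / np.sum(np.abs(x) ** norm) ** (1.0 / norm)

v = np.array([3.0, -4.0, 12.0])
print(embd_normalize(v, 2))  # unit Euclidean norm
print(embd_normalize(v, 0))  # scaled so the largest |component| is 32760
```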
@ -7,13 +7,19 @@
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static std::vector<std::string> split_lines(const std::string & s) {
|
static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
|
||||||
std::string line;
|
|
||||||
std::vector<std::string> lines;
|
std::vector<std::string> lines;
|
||||||
std::stringstream ss(s);
|
size_t start = 0;
|
||||||
while (std::getline(ss, line)) {
|
size_t end = s.find(separator);
|
||||||
lines.push_back(line);
|
|
||||||
|
while (end != std::string::npos) {
|
||||||
|
lines.push_back(s.substr(start, end - start));
|
||||||
|
start = end + separator.length();
|
||||||
|
end = s.find(separator, start);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
lines.push_back(s.substr(start)); // Add the last part
|
||||||
|
|
||||||
return lines;
|
return lines;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -24,7 +30,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);

@@ -44,13 +50,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");

         float * out = output + batch.seq_id[i][0] * n_embd;
-        //TODO: I would also add a parameter here to enable normalization or not.
-        /*fprintf(stdout, "unnormalized_embedding:");
-        for (int hh = 0; hh < n_embd; hh++) {
-            fprintf(stdout, "%9.6f ", embd[hh]);
-        }
-        fprintf(stdout, "\n");*/
-        llama_embd_normalize(embd, out, n_embd);
+        llama_embd_normalize(embd, out, n_embd, embd_norm);
     }
 }

@@ -110,7 +110,7 @@ int main(int argc, char ** argv) {
     }

     // split the prompt into lines
-    std::vector<std::string> prompts = split_lines(params.prompt);
+    std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);

     // max batch size
     const uint64_t n_batch = params.n_batch;

@@ -170,7 +170,7 @@ int main(int argc, char ** argv) {
         // encode if at capacity
         if (batch.n_tokens + n_toks > n_batch) {
             float * out = emb + p * n_embd;
-            batch_decode(ctx, batch, out, s, n_embd);
+            batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
             llama_batch_clear(batch);
             p += s;
             s = 0;

@@ -183,15 +183,20 @@ int main(int argc, char ** argv) {

     // final batch
     float * out = emb + p * n_embd;
-    batch_decode(ctx, batch, out, s, n_embd);
+    batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);

+    if (params.embd_out.empty()) {
         // print the first part of the embeddings or for a single prompt, the full embedding
         fprintf(stdout, "\n");
         for (int j = 0; j < n_prompts; j++) {
             fprintf(stdout, "embedding %d: ", j);
             for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+                if (params.embd_normalize == 0) {
+                    fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+                } else {
                     fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
                 }
+            }
             fprintf(stdout, "\n");
         }

@@ -199,14 +204,58 @@ int main(int argc, char ** argv) {
         if (n_prompts > 1) {
             fprintf(stdout, "\n");
             printf("cosine similarity matrix:\n\n");
+            for (int i = 0; i < n_prompts; i++) {
+                fprintf(stdout, "%6.6s ", prompts[i].c_str());
+            }
+            fprintf(stdout, "\n");
             for (int i = 0; i < n_prompts; i++) {
                 for (int j = 0; j < n_prompts; j++) {
                     float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
                     fprintf(stdout, "%6.2f ", sim);
                 }
+                fprintf(stdout, "%1.10s", prompts[i].c_str());
                 fprintf(stdout, "\n");
             }
         }
+    }
+
+    if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
+        const bool notArray = params.embd_out != "array";
+
+        fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
+        for (int j = 0;;) { // at least one iteration (one prompt)
+            if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
+            fprintf(stdout, "[");
+            for (int i = 0;;) { // at least one iteration (n_embd > 0)
+                fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                i++;
+                if (i < n_embd) fprintf(stdout, ","); else break;
+            }
+            fprintf(stdout, notArray ? "]\n }" : "]");
+            j++;
+            if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
+        }
+        fprintf(stdout, notArray ? "\n ]" : "]\n");
+
+        if (params.embd_out == "json+" && n_prompts > 1) {
+            fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
+            for (int i = 0;;) { // at least two iteration (n_prompts > 1)
+                fprintf(stdout, " [");
+                for (int j = 0;;) { // at least two iteration (n_prompts > 1)
+                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    fprintf(stdout, "%6.2f", sim);
+                    j++;
+                    if (j < n_prompts) fprintf(stdout, ", "); else break;
+                }
+                fprintf(stdout, " ]");
+                i++;
+                if (i < n_prompts) fprintf(stdout, ",\n"); else break;
+            }
+            fprintf(stdout, "\n ]");
+        }
+
+        if (notArray) fprintf(stdout, "\n}\n");
+    }

     // clean up
     llama_print_timings(ctx);
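The embd_norm value threaded through batch_decode() above selects how llama_embd_normalize() scales each embedding before it is written out. As a minimal, self-contained sketch of the assumed semantics (the real implementation lives in common/common.cpp, which is not part of this diff, and may differ in detail; the integer-style "%6.0f" / "%1.0f" printing when embd_normalize == 0 is what suggests the max-absolute case):

    #include <algorithm>
    #include <cmath>

    // Sketch only: assumed mapping of the embd_norm selector used by batch_decode().
    static void embd_normalize_sketch(const float * inp, float * out, int n, int embd_norm) {
        double sum = 0.0;
        switch (embd_norm) {
            case -1: // assumed: no normalization
                sum = 1.0;
                break;
            case 0:  // assumed: max-absolute scaling to an int16-like range, consistent
                     // with the integer-style output when params.embd_normalize == 0
                for (int i = 0; i < n; i++) {
                    sum = std::max(sum, (double) std::fabs(inp[i]));
                }
                sum /= 32760.0; // assumed scale factor
                break;
            case 2:  // assumed default: Euclidean (L2) norm
                for (int i = 0; i < n; i++) {
                    sum += (double) inp[i] * inp[i];
                }
                sum = std::sqrt(sum);
                break;
            default: // assumed: general p-norm
                for (int i = 0; i < n; i++) {
                    sum += std::pow(std::fabs((double) inp[i]), embd_norm);
                }
                sum = std::pow(sum, 1.0 / embd_norm);
                break;
        }
        const float norm = sum > 0.0 ? (float) (1.0 / sum) : 0.0f;
        for (int i = 0; i < n; i++) {
            out[i] = inp[i] * norm;
        }
    }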
@@ -39,12 +39,12 @@ static std::ostringstream * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;

-static bool file_exists(const std::string &path) {
+static bool file_exists(const std::string & path) {
     std::ifstream f(path.c_str());
     return f.good();
 }

-static bool file_is_empty(const std::string &path) {
+static bool file_is_empty(const std::string & path) {
     std::ifstream f;
     f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
     f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);

@@ -117,6 +117,14 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
     LOG_TEE("%s", text);
 }

+static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
+    llama_chat_msg new_msg{role, content};
+    auto formatted = llama_chat_format_single(
+        model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+    chat_msgs.push_back({role, content});
+    return formatted;
+}
+
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;

@@ -190,6 +198,7 @@ int main(int argc, char ** argv) {
     llama_model * model;
     llama_context * ctx;
     llama_context * ctx_guidance = NULL;
+    std::vector<llama_chat_msg> chat_msgs;
     g_model = &model;
     g_ctx = &ctx;

@@ -215,6 +224,8 @@ int main(int argc, char ** argv) {
                 __func__, n_ctx_train, n_ctx);
     }

+    LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+
     // print system information
     {
         LOG_TEE("\n");

@@ -249,16 +260,21 @@ int main(int argc, char ** argv) {

     std::vector<llama_token> embd_inp;

+    {
+        auto prompt = params.conversation
+            ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
+            : params.prompt;
         if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
             LOG("tokenize the prompt\n");
-            embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
+            embd_inp = ::llama_tokenize(ctx, prompt, true, true);
         } else {
             LOG("use session tokens\n");
             embd_inp = session_tokens;
         }

-        LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
+        LOG("prompt: \"%s\"\n", log_tostr(prompt));
         LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+    }

     // Should not run without any tokens
     if (embd_inp.empty()) {

@@ -478,6 +494,7 @@ int main(int argc, char ** argv) {
     std::vector<int> input_tokens; g_input_tokens = &input_tokens;
     std::vector<int> output_tokens; g_output_tokens = &output_tokens;
     std::ostringstream output_ss; g_output_ss = &output_ss;
+    std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode

     // the first thing we will do is to output the prompt, so set color accordingly
     console::set_display(console::prompt);

@@ -793,11 +810,18 @@ int main(int argc, char ** argv) {
                     is_antiprompt = true;
                 }

+                chat_add_and_format(model, chat_msgs, "system", assistant_ss.str());
                 is_interacting = true;
                 printf("\n");
             }
         }

+        // if current token is not EOG, we add it to current assistant message
+        if (params.conversation) {
+            auto id = llama_sampling_last(ctx_sampling);
+            assistant_ss << llama_token_to_piece(ctx, id, false);
+        }
+
         if (n_past > 0 && is_interacting) {
             LOG("waiting for user input\n");

@@ -848,8 +872,12 @@ int main(int argc, char ** argv) {
                     string_process_escapes(buffer);
                 }

+                std::string user_inp = params.conversation
+                    ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
+                    : std::move(buffer);
+                // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
                 const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
+                const auto line_inp = ::llama_tokenize(ctx, user_inp, false, params.conversation);
                 const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);

                 LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

@@ -864,6 +892,9 @@ int main(int argc, char ** argv) {
                     output_ss << llama_token_to_piece(ctx, token);
                 }

+                // reset assistant message
+                assistant_ss.str("");
+
                 n_remain -= line_inp.size();
                 LOG("n_remain: %d\n", n_remain);
             } else {
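Taken together, the conversation-mode changes above follow one pattern: every message (system, user, assistant) is pushed into chat_msgs, while only the freshly formatted suffix returned by chat_add_and_format() is tokenized. The toy program below illustrates that accumulation pattern; the template it prints is invented for the sketch, the real helper delegates to llama_chat_format_single().

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    // Toy stand-in for the chat_add_and_format() helper added above.
    struct toy_msg { std::string role, content; };

    static std::string chat_add_and_format_toy(std::vector<toy_msg> & chat_msgs,
            const std::string & role, const std::string & content) {
        std::ostringstream ss;
        ss << "<|" << role << "|>" << content;
        if (role == "user") {
            ss << "<|assistant|>"; // open the assistant turn, as add_ass does for user input
        }
        chat_msgs.push_back({role, content});
        return ss.str(); // only this suffix would be handed to llama_tokenize()
    }

    int main() {
        std::vector<toy_msg> history;
        std::cout << chat_add_and_format_toy(history, "system", "You are helpful.") << "\n";
        std::cout << chat_add_and_format_toy(history, "user",   "Hello")            << "\n";
        // history now holds both raw messages, so the next turn can be formatted in context
        return 0;
    }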
@@ -634,12 +634,12 @@ return html`
             <div>
             <div class="grammar">
             <label for="template"></label>
-            <textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON-Scheme + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
+            <textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON Schema + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
             </div>
             <div class="grammar-columns">
             <div class="json-schema-controls">
             <input type="text" name="prop-order" placeholder="Order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
-            <button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON-Scheme</button>
+            <button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
             </div>
             </div>
             </div>
@@ -3,6 +3,13 @@

 by Humans for All.

+## quickstart
+
+To run from the build dir
+
+bin/llama-server -m path/model.gguf --path ../examples/server/public_simplechat
+
+Continue reading for the details.

 ## overview

@@ -14,6 +21,8 @@ own system prompts.
 This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated,
 or potentially as it is being generated, in a streamed manner from the server/ai-model.

+
+
 Auto saves the chat session locally as and when the chat is progressing and inturn at a later time when you
 open SimpleChat, option is provided to restore the old chat session, if a matching one exists.

@@ -170,17 +179,23 @@ It is attached to the document object. Some of these can also be updated using t
 The histogram/freq based trimming logic is currently tuned for english language wrt its
 is-it-a-alpabetic|numeral-char regex match logic.

-chatRequestOptions - maintains the list of options/fields to send along with chat request,
+apiRequestOptions - maintains the list of options/fields to send along with api request,
 irrespective of whether /chat/completions or /completions endpoint.

 If you want to add additional options/fields to send to the server/ai-model, and or
 modify the existing options value or remove them, for now you can update this global var
 using browser's development-tools/console.

-For string and numeric fields in chatRequestOptions, including even those added by a user
-at runtime by directly modifying gMe.chatRequestOptions, setting ui entries will be auto
+For string, numeric and boolean fields in apiRequestOptions, including even those added by a
+user at runtime by directly modifying gMe.apiRequestOptions, setting ui entries will be auto
 created.

+cache_prompt option supported by example/server is allowed to be controlled by user, so that
+any caching supported wrt system-prompt and chat history, if usable can get used. When chat
+history sliding window is enabled, cache_prompt logic may or may not kick in at the backend
+wrt same, based on aspects related to model, positional encoding, attention mechanism etal.
+However system prompt should ideally get the benefit of caching.
+
 headers - maintains the list of http headers sent when request is made to the server. By default
 Content-Type is set to application/json. Additionally Authorization entry is provided, which can
 be set if needed using the settings ui.

@@ -197,10 +212,10 @@ It is attached to the document object. Some of these can also be updated using t
     >0 : Send the latest chat history from the latest system prompt, limited to specified cnt.


-By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
-implications of loading of the ai-model's context window by chat history, wrt chat response to
-some extent in a simple crude way. You may also want to control the context size enabled when
-the server loads ai-model, on the server end.
+By using gMe's iRecentUserMsgCnt and apiRequestOptions.max_tokens/n_predict one can try to control
+the implications of loading of the ai-model's context window by chat history, wrt chat response to
+some extent in a simple crude way. You may also want to control the context size enabled when the
+server loads ai-model, on the server end.


 Sometimes the browser may be stuborn with caching of the file, so your updates to html/css/js

@@ -237,12 +252,12 @@ also be started with a model context size of 1k or more, to be on safe side.
 internal n_predict, for now add the same here on the client side, maybe later add max_tokens
 to /completions endpoint handling code on server side.

-NOTE: One may want to experiment with frequency/presence penalty fields in chatRequestOptions
-wrt the set of fields sent to server along with the user query. To check how the model behaves
+NOTE: One may want to experiment with frequency/presence penalty fields in apiRequestOptions
+wrt the set of fields sent to server along with the user query, to check how the model behaves
 wrt repeatations in general in the generated text response.

 A end-user can change these behaviour by editing gMe from browser's devel-tool/console or by
-using the providing settings ui.
+using the provided settings ui (for settings exposed through the ui).


 ### OpenAi / Equivalent API WebService

@@ -253,7 +268,7 @@ for a minimal chatting experimentation by setting the below.
 * the baseUrl in settings ui
     * https://api.openai.com/v1 or similar

-* Wrt request body - gMe.chatRequestOptions
+* Wrt request body - gMe.apiRequestOptions
     * model (settings ui)
     * any additional fields if required in future
|
@ -222,8 +222,8 @@ class SimpleChat {
|
||||||
* @param {Object} obj
|
* @param {Object} obj
|
||||||
*/
|
*/
|
||||||
request_jsonstr_extend(obj) {
|
request_jsonstr_extend(obj) {
|
||||||
for(let k in gMe.chatRequestOptions) {
|
for(let k in gMe.apiRequestOptions) {
|
||||||
obj[k] = gMe.chatRequestOptions[k];
|
obj[k] = gMe.apiRequestOptions[k];
|
||||||
}
|
}
|
||||||
if (gMe.bStream) {
|
if (gMe.bStream) {
|
||||||
obj["stream"] = true;
|
obj["stream"] = true;
|
||||||
|
@ -740,11 +740,12 @@ class Me {
|
||||||
"Authorization": "", // Authorization: Bearer OPENAI_API_KEY
|
"Authorization": "", // Authorization: Bearer OPENAI_API_KEY
|
||||||
}
|
}
|
||||||
// Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
// Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||||
this.chatRequestOptions = {
|
this.apiRequestOptions = {
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
"temperature": 0.7,
|
"temperature": 0.7,
|
||||||
"max_tokens": 1024,
|
"max_tokens": 1024,
|
||||||
"n_predict": 1024,
|
"n_predict": 1024,
|
||||||
|
"cache_prompt": false,
|
||||||
//"frequency_penalty": 1.2,
|
//"frequency_penalty": 1.2,
|
||||||
//"presence_penalty": 1.2,
|
//"presence_penalty": 1.2,
|
||||||
};
|
};
|
||||||
|
@ -800,51 +801,55 @@ class Me {
|
||||||
|
|
||||||
ui.el_create_append_p(`bStream:${this.bStream}`, elDiv);
|
ui.el_create_append_p(`bStream:${this.bStream}`, elDiv);
|
||||||
|
|
||||||
|
ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
|
||||||
|
|
||||||
|
ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
|
||||||
|
|
||||||
|
ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
|
||||||
|
|
||||||
ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv);
|
ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv);
|
||||||
|
|
||||||
ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv);
|
ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv);
|
||||||
|
|
||||||
ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
|
|
||||||
|
|
||||||
ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
|
|
||||||
|
|
||||||
ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ui.el_create_append_p(`chatRequestOptions:${JSON.stringify(this.chatRequestOptions, null, " - ")}`, elDiv);
|
ui.el_create_append_p(`apiRequestOptions:${JSON.stringify(this.apiRequestOptions, null, " - ")}`, elDiv);
|
||||||
ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv);
|
ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Auto create ui input elements for fields in ChatRequestOptions
|
* Auto create ui input elements for fields in apiRequestOptions
|
||||||
* Currently supports text and number field types.
|
* Currently supports text and number field types.
|
||||||
* @param {HTMLDivElement} elDiv
|
* @param {HTMLDivElement} elDiv
|
||||||
*/
|
*/
|
||||||
show_settings_chatrequestoptions(elDiv) {
|
show_settings_apirequestoptions(elDiv) {
|
||||||
let typeDict = {
|
let typeDict = {
|
||||||
"string": "text",
|
"string": "text",
|
||||||
"number": "number",
|
"number": "number",
|
||||||
};
|
};
|
||||||
let fs = document.createElement("fieldset");
|
let fs = document.createElement("fieldset");
|
||||||
let legend = document.createElement("legend");
|
let legend = document.createElement("legend");
|
||||||
legend.innerText = "ChatRequestOptions";
|
legend.innerText = "ApiRequestOptions";
|
||||||
fs.appendChild(legend);
|
fs.appendChild(legend);
|
||||||
elDiv.appendChild(fs);
|
elDiv.appendChild(fs);
|
||||||
for(const k in this.chatRequestOptions) {
|
for(const k in this.apiRequestOptions) {
|
||||||
let val = this.chatRequestOptions[k];
|
let val = this.apiRequestOptions[k];
|
||||||
let type = typeof(val);
|
let type = typeof(val);
|
||||||
if (!((type == "string") || (type == "number"))) {
|
if (((type == "string") || (type == "number"))) {
|
||||||
continue;
|
let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.apiRequestOptions[k], (val)=>{
|
||||||
}
|
|
||||||
let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.chatRequestOptions[k], (val)=>{
|
|
||||||
if (type == "number") {
|
if (type == "number") {
|
||||||
val = Number(val);
|
val = Number(val);
|
||||||
}
|
}
|
||||||
this.chatRequestOptions[k] = val;
|
this.apiRequestOptions[k] = val;
|
||||||
});
|
});
|
||||||
fs.appendChild(inp.div);
|
fs.appendChild(inp.div);
|
||||||
|
} else if (type == "boolean") {
|
||||||
|
let bbtn = ui.el_creatediv_boolbutton(`Set{k}`, k, {true: "true", false: "false"}, val, (userVal)=>{
|
||||||
|
this.apiRequestOptions[k] = userVal;
|
||||||
|
});
|
||||||
|
fs.appendChild(bbtn.div);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -870,6 +875,23 @@ class Me {
|
||||||
});
|
});
|
||||||
elDiv.appendChild(bb.div);
|
elDiv.appendChild(bb.div);
|
||||||
|
|
||||||
|
bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
|
||||||
|
this.bTrimGarbage = val;
|
||||||
|
});
|
||||||
|
elDiv.appendChild(bb.div);
|
||||||
|
|
||||||
|
this.show_settings_apirequestoptions(elDiv);
|
||||||
|
|
||||||
|
let sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
|
||||||
|
this.apiEP = ApiEP.Type[val];
|
||||||
|
});
|
||||||
|
elDiv.appendChild(sel.div);
|
||||||
|
|
||||||
|
sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
|
||||||
|
this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
|
||||||
|
});
|
||||||
|
elDiv.appendChild(sel.div);
|
||||||
|
|
||||||
bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{
|
bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{
|
||||||
this.bCompletionFreshChatAlways = val;
|
this.bCompletionFreshChatAlways = val;
|
||||||
});
|
});
|
||||||
|
@ -880,23 +902,6 @@ class Me {
|
||||||
});
|
});
|
||||||
elDiv.appendChild(bb.div);
|
elDiv.appendChild(bb.div);
|
||||||
|
|
||||||
bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
|
|
||||||
this.bTrimGarbage = val;
|
|
||||||
});
|
|
||||||
elDiv.appendChild(bb.div);
|
|
||||||
|
|
||||||
let sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
|
|
||||||
this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
|
|
||||||
});
|
|
||||||
elDiv.appendChild(sel.div);
|
|
||||||
|
|
||||||
sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
|
|
||||||
this.apiEP = ApiEP.Type[val];
|
|
||||||
});
|
|
||||||
elDiv.appendChild(sel.div);
|
|
||||||
|
|
||||||
this.show_settings_chatrequestoptions(elDiv);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
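For reference, with the defaults above and streaming turned on, request_jsonstr_extend() assembles a request body along these lines (illustrative only; the prompt or messages part is added by the chat logic elsewhere in this file, and the commented-out penalty fields are not sent):

    {
      "model": "gpt-3.5-turbo",
      "temperature": 0.7,
      "max_tokens": 1024,
      "n_predict": 1024,
      "cache_prompt": false,
      "stream": true
    }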
BIN  examples/server/public_simplechat/simplechat_screens.webp (new binary file, 21 KiB; binary file not shown)
@@ -2606,16 +2606,8 @@ int main(int argc, char ** argv) {

     // print sample chat example to make it clear which template is used
     {
-        json chat;
-        chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}});
-        chat.push_back({{"role", "user"}, {"content", "Hello"}});
-        chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
-        chat.push_back({{"role", "user"}, {"content", "How are you?"}});
-
-        const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat);
-
         LOG_INFO("chat template", {
-            {"chat_example", chat_example},
+            {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
             {"built_in", params.chat_template.empty()},
         });
     }
@@ -118,36 +118,17 @@ static inline void server_log(const char * level, const char * function, int lin

 // Format given chat. If tmpl is empty, we take the template from model metadata
 inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
-    size_t alloc_size = 0;
-    // vector holding all allocated string to be passed to llama_chat_apply_template
-    std::vector<std::string> str(messages.size() * 2);
-    std::vector<llama_chat_message> chat(messages.size());
+    std::vector<llama_chat_msg> chat;

     for (size_t i = 0; i < messages.size(); ++i) {
         const auto & curr_msg = messages[i];
-        str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
-        str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
-        alloc_size += str[i*2 + 1].length();
-        chat[i].role = str[i*2 + 0].c_str();
-        chat[i].content = str[i*2 + 1].c_str();
+        std::string role = json_value(curr_msg, "role", std::string(""));
+        std::string content = json_value(curr_msg, "content", std::string(""));
+        chat.push_back({role, content});
     }

-    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
-    std::vector<char> buf(alloc_size * 2);
-
-    // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
-
-    // if it turns out that our buffer is too small, we resize it
-    if ((size_t) res > buf.size()) {
-        buf.resize(res);
-        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
-    }
-
-    const std::string formatted_chat(buf.data(), res);
+    auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);

     LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});

     return formatted_chat;
 }
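With this simplification, format_chat() only converts the incoming JSON messages into llama_chat_msg entries and lets the std::vector overload of llama_chat_apply_template() handle templating and buffer sizing. A call, roughly as the /chat/completions handler would make it, looks like the fragment below (sketch of the call shape only; it presupposes the server's existing includes, its json alias, and a loaded llama_model pointer named model):

    std::vector<json> messages = {
        {{"role", "system"}, {"content", "You are a helpful assistant"}},
        {{"role", "user"},   {"content", "Hello"}},
    };
    // an empty template string means: use the chat template stored in the model metadata
    const std::string prompt = format_chat(model, /*tmpl=*/"", messages);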
90  ggml-cuda.cu

@@ -152,16 +152,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
     GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);

     int64_t total_vram = 0;
-#if defined(GGML_CUDA_FORCE_MMQ)
+#ifdef GGML_CUDA_FORCE_MMQ
     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
 #else
     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
-#endif
-#if defined(CUDA_USE_TENSOR_CORES)
-    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+#endif // GGML_CUDA_FORCE_MMQ
+#ifdef GGML_CUDA_FORCE_CUBLAS
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
 #else
-    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
-#endif
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
+#endif // GGML_CUDA_FORCE_CUBLAS
     GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;

@@ -1873,9 +1873,17 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);

-    int64_t min_compute_capability = INT_MAX;
+    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+    bool use_mul_mat_q = ggml_is_quantized(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

-    bool any_pascal_with_slow_fp16 = false;
+    bool any_gpus_with_slow_fp16 = false;

     if (split) {
         ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
         auto & tensor_split = buft_ctx->tensor_split;

@@ -1885,55 +1893,18 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
                 continue;
             }

-            if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
-                min_compute_capability = ggml_cuda_info().devices[id].cc;
-            }
-            if (ggml_cuda_info().devices[id].cc == 610) {
-                any_pascal_with_slow_fp16 = true;
-            }
+            const int cc = ggml_cuda_info().devices[id].cc;
+            use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
+            use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
         }
     } else {
-        min_compute_capability = ggml_cuda_info().devices[ctx.device].cc;
-        any_pascal_with_slow_fp16 = ggml_cuda_info().devices[ctx.device].cc == 610;
+        const int cc = ggml_cuda_info().devices[ctx.device].cc;
+        use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
+        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
     }

-    // check data types and tensor shapes for custom matrix multiplication kernels:
-    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
-
-    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
-
-    bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-
-    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
-
-#ifdef CUDA_USE_TENSOR_CORES
-    use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
-#endif // CUDA_USE_TENSOR_CORES
-
-#else
-
-    // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
-    const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
-
-    // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
-    use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
-    use_mul_mat_q     = use_mul_mat_q     && min_compute_capability >= MIN_CC_DP4A;
-
-#ifdef CUDA_USE_TENSOR_CORES
-    // when tensor cores are available, use them for large batch size
-    // ref: https://github.com/ggerganov/llama.cpp/pull/3776
-    use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
-#endif // CUDA_USE_TENSOR_CORES
-
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-
     // if mmvq is available it's a better choice than dmmv:
 #ifndef GGML_CUDA_FORCE_DMMV
     use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;

@@ -1947,14 +1918,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

-    if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        // KQ single-batch
+    if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+        // FP32 precision KQ single-batch for batch size 1 without FlashAttention
         ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
-    } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
-        // KQV single-batch
+    } else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+        // FP32 precision KQV single-batch for batch size 1 without FlashAttention
         ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
-        // KQ + KQV multi-batch
+    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
+        && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        // KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
@@ -146,23 +146,6 @@
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 #define CC_RDNA3 (CC_OFFSET_AMD + 1100)

-// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
-// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
-// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
-// - 7B quantum model: +100-200 MB
-// - 13B quantum model: +200-400 MB
-//
-//#define GGML_CUDA_FORCE_MMQ
-
-// TODO: improve this to be correct for more hardware
-//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-#if !defined(GGML_CUDA_FORCE_MMQ)
-#define CUDA_USE_TENSOR_CORES
-#endif
-
-#define MMVQ_MAX_BATCH_SIZE  8 // max batch size to use MMVQ kernels
-#define MMQ_MAX_BATCH_SIZE  64 // max batch size to use MMQ kernels when tensor cores are available
-
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

 #if defined(_MSC_VER)

@@ -343,15 +326,15 @@ static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int
 #define INT8_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING

-static bool fast_fp16_available(const int cc) {
+static constexpr bool fast_fp16_available(const int cc) {
     return cc >= CC_PASCAL && cc != 610;
 }

-static bool fp16_mma_available(const int cc) {
+static constexpr bool fp16_mma_available(const int cc) {
     return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
 }

-static bool int8_mma_available(const int cc) {
+static constexpr bool int8_mma_available(const int cc) {
     return cc < CC_OFFSET_AMD && cc >= CC_TURING;
 }

@@ -643,19 +626,6 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
     static constexpr int qi = QI3_S;
 };

-static int get_mmq_x_max_host(const int cc) {
-#ifdef CUDA_USE_TENSOR_CORES
-    return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
-#else
-    return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
-#endif // CUDA_USE_TENSOR_CORES
-}
-
-// Round rows to this value for --split-mode row:
-static int get_mmq_y_host(const int cc) {
-    return cc >= CC_VOLTA ? 128 : 64;
-}
-
 //////////////////////

 struct ggml_cuda_device_info {
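A side effect of marking these helpers constexpr is that the compute-capability checks can also be evaluated at compile time. A minimal illustration, assuming the usual constants defined earlier in this header (CC_PASCAL = 600, CC_VOLTA = 700, CC_TURING = 750):

    // Compile-time checks become possible once the helpers are constexpr.
    static_assert( fast_fp16_available(800), "cc 8.0 (Ampere) is expected to have fast FP16");
    static_assert(!fast_fp16_available(610), "cc 6.1 is the documented slow-FP16 exception");
    static_assert( int8_mma_available(750),  "cc 7.5 (Turing) is expected to provide int8 MMA");
    static_assert(!fp16_mma_available(610),  "cc 6.1 has no FP16 tensor cores");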
@@ -20,6 +20,20 @@ struct mma_int_A_I16K4 {
         GGML_CUDA_ASSUME(ret < K);
         return ret;
     }
+
+    __device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
+#if defined(INT8_MMA_AVAILABLE)
+        const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
+        asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
+            : "+r"(x[0]), "+r"(x[1])
+            : "l"(xs));
+#else
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            x[l] = xs0[get_i(l)*stride + get_k(l)];
+        }
+#endif // defined(INT8_MMA_AVAILABLE)
+    }
 };

 struct mma_int_A_I16K8 {

@@ -42,6 +56,20 @@ struct mma_int_A_I16K8 {
         GGML_CUDA_ASSUME(ret < K);
         return ret;
     }
+
+    __device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
+#if defined(INT8_MMA_AVAILABLE)
+        const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
+        asm("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
+            : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
+            : "l"(xs));
+#else
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            x[l] = xs0[get_i(l)*stride + get_k(l)];
+        }
+#endif // defined(INT8_MMA_AVAILABLE)
+    }
 };

 struct mma_int_B_J8K4 {

@@ -64,6 +92,20 @@ struct mma_int_B_J8K4 {
         GGML_CUDA_ASSUME(ret < K);
         return ret;
     }
+
+    __device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
+#if defined(INT8_MMA_AVAILABLE) && false // Loading as 4 byte values is faster
+        const int * xs = xs0 + (threadIdx.x%J)*stride;
+        asm("ldmatrix.sync.aligned.m8n8.x1.b16 {%0}, [%1];"
+            : "+r"(x[0])
+            : "l"(xs));
+#else
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            x[l] = xs0[get_j(l)*stride + get_k(l)];
+        }
+#endif // defined(INT8_MMA_AVAILABLE)
+    }
 };

 struct mma_int_B_J8K8 {

@@ -86,6 +128,20 @@ struct mma_int_B_J8K8 {
         GGML_CUDA_ASSUME(ret < K);
         return ret;
     }
+
+    __device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
+#if defined(INT8_MMA_AVAILABLE) && false // Loading as 4 byte values is faster
+        const int * xs = xs0 + (threadIdx.x%J)*stride + ((threadIdx.x/J)*(K/2)) % K;
+        asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
+            : "+r"(x[0]), "+r"(x[1])
+            : "l"(xs));
+#else
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            x[l] = xs0[get_j(l)*stride + get_k(l)];
+        }
+#endif // defined(INT8_MMA_AVAILABLE)
+    }
 };

 struct mma_int_C_I16J8 {
@@ -69,7 +69,13 @@ void ggml_cuda_op_mul_mat_q(
     GGML_UNUSED(src1_ddf_i);
 }

-bool ggml_cuda_supports_mmq(enum ggml_type type) {
+bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
+#ifdef GGML_CUDA_FORCE_CUBLAS
+    return false;
+#endif // GGML_CUDA_FORCE_CUBLAS
+
+    bool mmq_supported;
+
     switch (type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:

@@ -81,8 +87,32 @@ bool ggml_cuda_supports_mmq(enum ggml_type type) {
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
-            return true;
+            mmq_supported = true;
+            break;
         default:
+            mmq_supported = false;
+            break;
+    }
+
+    if (!mmq_supported) {
         return false;
     }
+
+    if (int8_mma_available(cc)) {
+        return true;
+    }
+
+    if (cc < MIN_CC_DP4A) {
+        return false;
+    }
+
+#ifdef GGML_CUDA_FORCE_MMQ
+    return true;
+#endif //GGML_CUDA_FORCE_MMQ
+
+    if (cc < CC_OFFSET_AMD) {
+        return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    }
+
+    return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
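Read as a decision procedure, the new helper first rules out unsupported types and forced-cuBLAS builds, always prefers MMQ where int8 tensor-core MMA is available, and otherwise falls back to a DP4A path gated by batch size. An illustrative fragment, assuming a build without GGML_CUDA_FORCE_MMQ or GGML_CUDA_FORCE_CUBLAS and the usual constants CC_VOLTA = 700, CC_TURING = 750, MIN_CC_DP4A = 610 (MMQ_DP4A_MAX_BATCH_SIZE is defined in the suppressed mmq.cuh, so its value is not shown here):

    bool turing  = ggml_cuda_should_use_mmq(GGML_TYPE_Q4_0, 750, 512); // int8 MMA available -> true for any batch
    bool pascal  = ggml_cuda_should_use_mmq(GGML_TYPE_Q4_0, 610,   1); // DP4A present, pre-Volta -> true
    bool volta   = ggml_cuda_should_use_mmq(GGML_TYPE_Q4_0, 700, 512); // true only while 512 < MMQ_DP4A_MAX_BATCH_SIZE
    bool maxwell = ggml_cuda_should_use_mmq(GGML_TYPE_Q4_0, 520,   1); // no DP4A instruction -> false
    bool fp16    = ggml_cuda_should_use_mmq(GGML_TYPE_F16,  800,   1); // F16 is not an MMQ type -> false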
1289  ggml-cuda/mmq.cuh (file diff suppressed because it is too large)
@@ -1,5 +1,7 @@
 #include "common.cuh"

+#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
+
 void ggml_cuda_op_mul_mat_vec_q(
     ggml_backend_cuda_context & ctx,
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
@@ -4620,7 +4620,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+    } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // KQ + KQV multi-batch
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
39661  ggml-vulkan-shaders.hpp (file diff suppressed because it is too large)
2053   ggml-vulkan.cpp (file diff suppressed because it is too large)
35     ggml.h
@@ -591,11 +591,7 @@ extern "C" {
         struct ggml_tensor * grad;
         struct ggml_tensor * src[GGML_MAX_SRC];

-        // performance
-        int     perf_runs;
-        int64_t perf_cycles;
-        int64_t perf_time_us;
-
+        // source tensor and offset for views
         struct ggml_tensor * view_src;
         size_t view_offs;

@@ -605,7 +601,7 @@ extern "C" {

         void * extra; // extra things e.g. for ggml-cuda.cu

-        char padding[8];
+        // char padding[4];
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

@@ -652,11 +648,6 @@ extern "C" {
         struct ggml_hash_set visited_hash_table;

         enum ggml_cgraph_eval_order order;
-
-        // performance
-        int     perf_runs;
-        int64_t perf_cycles;
-        int64_t perf_time_us;
     };

     // scratch buffer

@@ -673,28 +664,6 @@ extern "C" {
         bool no_alloc; // don't allocate memory for the tensor data
     };

-
-    // compute types
-
-    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
-    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
-    enum ggml_task_type {
-        GGML_TASK_TYPE_INIT = 0,
-        GGML_TASK_TYPE_COMPUTE,
-        GGML_TASK_TYPE_FINALIZE,
-    };
-
-    struct ggml_compute_params {
-        enum ggml_task_type type;
-
-        // ith = thread index, nth = number of threads
-        int ith, nth;
-
-        // work buffer for all threads
-        size_t wsize;
-        void * wdata;
-    };
-
     // numa strategies
     enum ggml_numa_strategy {
         GGML_NUMA_STRATEGY_DISABLED = 0,
@@ -49,6 +49,7 @@ class Keys:
         EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
         POOLING_TYPE = "{arch}.pooling_type"
         LOGIT_SCALE = "{arch}.logit_scale"
+        DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"

     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"

@@ -62,6 +63,7 @@ class Keys:
         CAUSAL = "{arch}.attention.causal"
         Q_LORA_RANK = "{arch}.attention.q_lora_rank"
         KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
+        REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"

     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"

@@ -73,6 +75,11 @@ class Keys:
         SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
         SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"

+    class Split:
+        LLM_KV_SPLIT_NO = "split.no"
+        LLM_KV_SPLIT_COUNT = "split.count"
+        LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"
+
     class SSM:
         CONV_KERNEL = "{arch}.ssm.conv_kernel"
         INNER_SIZE = "{arch}.ssm.inner_size"

@@ -97,6 +104,8 @@ class Keys:
         ADD_BOS = "tokenizer.ggml.add_bos_token"
         ADD_EOS = "tokenizer.ggml.add_eos_token"
         ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
+        REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
+        PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
         HF_JSON = "tokenizer.huggingface.json"
         RWKV = "tokenizer.rwkv.world"
         CHAT_TEMPLATE = "tokenizer.chat_template"

@@ -149,6 +158,8 @@ class MODEL_ARCH(IntEnum):
     OLMO = auto()
     ARCTIC = auto()
     DEEPSEEK2 = auto()
+    BITNET = auto()
+    T5 = auto()


 class MODEL_TENSOR(IntEnum):

@@ -200,6 +211,36 @@ class MODEL_TENSOR(IntEnum):
     ATTN_KV_B = auto()
     ATTN_Q_A_NORM = auto()
     ATTN_KV_A_NORM = auto()
+    FFN_SUB_NORM = auto()
+    ATTN_SUB_NORM = auto()
+    DEC_ATTN_NORM = auto()
+    DEC_ATTN_Q = auto()
+    DEC_ATTN_K = auto()
+    DEC_ATTN_V = auto()
+    DEC_ATTN_OUT = auto()
+    DEC_ATTN_REL_B = auto()
+    DEC_CROSS_ATTN_NORM = auto()
+    DEC_CROSS_ATTN_Q = auto()
+    DEC_CROSS_ATTN_K = auto()
+    DEC_CROSS_ATTN_V = auto()
+    DEC_CROSS_ATTN_OUT = auto()
+    DEC_CROSS_ATTN_REL_B = auto()
+    DEC_FFN_NORM = auto()
+    DEC_FFN_GATE = auto()
+    DEC_FFN_DOWN = auto()
+    DEC_FFN_UP = auto()
+    DEC_OUTPUT_NORM = auto()
+    ENC_ATTN_NORM = auto()
+    ENC_ATTN_Q = auto()
+    ENC_ATTN_K = auto()
+    ENC_ATTN_V = auto()
+    ENC_ATTN_OUT = auto()
+    ENC_ATTN_REL_B = auto()
+    ENC_FFN_NORM = auto()
+    ENC_FFN_GATE = auto()
+    ENC_FFN_DOWN = auto()
+    ENC_FFN_UP = auto()
+    ENC_OUTPUT_NORM = auto()


 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {

@@ -237,6 +278,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.OLMO: "olmo",
     MODEL_ARCH.ARCTIC: "arctic",
     MODEL_ARCH.DEEPSEEK2: "deepseek2",
+    MODEL_ARCH.BITNET: "bitnet",
+    MODEL_ARCH.T5: "t5",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {

@@ -288,6 +331,36 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
     MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
     MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
+    MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
+    MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
+    MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
+    MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q",
+    MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k",
+    MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v",
+    MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o",
+    MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b",
+    MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm",
+    MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q",
+    MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k",
+    MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v",
+    MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o",
+    MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
+    MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm",
+    MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate",
+    MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down",
+    MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up",
+    MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm",
+    MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm",
+    MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q",
+    MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k",
+    MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v",
+    MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o",
+    MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b",
+    MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm",
+    MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate",
+    MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
+    MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
+    MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
 }

 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {

@@ -808,6 +881,53 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
+    MODEL_ARCH.BITNET: [
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
|
||||||
|
MODEL_TENSOR.ATTN_SUB_NORM,
|
||||||
|
MODEL_TENSOR.FFN_SUB_NORM,
|
||||||
|
],
|
||||||
|
MODEL_ARCH.T5: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.DEC_ATTN_NORM,
|
||||||
|
MODEL_TENSOR.DEC_ATTN_Q,
|
||||||
|
MODEL_TENSOR.DEC_ATTN_K,
|
||||||
|
MODEL_TENSOR.DEC_ATTN_V,
|
||||||
|
MODEL_TENSOR.DEC_ATTN_OUT,
|
||||||
|
MODEL_TENSOR.DEC_ATTN_REL_B,
|
||||||
|
MODEL_TENSOR.DEC_CROSS_ATTN_NORM,
|
||||||
|
MODEL_TENSOR.DEC_CROSS_ATTN_Q,
|
||||||
|
MODEL_TENSOR.DEC_CROSS_ATTN_K,
|
||||||
|
MODEL_TENSOR.DEC_CROSS_ATTN_V,
|
||||||
|
MODEL_TENSOR.DEC_CROSS_ATTN_OUT,
|
||||||
|
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B,
|
||||||
|
MODEL_TENSOR.DEC_FFN_NORM,
|
||||||
|
MODEL_TENSOR.DEC_FFN_GATE,
|
||||||
|
MODEL_TENSOR.DEC_FFN_DOWN,
|
||||||
|
MODEL_TENSOR.DEC_FFN_UP,
|
||||||
|
MODEL_TENSOR.DEC_OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.ENC_ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ENC_ATTN_Q,
|
||||||
|
MODEL_TENSOR.ENC_ATTN_K,
|
||||||
|
MODEL_TENSOR.ENC_ATTN_V,
|
||||||
|
MODEL_TENSOR.ENC_ATTN_OUT,
|
||||||
|
MODEL_TENSOR.ENC_ATTN_REL_B,
|
||||||
|
MODEL_TENSOR.ENC_FFN_NORM,
|
||||||
|
MODEL_TENSOR.ENC_FFN_GATE,
|
||||||
|
MODEL_TENSOR.ENC_FFN_DOWN,
|
||||||
|
MODEL_TENSOR.ENC_FFN_UP,
|
||||||
|
MODEL_TENSOR.ENC_OUTPUT_NORM,
|
||||||
|
],
|
||||||
# TODO
|
# TODO
|
||||||
}
|
}
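
For orientation, a minimal Python sketch (not part of this diff) of what the new metadata keys above expand to once formatted; the import path is the gguf-py package's own and the "t5" arch value is an arbitrary example:

    from gguf.constants import Keys

    # split.* keys are plain strings used as-is in per-shard metadata
    print(Keys.Split.LLM_KV_SPLIT_NO)             # "split.no"
    print(Keys.Split.LLM_KV_SPLIT_COUNT)          # "split.count"
    print(Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT)  # "split.tensors.count"

    # per-architecture keys carry an {arch} placeholder
    print(Keys.Attention.REL_BUCKETS_COUNT.format(arch="t5"))
    # -> "t5.attention.relative_buckets_count"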
@@ -69,6 +69,7 @@ class GGUFReader:
     # I - same as host, S - swapped
     byte_order: Literal['I'] | Literal['S'] = 'I'
     alignment: int = GGUF_DEFAULT_ALIGNMENT
+    data_offset: int

     # Note: Internal helper, API may change.
     gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = {

@@ -88,9 +89,13 @@ class GGUFReader:
     def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'):
         self.data = np.memmap(path, mode = mode)
         offs = 0
+
+        # Check for GGUF magic
         if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
             raise ValueError('GGUF magic invalid')
         offs += 4
+
+        # Check GGUF version
         temp_version = self._get(offs, np.uint32)
         if temp_version[0] & 65535 == 0:
             # If we get 0 here that means it's (probably) a GGUF file created for

@@ -103,12 +108,16 @@ class GGUFReader:
         self.fields: OrderedDict[str, ReaderField] = OrderedDict()
         self.tensors: list[ReaderTensor] = []
         offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32]))
+
+        # Check tensor count and kv count
         temp_counts = self._get(offs, np.uint64, 2)
         offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64]))
         offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64]))
         tensor_count, kv_count = temp_counts
         offs = self._build_fields(offs, kv_count)
-        offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
+
+        # Build Tensor Info Fields
+        offs, tensors_fields = self._build_tensor_info(offs, tensor_count)
         new_align = self.fields.get('general.alignment')
         if new_align is not None:
             if new_align.types != [GGUFValueType.UINT32]:

@@ -117,6 +126,7 @@ class GGUFReader:
         padding = offs % self.alignment
         if padding != 0:
             offs += self.alignment - padding
+        self.data_offset = offs
         self._build_tensors(offs, tensors_fields)

     _DT = TypeVar('_DT', bound = npt.DTypeLike)

@@ -193,18 +203,29 @@ class GGUFReader:
             # We can't deal with this one.
             raise ValueError('Unknown/unhandled field type {gtype}')

-    def _get_tensor(self, orig_offs: int) -> ReaderField:
+    def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
         offs = orig_offs
+
+        # Get Tensor Name
         name_len, name_data = self._get_str(offs)
         offs += int(name_len.nbytes + name_data.nbytes)
+
+        # Get Tensor Dimensions Count
         n_dims = self._get(offs, np.uint32)
         offs += int(n_dims.nbytes)
+
+        # Get Tensor Dimension Array
         dims = self._get(offs, np.uint64, n_dims[0])
         offs += int(dims.nbytes)
+
+        # Get Tensor Encoding Scheme Type
         raw_dtype = self._get(offs, np.uint32)
         offs += int(raw_dtype.nbytes)
+
+        # Get Tensor Offset
         offset_tensor = self._get(offs, np.uint64)
         offs += int(offset_tensor.nbytes)
+
         return ReaderField(
             orig_offs,
             str(bytes(name_data), encoding = 'utf-8'),

@@ -233,10 +254,10 @@ class GGUFReader:
             offs += field_size
         return offs

-    def _build_tensors_fields(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
+    def _build_tensor_info(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
         tensor_fields = []
         for _ in range(count):
-            field = self._get_tensor(offs)
+            field = self._get_tensor_info_field(offs)
             offs += sum(int(part.nbytes) for part in field.parts)
             tensor_fields.append(field)
         return offs, tensor_fields
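
A hedged sketch of reading back the new GGUFReader.data_offset attribute alongside the existing alignment; the file name below is a placeholder and only attributes that appear in this diff are used:

    from gguf.gguf_reader import GGUFReader

    reader = GGUFReader("model.gguf", 'r')   # hypothetical local file
    print(reader.alignment)    # general.alignment if present, else the GGUF default
    print(reader.data_offset)  # byte offset where the aligned tensor data region starts
    for tensor in reader.tensors[:3]:
        print(tensor.name, tensor.n_bytes, tensor.data_offset)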
@@ -7,6 +7,7 @@ import struct
 import tempfile
 from dataclasses import dataclass
 from enum import Enum, auto
+from pathlib import Path
 from io import BufferedWriter
 from typing import IO, Any, Sequence, Mapping
 from string import ascii_letters, digits

@@ -31,6 +32,9 @@ from .quants import quant_shape_from_byte_shape
 logger = logging.getLogger(__name__)


+SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"
+
+
 @dataclass
 class TensorInfo:
     shape: Sequence[int]

@@ -55,11 +59,11 @@ class WriterState(Enum):


 class GGUFWriter:
-    fout: BufferedWriter | None
-    path: os.PathLike[str] | str | None
+    fout: list[BufferedWriter] | None
+    path: Path | None
     temp_file: tempfile.SpooledTemporaryFile[bytes] | None
-    tensors: dict[str, TensorInfo]
-    kv_data: dict[str, GGUFValue]
+    tensors: list[dict[str, TensorInfo]]
+    kv_data: list[dict[str, GGUFValue]]
     state: WriterState
     _simple_value_packing = {
         GGUFValueType.UINT8: "B",

@@ -76,26 +80,38 @@ class GGUFWriter:
     }

     def __init__(
-        self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False,
-        endianess: GGUFEndian = GGUFEndian.LITTLE,
+        self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE,
+        split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False
     ):
         self.fout = None
-        self.path = path
+        self.path = Path(path) if path else None
         self.arch = arch
         self.endianess = endianess
         self.data_alignment = GGUF_DEFAULT_ALIGNMENT
         self.use_temp_file = use_temp_file
         self.temp_file = None
-        self.tensors = dict()
-        self.kv_data = dict()
+        self.tensors = [{}]
+        self.kv_data = [{}]
+        self.split_max_tensors = split_max_tensors
+        self.split_max_size = split_max_size
+        self.dry_run = dry_run
+        self.small_first_shard = small_first_shard
         logger.info("gguf: This GGUF file is for {0} Endian only".format(
             "Big" if self.endianess == GGUFEndian.BIG else "Little",
         ))
         self.state = WriterState.NO_FILE

+        if self.small_first_shard:
+            self.tensors.append({})
+
         self.add_architecture()

-    def open_output_file(self, path: os.PathLike[str] | str | None = None) -> None:
+    def format_shard_names(self, path: Path) -> list[Path]:
+        if len(self.tensors) == 1:
+            return [path]
+        return [path.with_name(SHARD_NAME_FORMAT.format(path.stem, i + 1, len(self.tensors))) for i in range(len(self.tensors))]
+
+    def open_output_file(self, path: Path | None = None) -> None:
         if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path):
             # allow calling this multiple times as long as the path is the same
             return

@@ -106,22 +122,58 @@ class GGUFWriter:
         self.path = path

         if self.path is not None:
-            if self.fout is not None:
-                self.fout.close()
-            self.fout = open(self.path, "wb")
+            filenames = self.print_plan()
+            self.fout = [open(filename, "wb") for filename in filenames]
             self.state = WriterState.EMPTY

-    def write_header_to_file(self, path: os.PathLike[str] | str | None = None) -> None:
+    def print_plan(self) -> list[Path]:
+        logger.info("Writing the following files:")
+        assert self.path is not None
+        filenames = self.format_shard_names(self.path)
+        assert len(filenames) == len(self.tensors)
+        for name, tensors in zip(filenames, self.tensors):
+            logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}")
+
+        if self.dry_run:
+            logger.info("Dry run, not writing files")
+            exit()
+
+        return filenames
+
+    def add_shard_kv_data(self) -> None:
+        if len(self.tensors) == 1:
+            return
+
+        total_tensors = sum(len(t) for t in self.tensors)
+        assert self.fout is not None
+        total_splits = len(self.fout)
+        self.kv_data.extend({} for _ in range(len(self.kv_data), total_splits))
+        for i, kv_data in enumerate(self.kv_data):
+            kv_data[Keys.Split.LLM_KV_SPLIT_NO] = GGUFValue(i, GGUFValueType.UINT16)
+            kv_data[Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(total_splits, GGUFValueType.UINT16)
+            kv_data[Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(total_tensors, GGUFValueType.INT32)
+
+    def write_header_to_file(self, path: Path | None = None) -> None:
+        if len(self.tensors) == 1 and (self.split_max_tensors != 0 or self.split_max_size != 0):
+            logger.warning("Model fails split requirements, not splitting")
+
         self.open_output_file(path)

         if self.state is not WriterState.EMPTY:
             raise ValueError(f'Expected output file to be empty, got {self.state}')

-        self._write_packed("<I", GGUF_MAGIC, skip_pack_prefix = True)
-        self._write_packed("I", GGUF_VERSION)
-        self._write_packed("Q", len(self.tensors))
-        self._write_packed("Q", len(self.kv_data))
-        self.flush()
+        assert self.fout is not None
+        assert len(self.fout) == len(self.tensors)
+        assert len(self.kv_data) == 1
+
+        self.add_shard_kv_data()
+
+        for fout, tensors, kv_data in zip(self.fout, self.tensors, self.kv_data):
+            fout.write(self._pack("<I", GGUF_MAGIC, skip_pack_prefix = True))
+            fout.write(self._pack("I", GGUF_VERSION))
+            fout.write(self._pack("Q", len(tensors)))
+            fout.write(self._pack("Q", len(kv_data)))
+            fout.flush()
         self.state = WriterState.HEADER

     def write_kv_data_to_file(self) -> None:

@@ -129,13 +181,15 @@ class GGUFWriter:
             raise ValueError(f'Expected output file to contain the header, got {self.state}')
         assert self.fout is not None

-        kv_data = bytearray()
+        for fout, kv_data in zip(self.fout, self.kv_data):
+            kv_bytes = bytearray()

-        for key, val in self.kv_data.items():
-            kv_data += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
-            kv_data += self._pack_val(val.value, val.type, add_vtype=True)
+            for key, val in kv_data.items():
+                kv_bytes += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
+                kv_bytes += self._pack_val(val.value, val.type, add_vtype=True)
+
+            fout.write(kv_bytes)

-        self.fout.write(kv_data)
         self.flush()
         self.state = WriterState.KV_DATA

@@ -144,28 +198,29 @@ class GGUFWriter:
             raise ValueError(f'Expected output file to contain KV data, got {self.state}')
         assert self.fout is not None

-        ti_data = bytearray()
-        offset_tensor = 0
-
-        for name, ti in self.tensors.items():
-            ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False)
-            n_dims = len(ti.shape)
-            ti_data += self._pack("I", n_dims)
-            for i in range(n_dims):
-                ti_data += self._pack("Q", ti.shape[n_dims - 1 - i])
-            ti_data += self._pack("I", ti.dtype)
-            ti_data += self._pack("Q", offset_tensor)
-            offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment)
-
-        self.fout.write(ti_data)
-        self.flush()
+        for fout, tensors in zip(self.fout, self.tensors):
+            ti_data = bytearray()
+            offset_tensor = 0
+
+            for name, ti in tensors.items():
+                ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False)
+                n_dims = len(ti.shape)
+                ti_data += self._pack("I", n_dims)
+                for j in range(n_dims):
+                    ti_data += self._pack("Q", ti.shape[n_dims - 1 - j])
+                ti_data += self._pack("I", ti.dtype)
+                ti_data += self._pack("Q", offset_tensor)
+                offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment)
+
+            fout.write(ti_data)
+            fout.flush()
         self.state = WriterState.TI_DATA

     def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None:
-        if key in self.kv_data:
+        if any(key in kv_data for kv_data in self.kv_data):
             raise ValueError(f'Duplicated key name {key!r}')

-        self.kv_data[key] = GGUFValue(value=val, type=vtype)
+        self.kv_data[0][key] = GGUFValue(value=val, type=vtype)

     def add_uint8(self, key: str, val: int) -> None:
         self.add_key_value(key,val, GGUFValueType.UINT8)

@@ -206,9 +261,6 @@ class GGUFWriter:
         self.add_key_value(key, val, GGUFValueType.STRING)

     def add_array(self, key: str, val: Sequence[Any]) -> None:
-        if not isinstance(val, Sequence):
-            raise ValueError("Value must be a sequence for array type")
-
         self.add_key_value(key, val, GGUFValueType.ARRAY)

     @staticmethod

@@ -222,7 +274,7 @@ class GGUFWriter:
         if self.state is not WriterState.NO_FILE:
             raise ValueError(f'Expected output file to be not yet opened, got {self.state}')

-        if name in self.tensors:
+        if any(name in tensors for tensors in self.tensors):
             raise ValueError(f'Duplicated tensor name {name!r}')

         if raw_dtype is None:

@@ -247,7 +299,18 @@ class GGUFWriter:
         if tensor_dtype == np.uint8:
             tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)

-        self.tensors[name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes)
+        # make sure there is at least one tensor before splitting
+        if len(self.tensors[-1]) > 0:
+            if ( # split when over tensor limit
+                self.split_max_tensors != 0
+                and len(self.tensors[-1]) >= self.split_max_tensors
+            ) or ( # split when over size limit
+                self.split_max_size != 0
+                and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size
+            ):
+                self.tensors.append({})
+
+        self.tensors[-1][name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes)

     def add_tensor(
         self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,

@@ -264,7 +327,7 @@ class GGUFWriter:
         self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype)

         if self.temp_file is None:
-            self.tensors[name].tensor = tensor
+            self.tensors[-1][name].tensor = tensor
             return

         tensor.tofile(self.temp_file)

@@ -282,9 +345,24 @@ class GGUFWriter:

         if self.endianess == GGUFEndian.BIG:
             tensor.byteswap(inplace=True)
-        self.write_padding(self.fout, self.fout.tell())
-        tensor.tofile(self.fout)
-        self.write_padding(self.fout, tensor.nbytes)
+
+        file_id = -1
+        for i, tensors in enumerate(self.tensors):
+            if len(tensors) > 0:
+                file_id = i
+                break
+
+        fout = self.fout[file_id]
+
+        # pop the first tensor info
+        # TODO: cleaner way to get the first key
+        first_tensor_name = [name for name, _ in zip(self.tensors[file_id].keys(), range(1))][0]
+        ti = self.tensors[file_id].pop(first_tensor_name)
+        assert ti.nbytes == tensor.nbytes
+
+        self.write_padding(fout, fout.tell())
+        tensor.tofile(fout)
+        self.write_padding(fout, tensor.nbytes)

         self.state = WriterState.WEIGHTS

@@ -293,31 +371,43 @@ class GGUFWriter:

         assert self.fout is not None

-        self.write_padding(self.fout, self.fout.tell())
+        for fout in self.fout:
+            self.write_padding(fout, fout.tell())

         if self.temp_file is None:
+            shard_bar = None
             bar = None

             if progress:
                 from tqdm import tqdm

-                total_bytes = sum(t.nbytes for t in self.tensors.values())
+                total_bytes = sum(ti.nbytes for t in self.tensors for ti in t.values())

+                if len(self.fout) > 1:
+                    shard_bar = tqdm(desc=f"Shard (0/{len(self.fout)})", total=None, unit="byte", unit_scale=True)
                 bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)

+            for i, (fout, tensors) in enumerate(zip(self.fout, self.tensors)):
+                if shard_bar is not None:
+                    shard_bar.set_description(f"Shard ({i + 1}/{len(self.fout)})")
+                    total = sum(ti.nbytes for ti in tensors.values())
+                    shard_bar.reset(total=(total if total > 0 else None))
+
                 # relying on the fact that Python dicts preserve insertion order (since 3.7)
-            for ti in self.tensors.values():
+                for ti in tensors.values():
                     assert ti.tensor is not None  # can only iterate once over the tensors
                     assert ti.tensor.nbytes == ti.nbytes
-                ti.tensor.tofile(self.fout)
+                    ti.tensor.tofile(fout)
+                    if shard_bar is not None:
+                        shard_bar.update(ti.nbytes)
                     if bar is not None:
                         bar.update(ti.nbytes)
-                self.write_padding(self.fout, ti.nbytes)
+                    self.write_padding(fout, ti.nbytes)
                     ti.tensor = None
         else:
             self.temp_file.seek(0)

-            shutil.copyfileobj(self.temp_file, self.fout)
+            shutil.copyfileobj(self.temp_file, self.fout[0 if not self.small_first_shard else 1])
             self.flush()
             self.temp_file.close()

@@ -325,11 +415,13 @@ class GGUFWriter:

     def flush(self) -> None:
         assert self.fout is not None
-        self.fout.flush()
+        for fout in self.fout:
+            fout.flush()

     def close(self) -> None:
         if self.fout is not None:
-            self.fout.close()
+            for fout in self.fout:
+                fout.close()
             self.fout = None

     def add_architecture(self) -> None:

@@ -400,6 +492,9 @@ class GGUFWriter:
     def add_parallel_residual(self, use: bool) -> None:
         self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

+    def add_decoder_start_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id)
+
     def add_head_count(self, count: int) -> None:
         self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)

@@ -448,6 +543,9 @@ class GGUFWriter:
     def add_kv_lora_rank(self, length: int) -> None:
         self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)

+    def add_relative_attn_buckets_count(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value)
+
     def add_pooling_type(self, value: PoolingType) -> None:
         self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)

@@ -538,6 +636,12 @@ class GGUFWriter:
     def add_add_space_prefix(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)

+    def add_remove_extra_whitespaces(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.REMOVE_EXTRA_WS, value)
+
+    def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
+        self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)
+
     def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
         if not isinstance(value, str):
             template_default = None

@@ -599,6 +703,9 @@ class GGUFWriter:
             kv_data += self._pack("Q", len(encoded_val))
             kv_data += encoded_val
         elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
-            ltype = GGUFValueType.get_type(val[0])
-            if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
-                raise ValueError("All items in a GGUF array should be of the same type")
+            if isinstance(val, bytes):
+                ltype = GGUFValueType.UINT8
+            else:
+                ltype = GGUFValueType.get_type(val[0])
+                if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
+                    raise ValueError("All items in a GGUF array should be of the same type")

@@ -611,6 +718,13 @@ class GGUFWriter:

         return kv_data

-    def _write_packed(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None:
-        assert self.fout is not None
-        self.fout.write(self._pack(fmt, value, skip_pack_prefix))
+    @staticmethod
+    def format_n_bytes_to_str(num: int) -> str:
+        if num == 0:
+            return "negligible - metadata only"
+        fnum = float(num)
+        for unit in ("", "K", "M", "G"):
+            if abs(fnum) < 1000.0:
+                return f"{fnum:3.1f}{unit}"
+            fnum /= 1000.0
+        return f"{fnum:.1f}T - over 1TB, split recommended"
@@ -24,6 +24,7 @@ class TensorNameMap:
             "backbone.embedding",       # mamba
             "backbone.embeddings",      # mamba-hf
             "transformer.in_out_embed", # Grok
+            "shared",                   # t5
         ),

         # Token type embeddings

@@ -413,6 +414,128 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_KV_A_NORM: (
             "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
         ),
+
+        MODEL_TENSOR.ATTN_SUB_NORM: (
+            "model.layers.{bid}.self_attn.inner_attn_ln", # bitnet
+        ),
+
+        MODEL_TENSOR.FFN_SUB_NORM: (
+            "model.layers.{bid}.mlp.ffn_layernorm", # bitnet
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_NORM: (
+            "decoder.block.{bid}.layer.0.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_Q: (
+            "decoder.block.{bid}.layer.0.SelfAttention.q", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_K: (
+            "decoder.block.{bid}.layer.0.SelfAttention.k", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_V: (
+            "decoder.block.{bid}.layer.0.SelfAttention.v", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_OUT: (
+            "decoder.block.{bid}.layer.0.SelfAttention.o", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_REL_B: (
+            "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_NORM: (
+            "decoder.block.{bid}.layer.1.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_Q: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.q", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_K: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.k", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_V: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.v", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_OUT: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.o", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5
+        ),
+
+        MODEL_TENSOR.DEC_FFN_NORM: (
+            "decoder.block.{bid}.layer.2.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.DEC_FFN_GATE: (
+            "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5
+        ),
+
+        MODEL_TENSOR.DEC_FFN_UP: (
+            "decoder.block.{bid}.layer.2.DenseReluDense.wi",   # t5
+            "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5
+        ),
+
+        MODEL_TENSOR.DEC_FFN_DOWN: (
+            "decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5
+        ),
+
+        MODEL_TENSOR.DEC_OUTPUT_NORM: (
+            "decoder.final_layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_NORM: (
+            "encoder.block.{bid}.layer.0.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_Q: (
+            "encoder.block.{bid}.layer.0.SelfAttention.q", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_K: (
+            "encoder.block.{bid}.layer.0.SelfAttention.k", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_V: (
+            "encoder.block.{bid}.layer.0.SelfAttention.v", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_OUT: (
+            "encoder.block.{bid}.layer.0.SelfAttention.o", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_REL_B: (
+            "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
+        ),
+
+        MODEL_TENSOR.ENC_FFN_NORM: (
+            "encoder.block.{bid}.layer.1.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.ENC_FFN_GATE: (
+            "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5
+        ),
+
+        MODEL_TENSOR.ENC_FFN_UP: (
+            "encoder.block.{bid}.layer.1.DenseReluDense.wi",   # t5
+            "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5
+        ),
+
+        MODEL_TENSOR.ENC_FFN_DOWN: (
+            "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
+        ),
+
+        MODEL_TENSOR.ENC_OUTPUT_NORM: (
+            "encoder.final_layer_norm", # t5
+        ),
     }

     # architecture-specific block mappings
@@ -208,7 +208,9 @@ def translate_tensor_name(name):
         'ssm_d':   'State space model skip connection',
         'ssm_dt':  'State space model time step',
         'ssm_out': 'State space model output projection',
-        'blk': 'Block'
+        'blk': 'Block',
+        'enc': 'Encoder',
+        'dec': 'Decoder',
     }

     expanded_words = []

@@ -291,6 +293,10 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
         tensor_group_name = "base"
         if tensor_components[0] == 'blk':
             tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}"
+        elif tensor_components[0] in ['enc', 'dec'] and tensor_components[1] == 'blk':
+            tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}.{tensor_components[2]}"
+        elif tensor_components[0] in ['enc', 'dec']:
+            tensor_group_name = f"{tensor_components[0]}"

         # Check if new Tensor Group
         if tensor_group_name not in tensor_groups:

@@ -313,6 +319,27 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None

     markdown_content += "\n"

+    markdown_content += "### Tensor Data Offset\n"
+    markdown_content += '\n'
+    markdown_content += 'This table contains the offset and data segment relative to start of file\n'
+    markdown_content += '\n'
+
+    tensor_mapping_table: list[dict[str, str | int]] = []
+    for key, tensor in enumerate(reader.tensors):
+        data_offset_pretty = '{0:#16x}'.format(tensor.data_offset)
+        data_size_pretty = '{0:#16x}'.format(tensor.n_bytes)
+        tensor_mapping_table.append({"t_id":key, "layer_name":tensor.name, "data_offset":data_offset_pretty, "data_size":data_size_pretty})
+
+    tensors_mapping_table_header_map = [
+        {'key_name':'t_id',        'header_name':'T_ID',              'align':'right'},
+        {'key_name':'layer_name',  'header_name':'Tensor Layer Name', 'align':'left'},
+        {'key_name':'data_offset', 'header_name':'Data Offset (B)',   'align':'right'},
+        {'key_name':'data_size',   'header_name':'Data Size (B)',     'align':'right'},
+    ]
+
+    markdown_content += markdown_table_with_alignment_support(tensors_mapping_table_header_map, tensor_mapping_table)
+    markdown_content += "\n"
+
     for group in tensor_prefix_order:
         tensors = tensor_groups[group]
         group_elements = sum(tensor.n_elements for tensor in tensors)

@@ -364,6 +391,8 @@ def main() -> None:
     parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
     parser.add_argument("--json",       action="store_true", help="Produce JSON output")
     parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
+    parser.add_argument("--data-offset",    action="store_true", help="Start of data offset")
+    parser.add_argument("--data-alignment", action="store_true", help="Data alignment applied globally to data field")
     parser.add_argument("--markdown",   action="store_true", help="Produce markdown output")
     parser.add_argument("--verbose",    action="store_true", help="increase output verbosity")

@@ -371,7 +400,7 @@ def main() -> None:

     logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

-    if not args.json and not args.markdown:
+    if not args.json and not args.markdown and not args.data_offset and not args.data_alignment:
         logger.info(f'* Loading: {args.model}')

     reader = GGUFReader(args.model, 'r')

@@ -380,6 +409,10 @@ def main() -> None:
         dump_metadata_json(reader, args)
     elif args.markdown:
         dump_markdown_metadata(reader, args)
+    elif args.data_offset:
+        print(reader.data_offset)  # noqa: NP100
+    elif args.data_alignment:
+        print(reader.alignment)  # noqa: NP100
     else:
         dump_metadata(reader, args)
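
A tiny standalone Python sketch of the enc/dec grouping rule added above, using a made-up tensor name so the three branches are easy to trace:

    name = "dec.blk.3.attn_q.weight"   # hypothetical tensor name
    parts = name.split('.')
    if parts[0] == 'blk':
        group = f"{parts[0]}.{parts[1]}"
    elif parts[0] in ['enc', 'dec'] and parts[1] == 'blk':
        group = f"{parts[0]}.{parts[1]}.{parts[2]}"
    elif parts[0] in ['enc', 'dec']:
        group = parts[0]
    else:
        group = "base"
    print(group)  # -> dec.blk.3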
245
llama.cpp
245
llama.cpp
|
@ -225,6 +225,7 @@ enum llm_arch {
|
||||||
LLM_ARCH_OLMO,
|
LLM_ARCH_OLMO,
|
||||||
LLM_ARCH_ARCTIC,
|
LLM_ARCH_ARCTIC,
|
||||||
LLM_ARCH_DEEPSEEK2,
|
LLM_ARCH_DEEPSEEK2,
|
||||||
|
LLM_ARCH_BITNET,
|
||||||
LLM_ARCH_UNKNOWN,
|
LLM_ARCH_UNKNOWN,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -263,6 +264,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_OLMO, "olmo" },
|
{ LLM_ARCH_OLMO, "olmo" },
|
||||||
{ LLM_ARCH_ARCTIC, "arctic" },
|
{ LLM_ARCH_ARCTIC, "arctic" },
|
||||||
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
|
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
|
||||||
|
{ LLM_ARCH_BITNET, "bitnet" },
|
||||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -500,6 +502,8 @@ enum llm_tensor {
|
||||||
LLM_TENSOR_ATTN_KV_B,
|
LLM_TENSOR_ATTN_KV_B,
|
||||||
LLM_TENSOR_ATTN_Q_A_NORM,
|
LLM_TENSOR_ATTN_Q_A_NORM,
|
||||||
LLM_TENSOR_ATTN_KV_A_NORM,
|
LLM_TENSOR_ATTN_KV_A_NORM,
|
||||||
|
LLM_TENSOR_ATTN_SUB_NORM,
|
||||||
|
LLM_TENSOR_FFN_SUB_NORM,
|
||||||
};
|
};
|
||||||
|
|
||||||
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
||||||
|
@ -1113,6 +1117,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_BITNET,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||||
|
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||||
|
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||||
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_SUB_NORM, "blk.%d.attn_sub_norm" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
|
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||||
|
{ LLM_TENSOR_FFN_SUB_NORM, "blk.%d.ffn_sub_norm" },
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
LLM_ARCH_UNKNOWN,
|
LLM_ARCH_UNKNOWN,
|
||||||
{
|
{
|
||||||
|
@ -2118,6 +2140,8 @@ struct llama_layer {
|
||||||
struct ggml_tensor * attn_out_norm_b;
|
struct ggml_tensor * attn_out_norm_b;
|
||||||
struct ggml_tensor * attn_q_a_norm;
|
struct ggml_tensor * attn_q_a_norm;
|
||||||
struct ggml_tensor * attn_kv_a_norm;
|
struct ggml_tensor * attn_kv_a_norm;
|
||||||
|
struct ggml_tensor * attn_sub_norm;
|
||||||
|
struct ggml_tensor * ffn_sub_norm;
|
||||||
|
|
||||||
// attention
|
// attention
|
||||||
struct ggml_tensor * wq;
|
struct ggml_tensor * wq;
|
||||||
|
@ -2185,6 +2209,15 @@ struct llama_layer {
|
||||||
// long rope factors
|
// long rope factors
|
||||||
struct ggml_tensor * rope_long = nullptr;
|
struct ggml_tensor * rope_long = nullptr;
|
||||||
struct ggml_tensor * rope_short = nullptr;
|
struct ggml_tensor * rope_short = nullptr;
|
||||||
|
|
||||||
|
// bitnet scale
|
||||||
|
struct ggml_tensor * wq_scale;
|
||||||
|
struct ggml_tensor * wk_scale;
|
||||||
|
struct ggml_tensor * wv_scale;
|
||||||
|
struct ggml_tensor * wo_scale;
|
||||||
|
struct ggml_tensor * ffn_gate_scale;
|
||||||
|
struct ggml_tensor * ffn_up_scale;
|
||||||
|
struct ggml_tensor * ffn_down_scale;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_kv_cell {
|
struct llama_kv_cell {
|
||||||
|
@ -4710,6 +4743,15 @@ static void llm_load_hparams(
|
||||||
default: model.type = e_model::MODEL_UNKNOWN;
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_BITNET:
|
||||||
|
{
|
||||||
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
|
|
||||||
|
switch (hparams.n_layer) {
|
||||||
|
case 26: model.type = e_model::MODEL_3B; break;
|
||||||
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
|
}
|
||||||
|
} break;
|
||||||
default: (void)0;
|
default: (void)0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6655,6 +6697,44 @@ static bool llm_load_tensors(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_BITNET:
|
||||||
|
{
|
||||||
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||||
|
|
||||||
|
// output
|
||||||
|
{
|
||||||
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < n_layer; ++i) {
|
||||||
|
ggml_context * ctx_layer = ctx_for_layer(i);
|
||||||
|
ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||||
|
|
||||||
|
auto & layer = model.layers[i];
|
||||||
|
|
||||||
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||||
|
layer.attn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd});
|
||||||
|
|
||||||
|
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
||||||
|
layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1});
|
||||||
|
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
||||||
|
layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1});
|
||||||
|
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
||||||
|
layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1});
|
||||||
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||||
|
layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1});
|
||||||
|
|
||||||
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||||
|
layer.ffn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff});
|
||||||
|
|
||||||
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||||
|
layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1});
|
||||||
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
||||||
|
layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1});
|
||||||
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||||
|
layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1});
|
||||||
|
}
|
||||||
|
} break;
|
||||||
default:
|
default:
|
||||||
throw std::runtime_error("unknown architecture");
|
throw std::runtime_error("unknown architecture");
|
||||||
}
|
}
|
||||||
|
@ -7295,7 +7375,10 @@ static struct ggml_tensor * llm_build_kqv(
|
||||||
|
|
||||||
ggml_build_forward_expand(graph, cur);
|
ggml_build_forward_expand(graph, cur);
|
||||||
|
|
||||||
|
if (wo) {
|
||||||
cur = ggml_mul_mat(ctx, wo, cur);
|
cur = ggml_mul_mat(ctx, wo, cur);
|
||||||
|
}
|
||||||
|
|
||||||
if (wo_b) {
|
if (wo_b) {
|
||||||
cb(cur, "kqv_wo", il);
|
cb(cur, "kqv_wo", il);
|
||||||
}
|
}
|
||||||
|
@ -11797,6 +11880,153 @@ struct llm_build_context {
|
||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph * build_bitnet() {
|
||||||
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||||
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
|
|
||||||
|
struct ggml_tensor * cur;
|
||||||
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
|
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||||
|
|
||||||
|
// inp_pos - contains the positions
|
||||||
|
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||||
|
|
||||||
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||||
|
|
||||||
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
struct ggml_tensor * inpSA = inpL;
|
||||||
|
|
||||||
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||||
|
model.layers[il].attn_norm, NULL,
|
||||||
|
LLM_NORM_RMS, cb, il);
|
||||||
|
cb(cur, "attn_norm", il);
|
||||||
|
|
||||||
|
// self-attention
|
||||||
|
{
|
||||||
|
// compute Q and K and RoPE them
|
||||||
|
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||||
|
Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
if (model.layers[il].bq) {
|
||||||
|
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
// B1.K
|
||||||
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||||
|
Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
if (model.layers[il].bk) {
|
||||||
|
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
// B1.V
|
||||||
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||||
|
Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
|
||||||
|
cb(Vcur, "Vcur", il);
|
||||||
|
                if (model.layers[il].bv) {
                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                    cb(Vcur, "Vcur", il);
                }

                Qcur = ggml_rope_ext(
                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                );
                cb(Qcur, "Qcur", il);

                Kcur = ggml_rope_ext(
                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                );
                cb(Kcur, "Kcur", il);

                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
                        nullptr, nullptr,
                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);

                cur = llm_build_norm(ctx0, cur, hparams,
                        model.layers[il].attn_sub_norm, NULL,
                        LLM_NORM_RMS, cb, il);
                cb(cur, "attn_sub_norm", il);

                cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
                cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
                if (model.layers[il].bo) {
                    cur = ggml_add(ctx0, cur, model.layers[il].bo);
                }
                cb(cur, "attn_o_out", il);
            }

            if (il == n_layer - 1) {
                // skip computing output for unused tokens
                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

            // feed-forward forward
            if (model.layers[il].ffn_gate_inp == nullptr) {
                cur = llm_build_norm(ctx0, ffn_inp, hparams,
                        model.layers[il].ffn_norm, NULL,
                        LLM_NORM_RMS, cb, il);
                cb(cur, "ffn_norm", il);

                struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
                tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_up_scale);
                cb(tmp, "ffn_up", il);

                cur = ggml_mul_mat(ctx0, model.layers[il].ffn_gate, cur);
                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_gate_scale);
                cb(cur, "ffn_gate", il);

                cur = ggml_silu(ctx0, cur);
                cb(cur, "ffn_silu", il);

                cur = ggml_mul(ctx0, cur, tmp);
                cb(cur, "ffn_gate_par", il);

                cur = llm_build_norm(ctx0, cur, hparams,
                        model.layers[il].ffn_sub_norm, NULL,
                        LLM_NORM_RMS, cb, il);
                cb(cur, "ffn_sub_norm", il);

                cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
                cb(cur, "ffn_down", il);
            }
            cur = ggml_add(ctx0, cur, ffn_inp);
            cb(cur, "l_out", il);

            // input for next layer
            inpL = cur;
        }

        cur = inpL;

        cur = llm_build_norm(ctx0, cur, hparams,
                model.output_norm, NULL,
                LLM_NORM_RMS, cb, -1);
        cb(cur, "result_norm", -1);

        // lm_head
        cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
        cb(cur, "result_output", -1);

        ggml_build_forward_expand(gf, cur);

        return gf;
    }
};
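A pattern worth noting in the BitNet graph above: every projection is a plain ggml_mul_mat followed by an element-wise ggml_mul with a learned per-tensor scale (wo_scale, ffn_up_scale, ffn_gate_scale, ffn_down_scale), with an extra RMS sub-norm before the output and down projections. A minimal sketch of that recurring "project then rescale" step, using a hypothetical helper name (bitnet_linear is illustrative, not part of llama.cpp):

    // Illustrative helper only: the same two ggml calls that the graph above
    // repeats for wo, ffn_up, ffn_gate and ffn_down.
    static struct ggml_tensor * bitnet_linear(
            struct ggml_context * ctx,
            struct ggml_tensor  * w,        // projection weight
            struct ggml_tensor  * w_scale,  // learned per-tensor scale
            struct ggml_tensor  * x) {      // input activations
        struct ggml_tensor * y = ggml_mul_mat(ctx, w, x);  // project
        y = ggml_mul(ctx, y, w_scale);                      // rescale the result
        return y;
    }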
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {

@ -12020,6 +12250,10 @@ static struct ggml_cgraph * llama_build_graph(
            {
                result = llm.build_deepseek2();
            } break;
        case LLM_ARCH_BITNET:
            {
                result = llm.build_bitnet();
            } break;
        default:
            GGML_ASSERT(false);
    }
@ -12639,12 +12873,6 @@ static int llama_decode_internal(
            }
        }

#ifdef GGML_PERF
        // print timing information per ggml operation (for debugging purposes)
        // requires GGML_PERF to be defined
        ggml_graph_print(gf);
#endif

        // plot the computation graph in dot format (for debugging purposes)
        //if (n_past%100 == 0) {
        //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
@ -16848,6 +17076,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
        case LLM_ARCH_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_QWEN2MOE:
@ -18677,10 +18906,10 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<|im_start|>assistant\n";
        }
    } else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
    } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl.find("[INST]") != std::string::npos) {
        // llama2 template and its variants
        // [variant] support system message
        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos || tmpl == "mistral";
        // [variant] space before + after response
        bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
        // [variant] add BOS inside history
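With the two changed lines above, the literal template name "mistral" now routes to the llama2-style formatter and is treated as supporting a <<SYS>> system message. A hedged usage sketch via the public llama_chat_apply_template API, assuming llama.h, <string> and <vector> are included (buffer size and messages are illustrative):

    // Illustrative only: render a system + user exchange with the "mistral" template.
    llama_chat_message msgs[] = {
        {"system", "You are a helpful assistant"},
        {"user",   "Hello"},
    };
    std::vector<char> buf(4096);
    const int32_t n = llama_chat_apply_template(nullptr, "mistral", msgs, 2,
                                                /*add_ass=*/true, buf.data(), buf.size());
    if (n > 0) {
        std::string prompt(buf.data(), n);  // roughly "[INST] <<SYS>>\n...\n<</SYS>>\n\nHello [/INST]"
    }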
2
llama.h

@ -786,7 +786,7 @@ extern "C" {
    // Get the number of threads used for prompt and batch processing (multiple token).
    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);

    // Set whether the model is in embeddings model or not
    // Set whether the model is in embeddings mode or not
    // If true, embeddings will be returned but logits will not
    LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
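A minimal usage sketch for the declaration above, assuming a context and batch set up elsewhere (llama_decode and llama_get_embeddings are existing llama.h calls; error handling is omitted):

    // Sketch only: switch an existing context into embeddings mode for one pass.
    llama_set_embeddings(ctx, true);                     // return embeddings, not logits
    if (llama_decode(ctx, batch) == 0) {
        const float * emb = llama_get_embeddings(ctx);   // embedding values for the batch outputs
        // ... consume emb ...
    }
    llama_set_embeddings(ctx, false);                    // back to logits for generation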
37
sgemm.cpp

@ -249,8 +249,7 @@ class tinyBLAS {
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

    void matmul(int64_t m, int64_t n, int task) {
    void matmul(int64_t m, int64_t n) {
        if (task == GGML_TASK_TYPE_COMPUTE)
        mnpack(0, m, 0, n);
    }

@ -458,8 +457,7 @@ class tinyBLAS_Q0_ARM {
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

    void matmul(int64_t m, int64_t n, int task) {
    void matmul(int64_t m, int64_t n) {
        if (task == GGML_TASK_TYPE_COMPUTE)
        mnpack(0, m, 0, n);
    }

@ -596,8 +594,7 @@ class tinyBLAS_Q0_AVX {
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

    void matmul(int64_t m, int64_t n, int task) {
    void matmul(int64_t m, int64_t n) {
        if (task == GGML_TASK_TYPE_COMPUTE)
        mnpack(0, m, 0, n);
    }

@ -829,7 +826,7 @@ class tinyBLAS_Q0_AVX {
 * For example, for single-threaded single-precision GEMM you can say
 *
 *     llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
 *                     0, 1, GGML_TASK_TYPE_COMPUTE,
 *                     0, 1,
 *                     GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
 *
 * @param m is rows in `A` and `C`

@ -843,14 +840,13 @@ class tinyBLAS_Q0_AVX {
 * @param ldc is row stride of `C`
 * @param ith is thread id (must be less than `nth`)
 * @param nth is number of threads (must be greater than zero)
 * @param task is GGML task type
 * @param Atype is GGML data type of `A`
 * @param Btype is GGML data type of `B`
 * @param Ctype is GGML data type of `C`
 * @return true if this function was able to service the matmul request
 */
bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
                     int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) {
                     int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {

    assert(m >= 0);
    assert(n >= 0);
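After this signature change the GGML task type is no longer threaded through llamafile_sgemm; the doc comment above already shows the new single-threaded call. A hedged sketch of a multi-threaded caller under the new signature, assuming sgemm.h and ggml.h are included (the std::thread wrapper is illustrative; ggml itself drives this from its own thread pool):

    // Illustrative only: each worker passes its own thread index `ith`;
    // llamafile_sgemm partitions the m x n output internally from (ith, nth).
    #include <thread>
    #include <vector>

    void sgemm_f32_threaded(int64_t m, int64_t n, int64_t k,
                            const float *A, int64_t lda,
                            const float *B, int64_t ldb,
                            float *C, int64_t ldc, int nth) {
        std::vector<std::thread> workers;
        for (int ith = 0; ith < nth; ++ith) {
            workers.emplace_back([=] {
                llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
                                ith, nth,
                                GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
            });
        }
        for (auto & t : workers) t.join();
    }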
@ -877,7 +873,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n, task);
        tb.matmul(m, n);
        return true;
#elif defined(__AVX__) || defined(__AVX2__)
        if (k % 8)

@ -887,7 +883,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n, task);
        tb.matmul(m, n);
        return true;
#elif defined(__ARM_NEON)
        if (n < 4)

@ -899,7 +895,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n, task);
        tb.matmul(m, n);
        return true;
#else
        return false;

@ -917,7 +913,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n, task);
        tb.matmul(m, n);
        return true;
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
        if (k % 8)

@ -929,7 +925,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n, task);
        tb.matmul(m, n);
        return true;
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
        if (n < 8)

@ -943,7 +939,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const ggml_fp16_t *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n, task);
        tb.matmul(m, n);
        return true;
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
        if (k % 4)

@ -955,7 +951,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n, task);
        tb.matmul(m, n);
        return true;
#else
        return false;

@ -971,7 +967,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n, task);
        tb.matmul(m, n);
        return true;
#elif defined(__ARM_FEATURE_DOTPROD)
        tinyBLAS_Q0_ARM<block_q8_0> tb{

@ -979,7 +975,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n, task);
        tb.matmul(m, n);
        return true;
#else
        return false;

@ -995,7 +991,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n, task);
        tb.matmul(m, n);
        return true;
#elif defined(__ARM_FEATURE_DOTPROD)
        tinyBLAS_Q0_ARM<block_q4_0> tb{

@ -1003,7 +999,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n, task);
        tb.matmul(m, n);
        return true;
#else
        return false;

@ -1025,7 +1021,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
    (void)ldc;
    (void)ith;
    (void)nth;
    (void)task;
    (void)Atype;
    (void)Btype;
    (void)Ctype;
2
sgemm.h

@ -7,7 +7,7 @@ extern "C" {

bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
                     const void *, int64_t, void *, int64_t, int, int,
                     int, int, int, int);
                     int, int, int);

#ifdef __cplusplus
}

@ -785,6 +785,10 @@ struct test_cpy : public test_case {
        return VARS_TO_STR3(type_src, type_dst, ne);
    }

    double max_nmse_err() override {
        return 1e-6;
    }

    size_t op_size(ggml_tensor * t) override {
        return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
    }
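The new max_nmse_err override loosens the pass criterion for the copy test: test-backend-ops compares a backend result against the reference by normalized mean squared error and accepts it while that error stays at or below the returned bound (1e-6 here). A standalone illustration of the metric, not the harness code itself:

    // Illustrative NMSE check: squared error normalized by the reference energy.
    #include <vector>

    static bool within_nmse(const std::vector<float> & ref,
                            const std::vector<float> & out,
                            double max_err = 1e-6) {
        double num = 0.0, den = 0.0;
        for (size_t i = 0; i < ref.size(); ++i) {
            const double d = (double) out[i] - (double) ref[i];
            num += d * d;
            den += (double) ref[i] * (double) ref[i];
        }
        return den == 0.0 ? num == 0.0 : (num / den) <= max_err;
    }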
@ -7,6 +7,7 @@
#include <cassert>

#include "llama.h"
#include "common.h"

int main(void) {
    llama_chat_message conversation[] = {

@ -119,5 +120,24 @@ int main(void) {
        std::cout << output << "\n-------------------------\n";
        assert(output == expected);
    }

    // test llama_chat_format_single
    std::cout << "\n\n=== llama_chat_format_single ===\n\n";
    std::vector<llama_chat_msg> chat2;
    chat2.push_back({"system", "You are a helpful assistant"});
    chat2.push_back({"user", "Hello"});
    chat2.push_back({"assistant", "I am assistant"});
    llama_chat_msg new_msg{"user", "How are you"};

    auto fmt_single = [&](std::string tmpl) {
        auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
        std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n";
        return output;
    };
    assert(fmt_single("chatml") == "<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
    assert(fmt_single("llama2") == "[INST] How are you [/INST]");
    assert(fmt_single("gemma") == "<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
    assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");

    return 0;
}
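As the expected strings in the new asserts show, llama_chat_format_single renders only the incremental piece of the prompt for the newest message given the prior history, which is what an interactive loop wants to feed the model. A hedged sketch of such a loop (read_user_input and the generation step are hypothetical placeholders, not repo code):

    // Illustrative chat loop: tokenize/decode only the newly formatted suffix each turn.
    std::vector<llama_chat_msg> history = {
        {"system", "You are a helpful assistant"},
    };
    while (true) {
        llama_chat_msg user_msg{"user", read_user_input()};   // hypothetical input helper
        // Format just this message (plus the assistant prefix) against the history.
        const std::string delta = llama_chat_format_single(nullptr, "chatml", history, user_msg, /*add_ass=*/true);
        history.push_back(user_msg);
        // feed `delta` to the model, generate a reply, then record it:
        // history.push_back({"assistant", reply});
    }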
@ -13,7 +13,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32;
shared FLOAT_TYPE tmp[BLOCK_SIZE];

void main() {
    const uint row = gl_WorkGroupID.x;
    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
    const uint tid = gl_LocalInvocationID.x;

    uint a_offset, b_offset, d_offset;

@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE tmp[32];

void main() {
    const uint row = gl_WorkGroupID.x;
    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

    uint a_offset, b_offset, d_offset;
    get_offsets(a_offset, b_offset, d_offset);

@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE tmp[32];

void main() {
    const uint row = gl_WorkGroupID.x;
    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

    uint a_offset, b_offset, d_offset;
    get_offsets(a_offset, b_offset, d_offset);

@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE tmp[32];

void main() {
    const uint row = gl_WorkGroupID.x;
    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

    uint a_offset, b_offset, d_offset;
    get_offsets(a_offset, b_offset, d_offset);

@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE tmp[32];

void main() {
    const uint row = gl_WorkGroupID.x;
    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

    uint a_offset, b_offset, d_offset;
    get_offsets(a_offset, b_offset, d_offset);

@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE tmp[32];

void main() {
    const uint row = gl_WorkGroupID.x;
    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

    uint a_offset, b_offset, d_offset;
    get_offsets(a_offset, b_offset, d_offset);
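All six shader hunks above change the row index in the same way: the z workgroup coordinate now contributes gl_NumWorkGroups.x * gl_WorkGroupID.z, which lets the host split a row count that would overflow a single dispatch dimension across x and z. A hedged host-side sketch of that split (the 65535 limit, the names, and the bounds handling are assumptions, not the actual Vulkan backend code):

    // Illustrative only: choose (x, z) workgroup counts so neither exceeds max_dim,
    // while x * z covers all rows; invocations addressing rows past the real count
    // would still need a bounds guard or an exact split on the host.
    #include <cstdint>

    struct dispatch_xz { uint32_t x, z; };

    static dispatch_xz split_rows(uint32_t rows, uint32_t max_dim = 65535) {
        dispatch_xz d = { rows, 1 };
        if (rows > max_dim) {
            d.z = (rows + max_dim - 1) / max_dim;  // number of x-sized slices
            d.x = (rows + d.z - 1) / d.z;          // rows per slice, rounded up
        }
        return d;  // shader: row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z
    }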