Merge branch 'ggerganov:master' into master

This commit is contained in:
bmwl 2024-02-13 20:34:16 -08:00 committed by GitHub
commit 0e05042b45
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
19 changed files with 854 additions and 429 deletions

View file

@ -569,6 +569,14 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
$(info I CXX: $(shell $(CXX) --version | head -n 1)) $(info I CXX: $(shell $(CXX) --version | head -n 1))
ifdef LLAMA_CUBLAS ifdef LLAMA_CUBLAS
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
ifndef CUDA_DOCKER_ARCH
ifndef CUDA_POWER_ARCH
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
endif # LLAMA_CUBLAS endif # LLAMA_CUBLAS
$(info ) $(info )

View file

@ -568,6 +568,50 @@ function gg_sum_open_llama_7b_v2 {
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)" #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
} }
# bge-small
function gg_run_embd_bge_small {
cd ${SRC}
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
path_models="../models-mnt/bge-small"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert-hf-to-gguf.py ${path_models}
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
./bin/quantize ${model_f16} ${model_q8_0} q8_0
(time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
set +e
}
function gg_sum_embd_bge_small {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'BGE Small (BERT):\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
}
## main ## main
if [ -z ${GG_BUILD_LOW_PERF} ]; then if [ -z ${GG_BUILD_LOW_PERF} ]; then
@ -591,6 +635,8 @@ test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run embd_bge_small
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ]; then if [ -z ${GG_BUILD_CUDA} ]; then
test $ret -eq 0 && gg_run open_llama_3b_v2 test $ret -eq 0 && gg_run open_llama_3b_v2

View file

@ -10,7 +10,7 @@ import re
import sys import sys
from enum import IntEnum from enum import IntEnum
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Sequence, cast
import numpy as np import numpy as np
import torch import torch
@ -25,15 +25,6 @@ import gguf
from convert import HfVocab from convert import HfVocab
# check for any of the given keys in the dictionary and return the value of the first key found
def get_key_opts(d, keys):
for k in keys:
if k in d:
return d[k]
print(f"Could not find any of {keys}")
sys.exit()
###### MODEL DEFINITIONS ###### ###### MODEL DEFINITIONS ######
class SentencePieceTokenTypes(IntEnum): class SentencePieceTokenTypes(IntEnum):
@ -58,6 +49,15 @@ class Model:
self.hparams = Model.load_hparams(self.dir_model) self.hparams = Model.load_hparams(self.dir_model)
self.model_arch = self._get_model_architecture() self.model_arch = self._get_model_architecture()
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any:
key = next((k for k in keys if k in self.hparams), None)
if key is not None:
return self.hparams[key]
if optional:
return None
raise KeyError(f"could not find any of: {keys}")
def set_vocab(self): def set_vocab(self):
self._set_vocab_gpt2() self._set_vocab_gpt2()
@ -79,28 +79,33 @@ class Model:
def set_gguf_parameters(self): def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_block_count(self.hparams.get( self.gguf_writer.add_block_count(self.block_count)
"n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")),
)) if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
if (n_ctx := self.hparams.get("max_position_embeddings")) is not None:
self.gguf_writer.add_context_length(n_ctx) self.gguf_writer.add_context_length(n_ctx)
if (n_embd := self.hparams.get("hidden_size")) is not None:
self.gguf_writer.add_embedding_length(n_embd) n_embd = self.find_hparam(["hidden_size", "n_embd"])
if (n_ff := self.hparams.get("intermediate_size")) is not None: self.gguf_writer.add_embedding_length(n_embd)
if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
self.gguf_writer.add_feed_forward_length(n_ff) self.gguf_writer.add_feed_forward_length(n_ff)
if (n_head := self.hparams.get("num_attention_heads")) is not None:
self.gguf_writer.add_head_count(n_head) n_head = self.find_hparam(["num_attention_heads", "n_head"])
self.gguf_writer.add_head_count(n_head)
if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
self.gguf_writer.add_head_count_kv(n_head_kv) self.gguf_writer.add_head_count_kv(n_head_kv)
if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None: if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps) self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon"], optional=True)) is not None:
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
if (n_experts := self.hparams.get("num_local_experts")) is not None: if (n_experts := self.hparams.get("num_local_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts) self.gguf_writer.add_expert_count(n_experts)
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used) self.gguf_writer.add_expert_used_count(n_experts_used)
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) self.gguf_writer.add_file_type(self.ftype)
def write_tensors(self): def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
@ -211,6 +216,8 @@ class Model:
return MiniCPMModel return MiniCPMModel
if model_architecture == "BertModel": if model_architecture == "BertModel":
return BertModel return BertModel
if model_architecture == "NomicBertModel":
return NomicBertModel
return Model return Model
def _is_model_safetensors(self) -> bool: def _is_model_safetensors(self) -> bool:
@ -268,6 +275,8 @@ class Model:
return gguf.MODEL_ARCH.MINICPM return gguf.MODEL_ARCH.MINICPM
if arch == "BertModel": if arch == "BertModel":
return gguf.MODEL_ARCH.BERT return gguf.MODEL_ARCH.BERT
if arch == "NomicBertModel":
return gguf.MODEL_ARCH.NOMIC_BERT
raise NotImplementedError(f'Architecture "{arch}" not supported!') raise NotImplementedError(f'Architecture "{arch}" not supported!')
@ -1297,21 +1306,21 @@ class GPT2Model(Model):
class Phi2Model(Model): class Phi2Model(Model):
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"]) block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"]) rot_pct = self.find_hparam(["partial_rotary_factor"])
n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"]) n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"]) n_head = self.find_hparam(["num_attention_heads", "n_head"])
self.gguf_writer.add_name("Phi2") self.gguf_writer.add_name("Phi2")
self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"])) self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
self.gguf_writer.add_embedding_length(n_embd) self.gguf_writer.add_embedding_length(n_embd)
self.gguf_writer.add_feed_forward_length(4 * n_embd) self.gguf_writer.add_feed_forward_length(4 * n_embd)
self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count(n_head)
self.gguf_writer.add_head_count_kv(n_head) self.gguf_writer.add_head_count_kv(n_head)
self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"])) self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]))
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_add_bos_token(False) self.gguf_writer.add_add_bos_token(False)
@ -1636,19 +1645,12 @@ in chat mode so that the conversation can end normally.")
class BertModel(Model): class BertModel(Model):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.block_count = self.hparams["num_hidden_layers"] self.vocab_size = None
def set_gguf_parameters(self): def set_gguf_parameters(self):
# TODO(cebtenzzre): merge with parent class super().set_gguf_parameters()
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
self.gguf_writer.add_causal_attention(False) self.gguf_writer.add_causal_attention(False)
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_pooling_layer(True)
def set_vocab(self): def set_vocab(self):
path = self.dir_model path = self.dir_model
@ -1658,6 +1660,7 @@ class BertModel(Model):
vocab = HfVocab(path, added_tokens_path) vocab = HfVocab(path, added_tokens_path)
tokens, scores, toktypes = zip(*vocab.all_tokens()) tokens, scores, toktypes = zip(*vocab.all_tokens())
assert len(tokens) == vocab.vocab_size assert len(tokens) == vocab.vocab_size
self.vocab_size = vocab.vocab_size
# we need this to validate the size of the token_type embeddings # we need this to validate the size of the token_type embeddings
# though currently we are passing all zeros to the token_type embeddings # though currently we are passing all zeros to the token_type embeddings
@ -1671,7 +1674,7 @@ class BertModel(Model):
if tok.startswith(b"##"): if tok.startswith(b"##"):
return tok[2:] return tok[2:]
return b"\xe2\x96\x81" + tok return b"\xe2\x96\x81" + tok
tokens = [phantom(t, y) for t, y in zip(tokens, toktypes)] tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
# set up bos and eos tokens (cls and sep) # set up bos and eos tokens (cls and sep)
self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id) self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
@ -1723,6 +1726,43 @@ class BertModel(Model):
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
class NomicBertModel(BertModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# the HF config claims n_ctx=8192, but it uses RoPE scaling
self.hparams["n_ctx"] = 2048
# SwigLU activation
assert self.hparams["activation_function"] == "swiglu"
# this doesn't do anything in the HF version
assert self.hparams["causal"] is False
# no bias tensors
assert self.hparams["qkv_proj_bias"] is False
assert self.hparams["mlp_fc1_bias"] is False
assert self.hparams["mlp_fc2_bias"] is False
# norm at end of layer
assert self.hparams["prenorm"] is False
# standard RoPE
assert self.hparams["rotary_emb_fraction"] == 1.0
assert self.hparams["rotary_emb_interleaved"] is False
assert self.hparams["rotary_emb_scale_base"] is None
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
def get_tensors(self):
assert self.vocab_size is not None
for name, data in super().get_tensors():
# Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
data = data[:self.vocab_size, :]
yield name, data
###### CONVERSION LOGIC ###### ###### CONVERSION LOGIC ######

View file

@ -38,6 +38,7 @@ else()
add_subdirectory(speculative) add_subdirectory(speculative)
add_subdirectory(lookahead) add_subdirectory(lookahead)
add_subdirectory(lookup) add_subdirectory(lookup)
add_subdirectory(gguf)
add_subdirectory(train-text-from-scratch) add_subdirectory(train-text-from-scratch)
add_subdirectory(imatrix) add_subdirectory(imatrix)
if (LLAMA_BUILD_SERVER) if (LLAMA_BUILD_SERVER)

View file

@ -7,6 +7,51 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
static std::vector<std::string> split_lines(const std::string & s) {
std::string line;
std::vector<std::string> lines;
std::stringstream ss(s);
while (std::getline(ss, line)) {
lines.push_back(line);
}
return lines;
}
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
for (size_t i = 0; i < tokens.size(); i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, false);
}
}
static void normalize(float * vec, float * out, int n) {
float norm = 0;
for (int i = 0; i < n; i++) {
norm += vec[i] * vec[i];
}
norm = sqrt(norm);
for (int i = 0; i < n; i++) {
out[i] = vec[i] / norm;
}
}
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
// run model
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_decode(ctx, batch) < 0) {
fprintf(stderr, "%s : failed to decode\n", __func__);
}
// normalize on copy
for (int k = 0; k < n_seq; k++) {
float * emb = llama_get_embeddings_ith(ctx, k);
float * out = output + k * n_embd;
normalize(emb, out, n_embd);
}
}
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
@ -55,59 +100,84 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", get_system_info(params).c_str());
} }
int n_past = 0; // split the prompt into lines
std::vector<std::string> prompts = split_lines(params.prompt);
// tokenize the prompt // max batch size
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); const uint64_t n_batch = params.n_batch;
GGML_ASSERT(params.n_batch == params.n_ctx);
// tokenize the prompts and trim
std::vector<std::vector<int32_t>> inputs;
for (const auto & prompt : prompts) {
auto inp = ::llama_tokenize(ctx, prompt, true);
if (inp.size() > n_batch) {
inp.resize(n_batch);
}
inputs.push_back(inp);
}
// tokenization stats
if (params.verbose_prompt) { if (params.verbose_prompt) {
fprintf(stderr, "\n"); for (int i = 0; i < (int) inputs.size(); i++) {
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
for (int i = 0; i < (int) embd_inp.size(); i++) { for (int j = 0; j < (int) inputs[i].size(); j++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
}
fprintf(stderr, "\n\n");
} }
fprintf(stderr, "\n");
} }
if (embd_inp.size() > (size_t)n_ctx) { // initialize batch
fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n", const int n_prompts = prompts.size();
__func__, embd_inp.size(), n_ctx); struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts);
return 1;
}
while (!embd_inp.empty()) {
int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
n_past += n_tokens;
embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
}
// allocate output
const int n_embd = llama_n_embd(model); const int n_embd = llama_n_embd(model);
auto * embeddings = llama_get_embeddings(ctx); std::vector<float> embeddings(n_prompts * n_embd, 0);
float * emb = embeddings.data();
// l2-normalize embeddings // break into batches
float norm = 0; int p = 0; // number of prompts processed already
for (int i = 0; i < n_embd; i++) { int s = 0; // number of prompts in current batch
norm += embeddings[i] * embeddings[i]; for (int k = 0; k < n_prompts; k++) {
} // clamp to n_batch tokens
norm = sqrt(norm); auto & inp = inputs[k];
for (int i = 0; i < n_embd; i++) { const uint64_t n_toks = inp.size();
embeddings[i] /= norm;
// encode if at capacity
if (batch.n_tokens + n_toks > n_batch) {
float * out = emb + p * n_embd;
batch_decode(ctx, batch, out, s, n_embd);
llama_batch_clear(batch);
p += s;
s = 0;
}
// add to batch
batch_add_seq(batch, inp, s);
s += 1;
} }
for (int i = 0; i < n_embd; i++) { // final batch
printf("%f ", embeddings[i]); float * out = emb + p * n_embd;
} batch_decode(ctx, batch, out, s, n_embd);
printf("\n");
// print first 3 embeddings
for (int j = 0; j < std::min(3, n_prompts); j++) {
fprintf(stderr, "embedding %d: ", j);
for (int i = 0; i < n_embd; i++) {
fprintf(stderr, "%f ", emb[j * n_embd + i]);
}
fprintf(stderr, "\n\n");
}
fprintf(stderr, "\n");
// clean up
llama_print_timings(ctx); llama_print_timings(ctx);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
llama_backend_free(); llama_backend_free();
return 0; return 0;

View file

@ -80,9 +80,9 @@ The LORA rank can be configured for each model tensor type separately with these
--rank-wk N LORA rank for wk tensor (default 4) --rank-wk N LORA rank for wk tensor (default 4)
--rank-wv N LORA rank for wv tensor (default 4) --rank-wv N LORA rank for wv tensor (default 4)
--rank-wo N LORA rank for wo tensor (default 4) --rank-wo N LORA rank for wo tensor (default 4)
--rank-w1 N LORA rank for w1 tensor (default 4) --rank-ffn_gate N LORA rank for ffn_gate tensor (default 4)
--rank-w2 N LORA rank for w2 tensor (default 4) --rank-ffn_down N LORA rank for ffn_down tensor (default 4)
--rank-w3 N LORA rank for w3 tensor (default 4) --rank-ffn_up N LORA rank for ffn_up tensor (default 4)
``` ```
The LORA rank of 'norm' tensors should always be 1. The LORA rank of 'norm' tensors should always be 1.

View file

@ -60,9 +60,9 @@ struct my_llama_layer {
struct ggml_tensor * ffn_norm; struct ggml_tensor * ffn_norm;
// ff // ff
struct ggml_tensor * w1; struct ggml_tensor * ffn_gate; // w1
struct ggml_tensor * w2; struct ggml_tensor * ffn_down; // w2
struct ggml_tensor * w3; struct ggml_tensor * ffn_up; // w3
}; };
struct my_llama_model { struct my_llama_model {
@ -85,9 +85,9 @@ struct my_llama_lora_hparams {
uint32_t n_rank_wv = 4; uint32_t n_rank_wv = 4;
uint32_t n_rank_wo = 4; uint32_t n_rank_wo = 4;
uint32_t n_rank_ffn_norm = 1; uint32_t n_rank_ffn_norm = 1;
uint32_t n_rank_w1 = 4; uint32_t n_rank_ffn_gate = 4;
uint32_t n_rank_w2 = 4; uint32_t n_rank_ffn_down = 4;
uint32_t n_rank_w3 = 4; uint32_t n_rank_ffn_up = 4;
uint32_t n_rank_tok_embeddings = 4; uint32_t n_rank_tok_embeddings = 4;
uint32_t n_rank_norm = 1; uint32_t n_rank_norm = 1;
uint32_t n_rank_output = 4; uint32_t n_rank_output = 4;
@ -117,12 +117,12 @@ struct my_llama_lora_layer {
struct ggml_tensor * ffn_norm_b; struct ggml_tensor * ffn_norm_b;
// ff // ff
struct ggml_tensor * w1_a; struct ggml_tensor * ffn_gate_a;
struct ggml_tensor * w1_b; struct ggml_tensor * ffn_gate_b;
struct ggml_tensor * w2_a; struct ggml_tensor * ffn_down_a;
struct ggml_tensor * w2_b; struct ggml_tensor * ffn_down_b;
struct ggml_tensor * w3_a; struct ggml_tensor * ffn_up_a;
struct ggml_tensor * w3_b; struct ggml_tensor * ffn_up_b;
}; };
struct my_llama_lora { struct my_llama_lora {
@ -208,9 +208,9 @@ static void print_lora_params(struct my_llama_lora_hparams * params) {
printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv); printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv);
printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo); printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo);
printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm); printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm);
printf("%s: n_rank_w1 : %u\n", __func__, params->n_rank_w1); printf("%s: n_rank_ffn_gate : %u\n", __func__, params->n_rank_ffn_gate);
printf("%s: n_rank_w2 : %u\n", __func__, params->n_rank_w2); printf("%s: n_rank_ffn_down : %u\n", __func__, params->n_rank_ffn_down);
printf("%s: n_rank_w3 : %u\n", __func__, params->n_rank_w3); printf("%s: n_rank_ffn_up : %u\n", __func__, params->n_rank_ffn_up);
printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings); printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings);
printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm); printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm);
printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output); printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output);
@ -319,9 +319,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i)); layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i));
layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i)); layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i));
layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i)); layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i));
layer.w1 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i)); layer.ffn_gate = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
layer.w2 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i)); layer.ffn_down = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
layer.w3 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i)); layer.ffn_up = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));
assert_shape_1d(layer.attention_norm, hparams.n_embd); assert_shape_1d(layer.attention_norm, hparams.n_embd);
assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd); assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd);
@ -329,9 +329,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa()); assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa());
assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd); assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd);
assert_shape_1d(layer.ffn_norm, hparams.n_embd); assert_shape_1d(layer.ffn_norm, hparams.n_embd);
assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff); assert_shape_2d(layer.ffn_gate, hparams.n_embd, hparams.n_ff);
assert_shape_2d(layer.w2, hparams.n_ff, hparams.n_embd); assert_shape_2d(layer.ffn_down, hparams.n_ff, hparams.n_embd);
assert_shape_2d(layer.w3, hparams.n_embd, hparams.n_ff); assert_shape_2d(layer.ffn_up, hparams.n_embd, hparams.n_ff);
} }
} }
@ -362,12 +362,12 @@ static void set_param_lora(struct my_llama_lora * lora) {
ggml_set_param(ctx, layer.wo_b); ggml_set_param(ctx, layer.wo_b);
ggml_set_param(ctx, layer.ffn_norm_a); ggml_set_param(ctx, layer.ffn_norm_a);
ggml_set_param(ctx, layer.ffn_norm_b); ggml_set_param(ctx, layer.ffn_norm_b);
ggml_set_param(ctx, layer.w1_a); ggml_set_param(ctx, layer.ffn_gate_a);
ggml_set_param(ctx, layer.w1_b); ggml_set_param(ctx, layer.ffn_gate_b);
ggml_set_param(ctx, layer.w2_a); ggml_set_param(ctx, layer.ffn_down_a);
ggml_set_param(ctx, layer.w2_b); ggml_set_param(ctx, layer.ffn_down_b);
ggml_set_param(ctx, layer.w3_a); ggml_set_param(ctx, layer.ffn_up_a);
ggml_set_param(ctx, layer.w3_b); ggml_set_param(ctx, layer.ffn_up_b);
} }
} }
@ -435,12 +435,12 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd); layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd);
layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1); layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1);
layer.w1_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_embd); layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd);
layer.w1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_ff); layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff);
layer.w2_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_ff); layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff);
layer.w2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_embd); layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd);
layer.w3_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_embd); layer.ffn_up_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_embd);
layer.w3_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_ff); layer.ffn_up_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_ff);
ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i)); ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i));
ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i)); ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i));
@ -454,12 +454,12 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i)); ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i));
ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i)); ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i));
ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i)); ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i));
ggml_set_name(layer.w1_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i)); ggml_set_name(layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i));
ggml_set_name(layer.w1_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i)); ggml_set_name(layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i));
ggml_set_name(layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i)); ggml_set_name(layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i));
ggml_set_name(layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i)); ggml_set_name(layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i));
ggml_set_name(layer.w3_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i)); ggml_set_name(layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i));
ggml_set_name(layer.w3_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i)); ggml_set_name(layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i));
} }
set_param_lora(lora); set_param_lora(lora);
@ -497,12 +497,12 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
randomize_tensor_normal(layer.ffn_norm_a, rnd); randomize_tensor_normal(layer.ffn_norm_a, rnd);
ggml_set_zero(layer.ffn_norm_b); ggml_set_zero(layer.ffn_norm_b);
randomize_tensor_normal(layer.w1_a, rnd); randomize_tensor_normal(layer.ffn_gate_a, rnd);
ggml_set_zero(layer.w1_b); ggml_set_zero(layer.ffn_gate_b);
randomize_tensor_normal(layer.w2_a, rnd); randomize_tensor_normal(layer.ffn_down_a, rnd);
ggml_set_zero(layer.w2_b); ggml_set_zero(layer.ffn_down_b);
randomize_tensor_normal(layer.w3_a, rnd); randomize_tensor_normal(layer.ffn_up_a, rnd);
ggml_set_zero(layer.w3_b); ggml_set_zero(layer.ffn_up_b);
} }
free_random_normal_distribution(rnd); free_random_normal_distribution(rnd);
@ -610,13 +610,13 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b)); struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b));
struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b)); struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b));
struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b)); struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b));
struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b)); struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b)); struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b)); struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
struct ggml_tensor * w1 = add_to_f32(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b)); struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b));
struct ggml_tensor * w2 = add_to_f32(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b)); struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b));
struct ggml_tensor * w3 = add_to_f32(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b)); struct ggml_tensor * ffn_up = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b));
struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch);
struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch);
@ -659,11 +659,11 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch);
struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch);
struct ggml_tensor * t25 = ggml_mul_mat (ctx, w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); struct ggml_tensor * t25 = ggml_mul_mat (ctx, ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
struct ggml_tensor * t26 = ggml_mul_mat (ctx, w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); struct ggml_tensor * t26 = ggml_mul_mat (ctx, ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch);
struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch);
struct ggml_tensor * t29 = ggml_mul_mat (ctx, w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); struct ggml_tensor * t29 = ggml_mul_mat (ctx, ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch);
cur = t30; cur = t30;
if (enable_checkpointing) { if (enable_checkpointing) {
@ -723,9 +723,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f));
} }
// allocating checkpoints in one block to reduce memory fragmentation // allocating checkpoints in one block to reduce memory fragmentation
@ -798,9 +798,9 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V); GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V);
GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT); GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT);
GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM); GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM);
GGUF_GET_KEY(fctx, lora->hparams.n_rank_w1, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE); GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE);
GGUF_GET_KEY(fctx, lora->hparams.n_rank_w2, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN); GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN);
GGUF_GET_KEY(fctx, lora->hparams.n_rank_w3, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP); GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP);
init_lora(model, lora); init_lora(model, lora);
@ -825,12 +825,12 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b)); copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b));
copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a)); copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a));
copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b)); copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b));
copy_tensor_by_name(layer.w1_a, f_ggml_ctx, ggml_get_name(layer.w1_a)); copy_tensor_by_name(layer.ffn_gate_a, f_ggml_ctx, ggml_get_name(layer.ffn_gate_a));
copy_tensor_by_name(layer.w1_b, f_ggml_ctx, ggml_get_name(layer.w1_b)); copy_tensor_by_name(layer.ffn_gate_b, f_ggml_ctx, ggml_get_name(layer.ffn_gate_b));
copy_tensor_by_name(layer.w2_a, f_ggml_ctx, ggml_get_name(layer.w2_a)); copy_tensor_by_name(layer.ffn_down_a, f_ggml_ctx, ggml_get_name(layer.ffn_down_a));
copy_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b)); copy_tensor_by_name(layer.ffn_down_b, f_ggml_ctx, ggml_get_name(layer.ffn_down_b));
copy_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a)); copy_tensor_by_name(layer.ffn_up_a, f_ggml_ctx, ggml_get_name(layer.ffn_up_a));
copy_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b)); copy_tensor_by_name(layer.ffn_up_b, f_ggml_ctx, ggml_get_name(layer.ffn_up_b));
} }
} }
@ -868,9 +868,9 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv);
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo);
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm);
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_w1); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_ffn_gate);
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_w2); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_ffn_down);
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_w3); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_ffn_up);
gguf_add_tensor(fctx, lora->tok_embeddings_a); gguf_add_tensor(fctx, lora->tok_embeddings_a);
gguf_add_tensor(fctx, lora->tok_embeddings_b); gguf_add_tensor(fctx, lora->tok_embeddings_b);
@ -894,12 +894,12 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
gguf_add_tensor(fctx, layer.wo_b); gguf_add_tensor(fctx, layer.wo_b);
gguf_add_tensor(fctx, layer.ffn_norm_a); gguf_add_tensor(fctx, layer.ffn_norm_a);
gguf_add_tensor(fctx, layer.ffn_norm_b); gguf_add_tensor(fctx, layer.ffn_norm_b);
gguf_add_tensor(fctx, layer.w1_a); gguf_add_tensor(fctx, layer.ffn_gate_a);
gguf_add_tensor(fctx, layer.w1_b); gguf_add_tensor(fctx, layer.ffn_gate_b);
gguf_add_tensor(fctx, layer.w2_a); gguf_add_tensor(fctx, layer.ffn_down_a);
gguf_add_tensor(fctx, layer.w2_b); gguf_add_tensor(fctx, layer.ffn_down_b);
gguf_add_tensor(fctx, layer.w3_a); gguf_add_tensor(fctx, layer.ffn_up_a);
gguf_add_tensor(fctx, layer.w3_b); gguf_add_tensor(fctx, layer.ffn_up_b);
} }
} }
@ -1104,12 +1104,12 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor
write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB")); write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB"));
write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA")); write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA"));
write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB")); write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB"));
write_tensor(&file, layer.w1_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA")); write_tensor(&file, layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA"));
write_tensor(&file, layer.w1_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB")); write_tensor(&file, layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB"));
write_tensor(&file, layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA")); write_tensor(&file, layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA"));
write_tensor(&file, layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB")); write_tensor(&file, layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB"));
write_tensor(&file, layer.w3_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA")); write_tensor(&file, layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA"));
write_tensor(&file, layer.w3_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB")); write_tensor(&file, layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB"));
} }
} }
@ -1139,9 +1139,9 @@ struct train_params {
uint32_t n_rank_wv; uint32_t n_rank_wv;
uint32_t n_rank_wo; uint32_t n_rank_wo;
uint32_t n_rank_ffn_norm; uint32_t n_rank_ffn_norm;
uint32_t n_rank_w1; uint32_t n_rank_ffn_gate;
uint32_t n_rank_w2; uint32_t n_rank_ffn_down;
uint32_t n_rank_w3; uint32_t n_rank_ffn_up;
uint32_t n_rank_tok_embeddings; uint32_t n_rank_tok_embeddings;
uint32_t n_rank_norm; uint32_t n_rank_norm;
uint32_t n_rank_output; uint32_t n_rank_output;
@ -1152,9 +1152,9 @@ struct train_params {
bool custom_n_rank_wv; bool custom_n_rank_wv;
bool custom_n_rank_wo; bool custom_n_rank_wo;
bool custom_n_rank_ffn_norm; bool custom_n_rank_ffn_norm;
bool custom_n_rank_w1; bool custom_n_rank_ffn_gate;
bool custom_n_rank_w2; bool custom_n_rank_ffn_down;
bool custom_n_rank_w3; bool custom_n_rank_ffn_up;
bool custom_n_rank_tok_embeddings; bool custom_n_rank_tok_embeddings;
bool custom_n_rank_norm; bool custom_n_rank_norm;
bool custom_n_rank_output; bool custom_n_rank_output;
@ -1186,9 +1186,9 @@ static struct train_params get_default_train_params() {
params.n_rank_wv = 4; params.n_rank_wv = 4;
params.n_rank_wo = 4; params.n_rank_wo = 4;
params.n_rank_ffn_norm = 1; params.n_rank_ffn_norm = 1;
params.n_rank_w1 = 4; params.n_rank_ffn_gate = 4;
params.n_rank_w2 = 4; params.n_rank_ffn_down = 4;
params.n_rank_w3 = 4; params.n_rank_ffn_up = 4;
params.n_rank_tok_embeddings = 4; params.n_rank_tok_embeddings = 4;
params.n_rank_norm = 1; params.n_rank_norm = 1;
params.n_rank_output = 4; params.n_rank_output = 4;
@ -1199,9 +1199,9 @@ static struct train_params get_default_train_params() {
params.custom_n_rank_wv = false; params.custom_n_rank_wv = false;
params.custom_n_rank_wo = false; params.custom_n_rank_wo = false;
params.custom_n_rank_ffn_norm = false; params.custom_n_rank_ffn_norm = false;
params.custom_n_rank_w1 = false; params.custom_n_rank_ffn_gate = false;
params.custom_n_rank_w2 = false; params.custom_n_rank_ffn_down = false;
params.custom_n_rank_w3 = false; params.custom_n_rank_ffn_up = false;
params.custom_n_rank_tok_embeddings = false; params.custom_n_rank_tok_embeddings = false;
params.custom_n_rank_norm = false; params.custom_n_rank_norm = false;
params.custom_n_rank_output = false; params.custom_n_rank_output = false;
@ -1232,9 +1232,9 @@ static void train_print_usage(int argc, char ** argv, const struct train_params
fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n"); fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n");
fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n"); fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n");
fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n"); fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n");
fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor, overrides default rank.\n"); fprintf(stderr, " --rank-ffn_gate N LORA rank for ffn_gate tensor, overrides default rank.\n");
fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor, overrides default rank.\n"); fprintf(stderr, " --rank-ffn_down N LORA rank for ffn_down tensor, overrides default rank.\n");
fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor, overrides default rank.\n"); fprintf(stderr, " --rank-ffn_up N LORA rank for ffn_up tensor, overrides default rank.\n");
print_common_train_usage(argc, argv, &params->common); print_common_train_usage(argc, argv, &params->common);
} }
@ -1369,27 +1369,27 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
} }
params->n_rank_wo = std::stoi(argv[i]); params->n_rank_wo = std::stoi(argv[i]);
params->custom_n_rank_wo = true; params->custom_n_rank_wo = true;
} else if (arg == "--rank-w1") { } else if (arg == "--rank-ffn_gate") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
params->n_rank_w1 = std::stoi(argv[i]); params->n_rank_ffn_gate = std::stoi(argv[i]);
params->custom_n_rank_w1 = true; params->custom_n_rank_ffn_gate = true;
} else if (arg == "--rank-w2") { } else if (arg == "--rank-ffn_down") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
params->n_rank_w2 = std::stoi(argv[i]); params->n_rank_ffn_down = std::stoi(argv[i]);
params->custom_n_rank_w2 = true; params->custom_n_rank_ffn_down = true;
} else if (arg == "--rank-w3") { } else if (arg == "--rank-ffn_up") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
params->n_rank_w3 = std::stoi(argv[i]); params->n_rank_ffn_up = std::stoi(argv[i]);
params->custom_n_rank_w3 = true; params->custom_n_rank_ffn_up = true;
} else { } else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
train_print_usage(argc, argv, &default_params); train_print_usage(argc, argv, &default_params);
@ -1452,12 +1452,12 @@ static int64_t get_parameter_count(struct my_llama_lora* lora) {
nx += ggml_nelements(layer.wo_b); nx += ggml_nelements(layer.wo_b);
nx += ggml_nelements(layer.ffn_norm_a); nx += ggml_nelements(layer.ffn_norm_a);
nx += ggml_nelements(layer.ffn_norm_b); nx += ggml_nelements(layer.ffn_norm_b);
nx += ggml_nelements(layer.w1_a); nx += ggml_nelements(layer.ffn_gate_a);
nx += ggml_nelements(layer.w1_b); nx += ggml_nelements(layer.ffn_gate_b);
nx += ggml_nelements(layer.w2_a); nx += ggml_nelements(layer.ffn_down_a);
nx += ggml_nelements(layer.w2_b); nx += ggml_nelements(layer.ffn_down_b);
nx += ggml_nelements(layer.w3_a); nx += ggml_nelements(layer.ffn_up_a);
nx += ggml_nelements(layer.w3_b); nx += ggml_nelements(layer.ffn_up_b);
} }
return nx; return nx;
} }
@ -1511,9 +1511,9 @@ int main(int argc, char ** argv) {
uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r; uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r;
uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r; uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r;
uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1; uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1;
uint32_t n_rank_w1 = params.custom_n_rank_w1 ? params.n_rank_w1 : params.lora_r; uint32_t n_rank_ffn_gate = params.custom_n_rank_ffn_gate ? params.n_rank_ffn_gate : params.lora_r;
uint32_t n_rank_w2 = params.custom_n_rank_w2 ? params.n_rank_w2 : params.lora_r; uint32_t n_rank_ffn_down = params.custom_n_rank_ffn_down ? params.n_rank_ffn_down : params.lora_r;
uint32_t n_rank_w3 = params.custom_n_rank_w3 ? params.n_rank_w3 : params.lora_r; uint32_t n_rank_ffn_up = params.custom_n_rank_ffn_up ? params.n_rank_ffn_up : params.lora_r;
uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r; uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r;
uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1; uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1;
uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r; uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r;
@ -1523,9 +1523,9 @@ int main(int argc, char ** argv) {
lora.hparams.n_rank_wv = n_rank_wv; lora.hparams.n_rank_wv = n_rank_wv;
lora.hparams.n_rank_wo = n_rank_wo; lora.hparams.n_rank_wo = n_rank_wo;
lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm; lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm;
lora.hparams.n_rank_w1 = n_rank_w1; lora.hparams.n_rank_ffn_gate = n_rank_ffn_gate;
lora.hparams.n_rank_w2 = n_rank_w2; lora.hparams.n_rank_ffn_down = n_rank_ffn_down;
lora.hparams.n_rank_w3 = n_rank_w3; lora.hparams.n_rank_ffn_up = n_rank_ffn_up;
lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings; lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings;
lora.hparams.n_rank_norm = n_rank_norm; lora.hparams.n_rank_norm = n_rank_norm;
lora.hparams.n_rank_output = n_rank_output; lora.hparams.n_rank_output = n_rank_output;
@ -1566,9 +1566,9 @@ int main(int argc, char ** argv) {
|| (lora.hparams.n_rank_wv != n_rank_wv) || (lora.hparams.n_rank_wv != n_rank_wv)
|| (lora.hparams.n_rank_wo != n_rank_wo) || (lora.hparams.n_rank_wo != n_rank_wo)
|| (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm) || (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm)
|| (lora.hparams.n_rank_w1 != n_rank_w1) || (lora.hparams.n_rank_ffn_gate != n_rank_ffn_gate)
|| (lora.hparams.n_rank_w2 != n_rank_w2) || (lora.hparams.n_rank_ffn_down != n_rank_ffn_down)
|| (lora.hparams.n_rank_w3 != n_rank_w3) || (lora.hparams.n_rank_ffn_up != n_rank_ffn_up)
|| (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings) || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings)
|| (lora.hparams.n_rank_norm != n_rank_norm) || (lora.hparams.n_rank_norm != n_rank_norm)
|| (lora.hparams.n_rank_output != n_rank_output) || (lora.hparams.n_rank_output != n_rank_output)

View file

@ -50,9 +50,9 @@ struct my_llama_layer {
struct ggml_tensor * ffn_norm; struct ggml_tensor * ffn_norm;
// ff // ff
struct ggml_tensor * w1; struct ggml_tensor * ffn_gate; // w1
struct ggml_tensor * w2; struct ggml_tensor * ffn_down; // w2
struct ggml_tensor * w3; struct ggml_tensor * ffn_up; // w3
}; };
struct my_llama_model { struct my_llama_model {
@ -140,9 +140,9 @@ static void set_param_model(struct my_llama_model * model) {
ggml_set_param(ctx, layer.wv); ggml_set_param(ctx, layer.wv);
ggml_set_param(ctx, layer.wo); ggml_set_param(ctx, layer.wo);
ggml_set_param(ctx, layer.ffn_norm); ggml_set_param(ctx, layer.ffn_norm);
ggml_set_param(ctx, layer.w1); ggml_set_param(ctx, layer.ffn_gate);
ggml_set_param(ctx, layer.w2); ggml_set_param(ctx, layer.ffn_down);
ggml_set_param(ctx, layer.w3); ggml_set_param(ctx, layer.ffn_up);
} }
} }
@ -198,9 +198,9 @@ static void init_model(struct my_llama_model * model) {
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); layer.ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); layer.ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); layer.ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i)); ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i));
@ -211,9 +211,9 @@ static void init_model(struct my_llama_model * model) {
ggml_set_name(layer.ffn_norm, tni(LLM_TENSOR_FFN_NORM, i)); ggml_set_name(layer.ffn_norm, tni(LLM_TENSOR_FFN_NORM, i));
ggml_set_name(layer.w1, tni(LLM_TENSOR_FFN_GATE, i)); ggml_set_name(layer.ffn_gate, tni(LLM_TENSOR_FFN_GATE, i));
ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i)); ggml_set_name(layer.ffn_down, tni(LLM_TENSOR_FFN_DOWN, i));
ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i)); ggml_set_name(layer.ffn_up, tni(LLM_TENSOR_FFN_UP, i));
} }
set_param_model(model); set_param_model(model);
@ -244,9 +244,9 @@ static void randomize_model(struct my_llama_model * model, int seed, float mean,
randomize_tensor_normal(layer.ffn_norm, rnd); randomize_tensor_normal(layer.ffn_norm, rnd);
randomize_tensor_normal(layer.w1, rnd); randomize_tensor_normal(layer.ffn_gate, rnd);
randomize_tensor_normal(layer.w2, rnd); randomize_tensor_normal(layer.ffn_down, rnd);
randomize_tensor_normal(layer.w3, rnd); randomize_tensor_normal(layer.ffn_up, rnd);
} }
free_random_normal_distribution(rnd); free_random_normal_distribution(rnd);
@ -356,11 +356,11 @@ static struct ggml_tensor * llama_build_train_graphs(
struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, f_norm_rms_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, f_norm_rms_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch);
struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch);
struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch);
struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch);
struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch);
cur = t30; cur = t30;
checkpoints.push_back(cur); checkpoints.push_back(cur);
@ -521,9 +521,9 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex
copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i)); copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i));
copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i)); copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i));
copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i)); copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i));
copy_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); copy_tensor_by_name(layer.ffn_gate, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
copy_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); copy_tensor_by_name(layer.ffn_down, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
copy_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); copy_tensor_by_name(layer.ffn_up, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
} }
} }
@ -664,9 +664,9 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo
gguf_add_tensor(fctx, layer.wv); gguf_add_tensor(fctx, layer.wv);
gguf_add_tensor(fctx, layer.wo); gguf_add_tensor(fctx, layer.wo);
gguf_add_tensor(fctx, layer.ffn_norm); gguf_add_tensor(fctx, layer.ffn_norm);
gguf_add_tensor(fctx, layer.w1); gguf_add_tensor(fctx, layer.ffn_gate);
gguf_add_tensor(fctx, layer.w2); gguf_add_tensor(fctx, layer.ffn_down);
gguf_add_tensor(fctx, layer.w3); gguf_add_tensor(fctx, layer.ffn_up);
} }
} }
@ -915,9 +915,9 @@ static int64_t get_parameter_count(struct my_llama_model* model) {
nx += ggml_nelements(layer.wv); nx += ggml_nelements(layer.wv);
nx += ggml_nelements(layer.wo); nx += ggml_nelements(layer.wo);
nx += ggml_nelements(layer.ffn_norm); nx += ggml_nelements(layer.ffn_norm);
nx += ggml_nelements(layer.w1); nx += ggml_nelements(layer.ffn_gate);
nx += ggml_nelements(layer.w2); nx += ggml_nelements(layer.ffn_down);
nx += ggml_nelements(layer.w3); nx += ggml_nelements(layer.ffn_up);
} }
return nx; return nx;
} }

View file

@ -219,6 +219,10 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void *
GGML_ASSERT(buf != NULL && "tensor buffer not set"); GGML_ASSERT(buf != NULL && "tensor buffer not set");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
if (!size) {
return;
}
tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size); tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
} }
@ -229,6 +233,10 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void *
GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set"); GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
if (!size) {
return;
}
tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size); tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
} }

View file

@ -0,0 +1,45 @@
#!/usr/bin/env python3
import sys
from pathlib import Path
from gguf.gguf_reader import GGUFReader
sys.path.insert(0, str(Path(__file__).parent.parent))
def read_gguf_file(gguf_file_path):
"""
Reads and prints key-value pairs and tensor information from a GGUF file in an improved format.
Parameters:
- gguf_file_path: Path to the GGUF file.
"""
reader = GGUFReader(gguf_file_path)
# List all key-value pairs in a columnized format
print("Key-Value Pairs:")
max_key_length = max(len(key) for key in reader.fields.keys())
for key, field in reader.fields.items():
value = field.parts[field.data[0]]
print(f"{key:{max_key_length}} : {value}")
print("----")
# List all tensors
print("Tensors:")
tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}"
print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization"))
print("-" * 80)
for tensor in reader.tensors:
shape_str = "x".join(map(str, tensor.shape))
size_str = str(tensor.n_elements)
quantization_str = tensor.tensor_type.name
print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str))
if __name__ == '__main__':
if len(sys.argv) < 2:
print("Usage: reader.py <path_to_gguf_file>")
sys.exit(1)
gguf_file_path = sys.argv[1]
read_gguf_file(gguf_file_path)

View file

@ -40,6 +40,7 @@ class Keys:
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
EXPERT_COUNT = "{arch}.expert_count" EXPERT_COUNT = "{arch}.expert_count"
EXPERT_USED_COUNT = "{arch}.expert_used_count" EXPERT_USED_COUNT = "{arch}.expert_used_count"
POOLING_LAYER = "{arch}.pooling_layer"
class Attention: class Attention:
HEAD_COUNT = "{arch}.attention.head_count" HEAD_COUNT = "{arch}.attention.head_count"
@ -86,27 +87,28 @@ class Keys:
class MODEL_ARCH(IntEnum): class MODEL_ARCH(IntEnum):
LLAMA = auto() LLAMA = auto()
FALCON = auto() FALCON = auto()
BAICHUAN = auto() BAICHUAN = auto()
GPT2 = auto() GPT2 = auto()
GPTJ = auto() GPTJ = auto()
GPTNEOX = auto() GPTNEOX = auto()
MPT = auto() MPT = auto()
STARCODER = auto() STARCODER = auto()
PERSIMMON = auto() PERSIMMON = auto()
REFACT = auto() REFACT = auto()
BERT = auto() BERT = auto()
BLOOM = auto() NOMIC_BERT = auto()
STABLELM = auto() BLOOM = auto()
QWEN = auto() STABLELM = auto()
QWEN2 = auto() QWEN = auto()
PHI2 = auto() QWEN2 = auto()
PLAMO = auto() PHI2 = auto()
CODESHELL = auto() PLAMO = auto()
ORION = auto() CODESHELL = auto()
ORION = auto()
INTERNLM2 = auto() INTERNLM2 = auto()
MINICPM = auto() MINICPM = auto()
class MODEL_TENSOR(IntEnum): class MODEL_TENSOR(IntEnum):
@ -152,6 +154,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.PERSIMMON: "persimmon", MODEL_ARCH.PERSIMMON: "persimmon",
MODEL_ARCH.REFACT: "refact", MODEL_ARCH.REFACT: "refact",
MODEL_ARCH.BERT: "bert", MODEL_ARCH.BERT: "bert",
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.BLOOM: "bloom",
MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.STABLELM: "stablelm",
MODEL_ARCH.QWEN: "qwen", MODEL_ARCH.QWEN: "qwen",
@ -281,6 +284,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.LAYER_OUT_NORM, MODEL_TENSOR.LAYER_OUT_NORM,
], ],
MODEL_ARCH.NOMIC_BERT: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.TOKEN_TYPES,
MODEL_TENSOR.POS_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_OUT_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.LAYER_OUT_NORM,
],
MODEL_ARCH.MPT: [ MODEL_ARCH.MPT: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,

View file

@ -360,6 +360,9 @@ class GGUFWriter:
def add_causal_attention(self, value: bool) -> None: def add_causal_attention(self, value: bool) -> None:
self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
def add_pooling_layer(self, value: bool) -> None:
self.add_bool(Keys.LLM.POOLING_LAYER.format(arch=self.arch), value)
def add_rope_dimension_count(self, count: int) -> None: def add_rope_dimension_count(self, count: int) -> None:
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)

View file

@ -15,7 +15,7 @@ class TensorNameMap:
"word_embeddings", # bloom "word_embeddings", # bloom
"model.embed_tokens", # llama-hf "model.embed_tokens", # llama-hf
"tok_embeddings", # llama-pth "tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert "embeddings.word_embeddings", # bert nomic-bert
"language_model.embedding.word_embeddings", # persimmon "language_model.embedding.word_embeddings", # persimmon
"wte", # gpt2 "wte", # gpt2
"transformer.embd.wte", # phi2 "transformer.embd.wte", # phi2
@ -24,13 +24,14 @@ class TensorNameMap:
# Token type embeddings # Token type embeddings
MODEL_TENSOR.TOKEN_TYPES: ( MODEL_TENSOR.TOKEN_TYPES: (
"embeddings.token_type_embeddings", # bert "embeddings.token_type_embeddings", # bert nomic-bert
), ),
# Normalization of token embeddings # Normalization of token embeddings
MODEL_TENSOR.TOKEN_EMBD_NORM: ( MODEL_TENSOR.TOKEN_EMBD_NORM: (
"word_embeddings_layernorm", # bloom "word_embeddings_layernorm", # bloom
"embeddings.LayerNorm", # bert "embeddings.LayerNorm", # bert
"emb_ln", # nomic-bert
), ),
# Position embeddings # Position embeddings
@ -103,6 +104,7 @@ class TensorNameMap:
"model.layers.{bid}.self_attn.query_key_value", # persimmon "model.layers.{bid}.self_attn.query_key_value", # persimmon
"h.{bid}.attn.c_attn", # gpt2 "h.{bid}.attn.c_attn", # gpt2
"transformer.h.{bid}.mixer.Wqkv", # phi2 "transformer.h.{bid}.mixer.Wqkv", # phi2
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
), ),
# Attention query # Attention query
@ -152,11 +154,13 @@ class TensorNameMap:
"transformer.h.{bid}.mixer.out_proj", # phi2 "transformer.h.{bid}.mixer.out_proj", # phi2
"model.layers.layers.{bid}.self_attn.o_proj", # plamo "model.layers.layers.{bid}.self_attn.o_proj", # plamo
"model.layers.{bid}.attention.wo", # internlm2 "model.layers.{bid}.attention.wo", # internlm2
"encoder.layers.{bid}.attn.out_proj", # nomic-bert
), ),
# Attention output norm # Attention output norm
MODEL_TENSOR.ATTN_OUT_NORM: ( MODEL_TENSOR.ATTN_OUT_NORM: (
"encoder.layer.{bid}.attention.output.LayerNorm", # bert "encoder.layer.{bid}.attention.output.LayerNorm", # bert
"encoder.layers.{bid}.norm1", # nomic-bert
), ),
# Rotary embeddings # Rotary embeddings
@ -205,6 +209,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.fc1", # phi2 "model.layers.{bid}.mlp.fc1", # phi2
"model.layers.layers.{bid}.mlp.up_proj", # plamo "model.layers.layers.{bid}.mlp.up_proj", # plamo
"model.layers.{bid}.feed_forward.w3", # internlm2 "model.layers.{bid}.feed_forward.w3", # internlm2
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
), ),
MODEL_TENSOR.FFN_UP_EXP: ( MODEL_TENSOR.FFN_UP_EXP: (
@ -224,6 +229,7 @@ class TensorNameMap:
"transformer.h.{bid}.mlp.w2", # qwen "transformer.h.{bid}.mlp.w2", # qwen
"model.layers.layers.{bid}.mlp.gate_proj", # plamo "model.layers.layers.{bid}.mlp.gate_proj", # plamo
"model.layers.{bid}.feed_forward.w1", # internlm2 "model.layers.{bid}.feed_forward.w1", # internlm2
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
), ),
MODEL_TENSOR.FFN_GATE_EXP: ( MODEL_TENSOR.FFN_GATE_EXP: (
@ -249,6 +255,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.fc2", # phi2 "model.layers.{bid}.mlp.fc2", # phi2
"model.layers.layers.{bid}.mlp.down_proj", # plamo "model.layers.layers.{bid}.mlp.down_proj", # plamo
"model.layers.{bid}.feed_forward.w2", # internlm2 "model.layers.{bid}.feed_forward.w2", # internlm2
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
), ),
MODEL_TENSOR.FFN_DOWN_EXP: ( MODEL_TENSOR.FFN_DOWN_EXP: (
@ -272,6 +279,7 @@ class TensorNameMap:
MODEL_TENSOR.LAYER_OUT_NORM: ( MODEL_TENSOR.LAYER_OUT_NORM: (
"encoder.layer.{bid}.output.LayerNorm", # bert "encoder.layer.{bid}.output.LayerNorm", # bert
"encoder.layers.{bid}.norm2", # nomic-bert
) )
} }

325
llama.cpp
View file

@ -197,6 +197,7 @@ enum llm_arch {
LLM_ARCH_PERSIMMON, LLM_ARCH_PERSIMMON,
LLM_ARCH_REFACT, LLM_ARCH_REFACT,
LLM_ARCH_BERT, LLM_ARCH_BERT,
LLM_ARCH_NOMIC_BERT,
LLM_ARCH_BLOOM, LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM, LLM_ARCH_STABLELM,
LLM_ARCH_QWEN, LLM_ARCH_QWEN,
@ -211,27 +212,28 @@ enum llm_arch {
}; };
static std::map<llm_arch, const char *> LLM_ARCH_NAMES = { static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" }, { LLM_ARCH_LLAMA, "llama" },
{ LLM_ARCH_FALCON, "falcon" }, { LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GPT2, "gpt2" }, { LLM_ARCH_GPT2, "gpt2" },
{ LLM_ARCH_GPTJ, "gptj" }, { LLM_ARCH_GPTJ, "gptj" },
{ LLM_ARCH_GPTNEOX, "gptneox" }, { LLM_ARCH_GPTNEOX, "gptneox" },
{ LLM_ARCH_MPT, "mpt" }, { LLM_ARCH_MPT, "mpt" },
{ LLM_ARCH_BAICHUAN, "baichuan" }, { LLM_ARCH_BAICHUAN, "baichuan" },
{ LLM_ARCH_STARCODER, "starcoder" }, { LLM_ARCH_STARCODER, "starcoder" },
{ LLM_ARCH_PERSIMMON, "persimmon" }, { LLM_ARCH_PERSIMMON, "persimmon" },
{ LLM_ARCH_REFACT, "refact" }, { LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BERT, "bert" }, { LLM_ARCH_BERT, "bert" },
{ LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
{ LLM_ARCH_STABLELM, "stablelm" }, { LLM_ARCH_BLOOM, "bloom" },
{ LLM_ARCH_QWEN, "qwen" }, { LLM_ARCH_STABLELM, "stablelm" },
{ LLM_ARCH_QWEN2, "qwen2" }, { LLM_ARCH_QWEN, "qwen" },
{ LLM_ARCH_PHI2, "phi2" }, { LLM_ARCH_QWEN2, "qwen2" },
{ LLM_ARCH_PLAMO, "plamo" }, { LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_CODESHELL, "codeshell" }, { LLM_ARCH_PLAMO, "plamo" },
{ LLM_ARCH_ORION, "orion" }, { LLM_ARCH_CODESHELL, "codeshell" },
{ LLM_ARCH_INTERNLM2, "internlm2" }, { LLM_ARCH_ORION, "orion" },
{ LLM_ARCH_MINICPM, "minicpm" }, { LLM_ARCH_INTERNLM2, "internlm2" },
{ LLM_ARCH_MINICPM, "minicpm" },
}; };
enum llm_kv { enum llm_kv {
@ -254,6 +256,7 @@ enum llm_kv {
LLM_KV_TENSOR_DATA_LAYOUT, LLM_KV_TENSOR_DATA_LAYOUT,
LLM_KV_EXPERT_COUNT, LLM_KV_EXPERT_COUNT,
LLM_KV_EXPERT_USED_COUNT, LLM_KV_EXPERT_USED_COUNT,
LLM_KV_POOLING_LAYER,
LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV, LLM_KV_ATTENTION_HEAD_COUNT_KV,
@ -311,6 +314,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" }, { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" }, { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" }, { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
{ LLM_KV_POOLING_LAYER, "%s.pooling_layer" },
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@ -373,6 +377,7 @@ enum llm_tensor {
LLM_TENSOR_ATTN_OUT, LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2, LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_OUT_NORM,
LLM_TENSOR_ATTN_ROT_EMBD, LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE_INP, LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_NORM, LLM_TENSOR_FFN_NORM,
@ -385,6 +390,7 @@ enum llm_tensor {
LLM_TENSOR_FFN_UP_EXP, LLM_TENSOR_FFN_UP_EXP,
LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM, LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_LAYER_OUT_NORM,
}; };
static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = { static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@ -550,12 +556,27 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_TOKEN_TYPES, "token_types" }, { LLM_TENSOR_TOKEN_TYPES, "token_types" },
{ LLM_TENSOR_POS_EMBD, "position_embd" }, { LLM_TENSOR_POS_EMBD, "position_embd" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" }, { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" }, { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_NOMIC_BERT,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
}, },
@ -1483,6 +1504,7 @@ enum e_model {
MODEL_22M, MODEL_22M,
MODEL_33M, MODEL_33M,
MODEL_109M, MODEL_109M,
MODEL_137M,
MODEL_335M, MODEL_335M,
MODEL_0_5B, MODEL_0_5B,
MODEL_1B, MODEL_1B,
@ -1539,6 +1561,7 @@ struct llama_hparams {
float f_max_alibi_bias; float f_max_alibi_bias;
bool causal_attn = true; bool causal_attn = true;
bool pooling_layer = false;
bool operator!=(const llama_hparams & other) const { bool operator!=(const llama_hparams & other) const {
@ -1601,6 +1624,7 @@ struct llama_cparams {
bool mul_mat_q; bool mul_mat_q;
bool offload_kqv; bool offload_kqv;
bool do_pooling;
ggml_backend_sched_eval_callback cb_eval; ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data; void * cb_eval_user_data;
@ -1616,6 +1640,8 @@ struct llama_layer {
struct ggml_tensor * attn_q_norm_b; struct ggml_tensor * attn_q_norm_b;
struct ggml_tensor * attn_k_norm; struct ggml_tensor * attn_k_norm;
struct ggml_tensor * attn_k_norm_b; struct ggml_tensor * attn_k_norm_b;
struct ggml_tensor * attn_out_norm;
struct ggml_tensor * attn_out_norm_b;
// attention // attention
struct ggml_tensor * wq; struct ggml_tensor * wq;
@ -1634,6 +1660,8 @@ struct llama_layer {
// normalization // normalization
struct ggml_tensor * ffn_norm; struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b; struct ggml_tensor * ffn_norm_b;
struct ggml_tensor * layer_out_norm;
struct ggml_tensor * layer_out_norm_b;
// ff // ff
struct ggml_tensor * ffn_gate; // w1 struct ggml_tensor * ffn_gate; // w1
@ -1896,7 +1924,7 @@ struct llama_context {
struct ggml_tensor * inp_pos; // I32 [n_batch] struct ggml_tensor * inp_pos; // I32 [n_batch]
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch] struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
struct ggml_tensor * inp_K_shift; // I32 [n_ctx] struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
struct ggml_tensor * inp_sum; // F32 [1, n_batch] struct ggml_tensor * inp_sum; // F32 [n_batch, n_batch]
#ifdef GGML_USE_MPI #ifdef GGML_USE_MPI
ggml_mpi_context * ctx_mpi = NULL; ggml_mpi_context * ctx_mpi = NULL;
@ -2851,6 +2879,11 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
static const char * llama_model_type_name(e_model type) { static const char * llama_model_type_name(e_model type) {
switch (type) { switch (type) {
case MODEL_22M: return "22M";
case MODEL_33M: return "33M";
case MODEL_109M: return "109M";
case MODEL_137M: return "137M";
case MODEL_0_5B: return "0.5B";
case MODEL_1B: return "1B"; case MODEL_1B: return "1B";
case MODEL_2B: return "2B"; case MODEL_2B: return "2B";
case MODEL_3B: return "3B"; case MODEL_3B: return "3B";
@ -3053,6 +3086,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 3: case 3:
@ -3068,6 +3102,17 @@ static void llm_load_hparams(
model.type = e_model::MODEL_335M; break; // bge-large model.type = e_model::MODEL_335M; break; // bge-large
} }
} break; } break;
case LLM_ARCH_NOMIC_BERT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
model.type = e_model::MODEL_137M;
}
} break;
case LLM_ARCH_BLOOM: case LLM_ARCH_BLOOM:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@ -3309,7 +3354,12 @@ static void llm_load_vocab(
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); try {
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
} catch (const std::exception & e) {
LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
vocab.linefeed_id = vocab.special_pad_id;
}
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) { } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
vocab.linefeed_id = vocab.special_pad_id; vocab.linefeed_id = vocab.special_pad_id;
} else { } else {
@ -3865,10 +3915,14 @@ static bool llm_load_tensors(
} }
} break; } break;
case LLM_ARCH_BERT: case LLM_ARCH_BERT:
case LLM_ARCH_NOMIC_BERT:
{ {
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); if (model.arch == LLM_ARCH_BERT) {
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
}
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
@ -3878,29 +3932,38 @@ static bool llm_load_tensors(
auto & layer = model.layers[i]; auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); if (model.arch == LLM_ARCH_BERT) {
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}); layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
} else {
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
}
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); if (model.arch == LLM_ARCH_BERT) {
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); } else {
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
}
layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
} }
} break; } break;
case LLM_ARCH_BLOOM: case LLM_ARCH_BLOOM:
@ -4379,9 +4442,21 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
model.hparams.vocab_only = params.vocab_only; model.hparams.vocab_only = params.vocab_only;
llm_load_arch (ml, model); try {
llm_load_hparams(ml, model); llm_load_arch(ml, model);
llm_load_vocab (ml, model); } catch(const std::exception & e) {
throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
}
try {
llm_load_hparams(ml, model);
} catch(const std::exception & e) {
throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
}
try {
llm_load_vocab(ml, model);
} catch(const std::exception & e) {
throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
}
llm_load_print_meta(ml, model); llm_load_print_meta(ml, model);
@ -4859,7 +4934,7 @@ struct llm_build_context {
const int32_t n_orig_ctx; const int32_t n_orig_ctx;
const bool do_rope_shift; const bool do_rope_shift;
const bool causal_attn; const bool do_pooling;
const llm_build_cb & cb; const llm_build_cb & cb;
@ -4903,7 +4978,7 @@ struct llm_build_context {
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
n_orig_ctx (cparams.n_yarn_orig_ctx), n_orig_ctx (cparams.n_yarn_orig_ctx),
do_rope_shift (worst_case || kv_self.has_shift), do_rope_shift (worst_case || kv_self.has_shift),
causal_attn (hparams.causal_attn), do_pooling (hparams.pooling_layer && cparams.do_pooling),
cb (cb), cb (cb),
buf_compute_meta (lctx.buf_compute_meta) { buf_compute_meta (lctx.buf_compute_meta) {
// all initializations should be done in init() // all initializations should be done in init()
@ -5751,22 +5826,26 @@ struct llm_build_context {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
// get input vectors with right size // get input vectors with right size
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
struct ggml_tensor * inp_sum = ggml_view_1d(ctx0, lctx.inp_sum, n_tokens, 0); struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0);
// construct input embeddings (token, type, position) // construct input embeddings (token, type, position)
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
// token types are hardcoded to zero ("Sentence A") // token types are hardcoded to zero ("Sentence A")
struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
inpL = ggml_add(ctx0, inpL, type_row0); inpL = ggml_add(ctx0, inpL, type_row0);
inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL); if (model.arch == LLM_ARCH_BERT) {
inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
}
cb(inpL, "inp_embd", -1); cb(inpL, "inp_embd", -1);
// embed layer norm // embed layer norm
@ -5782,7 +5861,7 @@ struct llm_build_context {
struct ggml_tensor * cur = inpL; struct ggml_tensor * cur = inpL;
// self-attention // self-attention
{ if (model.arch == LLM_ARCH_BERT) {
struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq); struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
@ -5795,6 +5874,37 @@ struct llm_build_context {
// seems like we just need to do this for Q? // seems like we just need to do this for Q?
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
} else {
// compute Q and K and RoPE them
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
@ -5805,25 +5915,34 @@ struct llm_build_context {
cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, inpL);
// attention layer norm // attention layer norm
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il); cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
struct ggml_tensor * ffn_inp = cur; struct ggml_tensor * ffn_inp = cur;
cb(ffn_inp, "ffn_inp", il); cb(ffn_inp, "ffn_inp", il);
// feed-forward network // feed-forward network
cur = llm_build_ffn(ctx0, cur, if (model.arch == LLM_ARCH_BERT) {
model.layers[il].ffn_up, model.layers[il].ffn_up_b, cur = llm_build_ffn(ctx0, cur,
NULL, NULL, model.layers[il].ffn_up, model.layers[il].ffn_up_b,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
} else {
cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL,
model.layers[il].ffn_gate, NULL,
model.layers[il].ffn_down, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
}
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
// attentions bypass the intermediate layer // attentions bypass the intermediate layer
cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, ffn_inp);
// output layer norm // output layer norm
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
// input for next layer // input for next layer
inpL = cur; inpL = cur;
@ -5832,9 +5951,11 @@ struct llm_build_context {
// final output // final output
cur = inpL; cur = inpL;
// pooling // pooling layer
cur = ggml_mul_mat(ctx0, inp_sum, ggml_cont(ctx0, ggml_transpose(ctx0, cur))); if (do_pooling) {
cb(cur, "result_embed", -1); cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
}
cb(cur, "result_embd", -1);
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
@ -7264,6 +7385,7 @@ static struct ggml_cgraph * llama_build_graph(
result = llm.build_refact(); result = llm.build_refact();
} break; } break;
case LLM_ARCH_BERT: case LLM_ARCH_BERT:
case LLM_ARCH_NOMIC_BERT:
{ {
result = llm.build_bert(); result = llm.build_bert();
} break; } break;
@ -7367,7 +7489,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
for (int i = 0; i < n_kv; ++i) { for (int i = 0; i < n_kv; ++i) {
float f; float f;
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
(hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
f = -INFINITY; f = -INFINITY;
} else { } else {
f = 0; f = 0;
@ -7378,7 +7501,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
} }
} }
{ {
assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer)); assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
float * data = (float *) lctx.inp_sum->data; float * data = (float *) lctx.inp_sum->data;
@ -7399,6 +7521,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
data[i] = lctx.kv_self.cells[i].delta; data[i] = lctx.kv_self.cells[i].delta;
} }
} }
if (hparams.pooling_layer && cparams.do_pooling) {
const int64_t n_tokens = batch.n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
float * data = (float *) lctx.inp_sum->data;
memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
for (int i = 0; i < n_tokens; ++i) {
const llama_seq_id seq_id = batch.seq_id[i][0];
data[seq_id*n_tokens + i] = 1.0f;
}
}
} }
// decode a batch of tokens by evaluating the transformer // decode a batch of tokens by evaluating the transformer
@ -7510,7 +7646,7 @@ static int llama_decode_internal(
embeddings = gf->nodes[gf->n_nodes - 3]; embeddings = gf->nodes[gf->n_nodes - 3];
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
} }
} else if (strcmp(res->name, "result_embed") == 0) { } else if (strcmp(res->name, "result_embd") == 0) {
embeddings = res; embeddings = res;
res = nullptr; res = nullptr;
} else { } else {
@ -7630,11 +7766,12 @@ static int llama_decode_internal(
if (!lctx.embedding.empty()) { if (!lctx.embedding.empty()) {
auto & embedding_out = lctx.embedding; auto & embedding_out = lctx.embedding;
const int64_t embed_pos = res ? n_embd * (n_tokens-1) : 0; const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
embedding_out.resize(n_embd); embedding_out.resize(embd_size);
ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings); ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embed_pos*sizeof(float), n_embd*sizeof(float)); ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
ggml_backend_synchronize(embeddings_backend); ggml_backend_synchronize(embeddings_backend);
} }
@ -7711,7 +7848,13 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
switch (llama_vocab_get_type(vocab)) { switch (llama_vocab_get_type(vocab)) {
case LLAMA_VOCAB_TYPE_SPM: { case LLAMA_VOCAB_TYPE_SPM: {
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
return vocab.token_to_id.at(buf); auto token = vocab.token_to_id.find(buf);
if (token != vocab.token_to_id.end()) {
return (*token).second;
}
// Try to fall back to just the byte as a string
const char buf2[2] = { (char)ch, 0 };
return vocab.token_to_id.at(buf2);
} }
case LLAMA_VOCAB_TYPE_WPM: case LLAMA_VOCAB_TYPE_WPM:
case LLAMA_VOCAB_TYPE_BPE: { case LLAMA_VOCAB_TYPE_BPE: {
@ -7759,7 +7902,7 @@ struct llm_bigram_spm {
}; };
struct llm_tokenizer_spm { struct llm_tokenizer_spm {
llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {} llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) { void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
// split string into utf8 chars // split string into utf8 chars
@ -7834,6 +7977,7 @@ private:
if (p == rev_merge.end()) { if (p == rev_merge.end()) {
// output any symbols that did not form tokens as bytes. // output any symbols that did not form tokens as bytes.
output.reserve(output.size() + symbol.n);
for (int j = 0; j < (int)symbol.n; ++j) { for (int j = 0; j < (int)symbol.n; ++j) {
llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]); llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
output.push_back(token_id); output.push_back(token_id);
@ -8396,17 +8540,18 @@ struct fragment_buffer_variant {
token(_token), token(_token),
raw_text(_dummy), raw_text(_dummy),
offset(0), offset(0),
length(0){} length(0) {}
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length) fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
: :
type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT), type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
token((llama_vocab::id)-1), token((llama_vocab::id) - 1),
raw_text(_raw_text), raw_text(_raw_text),
offset(_offset), offset(_offset),
length(_length){ length(_length){
GGML_ASSERT( _offset >= 0 ); GGML_ASSERT(_offset >= 0);
GGML_ASSERT( _length >= 1 ); GGML_ASSERT(_length >= 1);
GGML_ASSERT( offset + length <= raw_text.length() ); GGML_ASSERT(offset + length <= raw_text.length());
} }
const FRAGMENT_BUFFER_VARIANT_TYPE type; const FRAGMENT_BUFFER_VARIANT_TYPE type;
@ -8530,14 +8675,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
} }
std::forward_list<fragment_buffer_variant> fragment_buffer; std::forward_list<fragment_buffer_variant> fragment_buffer;
fragment_buffer.emplace_front( raw_text, 0, raw_text.length() ); fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
if (special) tokenizer_st_partition( vocab, fragment_buffer ); if (special) tokenizer_st_partition(vocab, fragment_buffer);
switch (vocab.type) { switch (vocab.type) {
case LLAMA_VOCAB_TYPE_SPM: case LLAMA_VOCAB_TYPE_SPM:
{ {
for (const auto & fragment: fragment_buffer) { for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
// without adding this leading whitespace, we do not get the same results as the original tokenizer // without adding this leading whitespace, we do not get the same results as the original tokenizer
@ -8565,7 +8710,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
} break; } break;
case LLAMA_VOCAB_TYPE_BPE: case LLAMA_VOCAB_TYPE_BPE:
{ {
for (const auto & fragment: fragment_buffer) { for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@ -8581,7 +8726,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
} break; } break;
case LLAMA_VOCAB_TYPE_WPM: case LLAMA_VOCAB_TYPE_WPM:
{ {
for (const auto & fragment: fragment_buffer) { for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@ -10444,7 +10589,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
quantize &= !params->only_copy; quantize &= !params->only_copy;
// do not quantize expert gating tensors // do not quantize expert gating tensors
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
// do not quantize positional embeddings and token types (BERT)
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
enum ggml_type new_type; enum ggml_type new_type;
void * new_data; void * new_data;
@ -10946,6 +11095,7 @@ struct llama_context_params llama_context_default_params() {
/*.logits_all =*/ false, /*.logits_all =*/ false,
/*.embedding =*/ false, /*.embedding =*/ false,
/*.offload_kqv =*/ true, /*.offload_kqv =*/ true,
/*.do_pooling =*/ true,
}; };
return result; return result;
@ -11101,6 +11251,7 @@ struct llama_context * llama_new_context_with_model(
cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.mul_mat_q = params.mul_mat_q; cparams.mul_mat_q = params.mul_mat_q;
cparams.offload_kqv = params.offload_kqv; cparams.offload_kqv = params.offload_kqv;
cparams.do_pooling = params.do_pooling;
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@ -11248,7 +11399,7 @@ struct llama_context * llama_new_context_with_model(
// resized during inference, reserve maximum // resized during inference, reserve maximum
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch); ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
if (params.embedding){ if (params.embedding) {
ctx->embedding.resize(hparams.n_embd); ctx->embedding.resize(hparams.n_embd);
} }
@ -11266,7 +11417,7 @@ struct llama_context * llama_new_context_with_model(
ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch); ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx); ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, 1, cparams.n_batch); ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
ggml_set_name(ctx->inp_tokens, "inp_tokens"); ggml_set_name(ctx->inp_tokens, "inp_tokens");
ggml_set_name(ctx->inp_embd, "inp_embd"); ggml_set_name(ctx->inp_embd, "inp_embd");
@ -12124,6 +12275,10 @@ float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embedding.data(); return ctx->embedding.data();
} }
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
}
const char * llama_token_get_text(const struct llama_model * model, llama_token token) { const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
return model->vocab.id_to_token[token].text.c_str(); return model->vocab.id_to_token[token].text.c_str();
} }

View file

@ -236,6 +236,7 @@ extern "C" {
bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
bool embedding; // embedding mode only bool embedding; // embedding mode only
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
}; };
// model quantization parameters // model quantization parameters
@ -628,6 +629,10 @@ extern "C" {
// shape: [n_embd] (1-dimensional) // shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Get the embeddings for the ith sequence
// llama_get_embeddings(ctx) + i*n_embd
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
// //
// Vocab // Vocab
// //

View file

@ -2129,14 +2129,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_pad()); test_cases.emplace_back(new test_pad());
test_cases.emplace_back(new test_leaky_relu()); test_cases.emplace_back(new test_leaky_relu());
// these tests are disabled to save execution time, but they can be handy for debugging
#if 0
#if !defined(__SANITIZE_THREAD__) #if !defined(__SANITIZE_THREAD__)
// FIXME: these tests use too much memory with thread sanitizer // FIXME: these tests use too much memory with thread sanitizer
test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024)); test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
//test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336)); //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
#endif #endif
// these tests are disabled to save execution time, but they can be handy for debugging
#if 0
test_cases.emplace_back(new test_llama(1)); test_cases.emplace_back(new test_llama(1));
test_cases.emplace_back(new test_llama(2)); test_cases.emplace_back(new test_llama(2));
test_cases.emplace_back(new test_falcon(1)); test_cases.emplace_back(new test_falcon(1));

View file

@ -4,13 +4,13 @@
#include "console.h" #include "console.h"
#include <cassert> #include <cassert>
#include <codecvt>
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <string>
#include <codecvt>
#include <map>
#include <vector>
#include <locale> #include <locale>
#include <string>
#include <thread>
#include <vector>
int main(int argc, char **argv) { int main(int argc, char **argv) {
if (argc < 2) { if (argc < 2) {
@ -74,45 +74,46 @@ int main(int argc, char **argv) {
} }
} }
catch (const std::invalid_argument &) { catch (const std::invalid_argument &) {
fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str()); //fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
} }
} }
for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) { // unicode
// NOTE: these exceptions seem to be necessary, because the GPT2 tokenizer doesn't want to interfere with some ASCII control characters {
if ((cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 && (cp < 0x13 || cp > 0x17) && cp != 0x19 && (cp < 0x1c || cp > 0x1e) && (cp < 0xd800 || cp > 0xdfff)) { const int nthread = std::thread::hardware_concurrency();
std::string str = " " + codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false); std::vector<std::thread> threads(nthread);
std::string check = llama_detokenize_bpe(ctx, tokens);
if (str != check) { for (int i = 0; i < nthread; ++i) {
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", threads[i] = std::thread([i, nthread, ctx]() {
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
return 3; if (!( // NOLINT
} (cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 &&
} (cp < 0x13 || cp > 0x17) && cp != 0x19 &&
} (cp < 0x1c || cp > 0x1e) &&
// Restrict to assigned unicode planes (cp < 0xd800 || cp > 0xdfff) &&
// for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) { (cp < 0x00040000 || cp >= 0x000e0000)
for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) { )) {
std::string str = codepoint_to_utf8(cp); continue;
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false); }
std::string check = llama_detokenize_bpe(ctx, tokens);
if (str != check) { std::string str = codepoint_to_utf8(cp);
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); std::string check = llama_detokenize_bpe(ctx, tokens);
return 4; if (cp != 9601 && str != check) {
} fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
} cp, check.c_str(), check.length(), str.c_str(), str.length());
for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) { std::exit(3);
std::string str = codepoint_to_utf8(cp); }
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false); }
std::string check = llama_detokenize_bpe(ctx, tokens); });
if (str != check) { }
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); for (auto & t : threads) {
return 4; t.join();
} }
} }
llama_free_model(model); llama_free_model(model);
llama_free(ctx); llama_free(ctx);

View file

@ -4,13 +4,13 @@
#include "console.h" #include "console.h"
#include <cassert> #include <cassert>
#include <codecvt>
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <string>
#include <codecvt>
#include <map>
#include <vector>
#include <locale> #include <locale>
#include <string>
#include <thread>
#include <vector>
int main(int argc, char **argv) { int main(int argc, char **argv) {
if (argc < 2) { if (argc < 2) {
@ -72,26 +72,33 @@ int main(int argc, char **argv) {
} }
} }
for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) { // unicode
if (cp < 0xd800 || cp > 0xdfff) { {
std::string str = codepoint_to_utf8(cp); const int nthread = std::thread::hardware_concurrency();
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens); std::vector<std::thread> threads(nthread);
if (cp != 9601 && str != check) {
fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", for (int i = 0; i < nthread; ++i) {
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); threads[i] = std::thread([i, nthread, ctx]() {
return 3; for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
} if (cp >= 0xd800 && cp <= 0xdfff) {
continue;
}
std::string str = codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens);
if (cp != 9601 && str != check) {
fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
cp, check.c_str(), check.length(), str.c_str(), str.length());
std::exit(3);
}
}
});
} }
}
for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) { for (auto & t : threads) {
std::string str = codepoint_to_utf8(cp); t.join();
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens);
if (str != check) {
fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
return 4;
} }
} }

View file

@ -264,26 +264,29 @@ static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
offset += 1; offset += 1;
return result; return result;
} }
else if (!(utf8[offset + 0] & 0x40)) { if (!(utf8[offset + 0] & 0x40)) {
throw std::invalid_argument("invalid character"); throw std::invalid_argument("invalid character");
} }
else if (!(utf8[offset + 0] & 0x20)) { if (!(utf8[offset + 0] & 0x20)) {
if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) {
throw std::invalid_argument("invalid character"); throw std::invalid_argument("invalid character");
}
auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f); auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
offset += 2; offset += 2;
return result; return result;
} }
else if (!(utf8[offset + 0] & 0x10)) { if (!(utf8[offset + 0] & 0x10)) {
if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) {
throw std::invalid_argument("invalid character"); throw std::invalid_argument("invalid character");
}
auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f); auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
offset += 3; offset += 3;
return result; return result;
} }
else if (!(utf8[offset + 0] & 0x08)) { if (!(utf8[offset + 0] & 0x08)) {
if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) {
throw std::invalid_argument("invalid character"); throw std::invalid_argument("invalid character");
}
auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f); auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
offset += 4; offset += 4;
return result; return result;
@ -331,21 +334,22 @@ static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t
offset += 1; offset += 1;
return result; return result;
} }
else {
if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
throw std::invalid_argument("invalid character"); throw std::invalid_argument("invalid character");
auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
offset += 2;
return result;
} }
throw std::invalid_argument("invalid string");
auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
offset += 2;
return result;
} }
static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) { static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
std::vector<uint32_t> result; std::vector<uint32_t> result;
size_t offset = 0; size_t offset = 0;
while (offset < utf16.size()) while (offset < utf16.size()) {
result.push_back(codepoint_from_utf16(utf16, offset)); result.push_back(codepoint_from_utf16(utf16, offset));
}
return result; return result;
} }
@ -361,44 +365,52 @@ static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> &
static std::unordered_map<uint32_t, int> codepoint_type_map() { static std::unordered_map<uint32_t, int> codepoint_type_map() {
std::unordered_map<uint32_t, int> codepoint_types; std::unordered_map<uint32_t, int> codepoint_types;
for (auto p : digit_ranges) { for (auto p : digit_ranges) {
for(auto i = p.first; i <= p.second; ++ i) for (auto i = p.first; i <= p.second; ++ i) {
codepoint_types[i] = CODEPOINT_TYPE_DIGIT; codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
}
} }
for(auto p : letter_ranges) { for (auto p : letter_ranges) {
for(auto i = p.first; i <= p.second; ++ i) for (auto i = p.first; i <= p.second; ++ i) {
codepoint_types[i] = CODEPOINT_TYPE_LETTER; codepoint_types[i] = CODEPOINT_TYPE_LETTER;
}
} }
for(auto p : whitespace_ranges) { for (auto p : whitespace_ranges) {
for(auto i = p.first; i <= p.second; ++ i) for (auto i = p.first; i <= p.second; ++ i) {
codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE; codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
}
} }
for(auto p : accent_mark_ranges) { for (auto p : accent_mark_ranges) {
for(auto i = p.first; i <= p.second; ++ i) for (auto i = p.first; i <= p.second; ++ i) {
codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK; codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
}
} }
for(auto p : punctuation_ranges) { for (auto p : punctuation_ranges) {
for(auto i = p.first; i <= p.second; ++ i) for (auto i = p.first; i <= p.second; ++ i) {
codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION; codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
}
} }
for (auto p : symbol_ranges) { for (auto p : symbol_ranges) {
for (auto i = p.first; i <= p.second; ++i) for (auto i = p.first; i <= p.second; ++i) {
codepoint_types[i] = CODEPOINT_TYPE_SYMBOL; codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
}
} }
for(auto p : control_ranges) { for (auto p : control_ranges) {
for(auto i = p.first; i <= p.second; ++ i) for (auto i = p.first; i <= p.second; ++ i) {
codepoint_types[i] = CODEPOINT_TYPE_CONTROL; codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
}
} }
return codepoint_types; return codepoint_types;
} }
static int codepoint_type(uint32_t cp) { static int codepoint_type(uint32_t cp) {
static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map(); static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
return codepoint_types[cp]; return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp);
} }
static int codepoint_type(const std::string & utf8) { static int codepoint_type(const std::string & utf8) {
if (utf8.length() == 0) if (utf8.length() == 0) {
return CODEPOINT_TYPE_UNIDENTIFIED; return CODEPOINT_TYPE_UNIDENTIFIED;
}
size_t offset = 0; size_t offset = 0;
return codepoint_type(codepoint_from_utf8(utf8, offset)); return codepoint_type(codepoint_from_utf8(utf8, offset));
} }