Merge branch 'master' into xsn/main_chat_template_2

ngxson 2024-06-24 11:00:00 +02:00
commit c530ce4c17
28 changed files with 22555 additions and 22484 deletions


@@ -33,15 +33,13 @@ jobs:
           - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
-          #                     have disabled them for now until the reason why
-          #                     is understood.
           - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
+          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
           - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
     steps:


@@ -30,7 +30,7 @@ jobs:
     strategy:
       matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
         build_type: [RelWithDebInfo]
         include:
           - build_type: Release


@@ -144,9 +144,6 @@ option(LLAMA_BUILD_SERVER "llama: build server example"
 option(LLAMA_LASX "llama: enable lasx" ON)
 option(LLAMA_LSX "llama: enable lsx" ON)

-# add perf arguments
-option(LLAMA_PERF "llama: enable perf" OFF)
-
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)

@@ -870,10 +867,6 @@ if (LLAMA_CPU_HBM)
     target_link_libraries(ggml PUBLIC memkind)
 endif()

-if (LLAMA_PERF)
-    add_compile_definitions(GGML_PERF)
-endif()
-
 function(get_flags CCID CCVER)
     set(C_FLAGS "")
     set(CXX_FLAGS "")


@@ -344,9 +344,6 @@ ifdef LLAMA_GPROF
     MK_CFLAGS   += -pg
     MK_CXXFLAGS += -pg
 endif
-ifdef LLAMA_PERF
-    MK_CPPFLAGS += -DGGML_PERF
-endif

 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures

File diff suppressed because it is too large


@@ -152,7 +152,6 @@ struct gpt_params {
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro  = false; // open the prompt cache read-only and do not update it
-    bool embedding        = false; // get only sentence embedding
     bool escape           = true;  // escape "\n", "\r", "\t", "\'", "\"", and "\\"
     bool multiline_input  = false; // reverse the usage of `\`
     bool simple_io        = false; // improves compatibility with subprocesses and limited consoles

@@ -179,6 +178,12 @@ struct gpt_params {
     std::string mmproj = "";        // path to multimodal projector
     std::vector<std::string> image; // path to image file(s)

+    // embedding
+    bool embedding = false;         // get only sentence embedding
+    int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    std::string embd_out = "";      // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
+    std::string embd_sep = "\n";    // separator of embeddings
+
     // server params
     int32_t port         = 8080; // server listens on this network port
     int32_t timeout_read = 600;  // http read timeout in seconds

@@ -400,7 +405,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
 // Embedding utils
 //

-void llama_embd_normalize(const float * inp, float * out, int n);
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

 float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);


@@ -80,7 +80,7 @@ class Model:
             if not self.is_safetensors:
                 self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
         if self.ftype == gguf.LlamaFileType.GUESSED:

@@ -973,8 +973,6 @@ class XverseModel(Model):
         if max_vocab_index >= vocab_size:
             raise ValueError("Vocabulary size exceeds expected maximum size.")

         reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()

@@ -1406,6 +1404,48 @@ class LlamaModel(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("BitnetForCausalLM")
class BitnetModel(Model):
model_arch = gguf.MODEL_ARCH.BITNET
def set_vocab(self):
self._set_vocab_sentencepiece()
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(1.0)
def weight_quant(self, weight):
dtype = weight.dtype
weight = weight.float()
s = 1 / weight.abs().mean().clamp(min=1e-5)
weight = (weight * s).round().clamp(-1, 1) / s
scale = weight.abs().max().unsqueeze(0)
weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
weight = torch.sign(weight).type(dtype)
return weight.type(dtype), scale.type(torch.float32)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
new_name = self.map_tensor_name(name)
if any(self.match_model_tensor_name(new_name, key, bid) for key in [
gguf.MODEL_TENSOR.ATTN_Q,
gguf.MODEL_TENSOR.ATTN_K,
gguf.MODEL_TENSOR.ATTN_V,
gguf.MODEL_TENSOR.ATTN_OUT,
gguf.MODEL_TENSOR.FFN_UP,
gguf.MODEL_TENSOR.FFN_DOWN,
gguf.MODEL_TENSOR.FFN_GATE,
]):
# transform weight into 1/0/-1 (in fp32)
weight_torch, scale_torch = self.weight_quant(data_torch)
yield (new_name, weight_torch)
yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
else:
yield (new_name, data_torch)
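As a reading aid, here is a self-contained sketch (not part of the diff) of what the `weight_quant` helper above produces; the sample tensor is made up:

```python
import torch

def weight_quant(weight: torch.Tensor):
    # same steps as BitnetModel.weight_quant above: scale by 1/mean(|w|),
    # round into {-1, 0, +1}, and report the per-tensor scale separately
    dtype = weight.dtype
    weight = weight.float()
    s = 1 / weight.abs().mean().clamp(min=1e-5)
    weight = (weight * s).round().clamp(-1, 1) / s
    scale = weight.abs().max().unsqueeze(0)
    weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
    weight = torch.sign(weight).type(dtype)
    return weight.type(dtype), scale.type(torch.float32)

w = torch.tensor([[0.40, -0.02, 0.31], [-0.27, 0.05, -0.33]])
q, scale = weight_quant(w)
print(q)      # every entry is -1, 0 or +1
print(scale)  # one float32 scale for the whole tensor (roughly mean(|w|))
```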
@Model.register("GrokForCausalLM") @Model.register("GrokForCausalLM")
class GrokModel(Model): class GrokModel(Model):
model_arch = gguf.MODEL_ARCH.GROK model_arch = gguf.MODEL_ARCH.GROK
@ -2731,6 +2771,124 @@ class DeepseekV2Model(Model):
raise ValueError(f"Unprocessed experts: {experts}") raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("T5ForConditionalGeneration")
@Model.register("T5WithLMHeadModel")
class T5Model(Model):
model_arch = gguf.MODEL_ARCH.T5
def set_vocab(self):
# to avoid TypeError: Descriptors cannot be created directly
# exception when importing sentencepiece_model_pb2
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
from sentencepiece import SentencePieceProcessor
from sentencepiece import sentencepiece_model_pb2 as model
tokenizer_path = self.dir_model / 'spiece.model'
if not tokenizer_path.is_file():
raise FileNotFoundError(f"File not found: {tokenizer_path}")
sentencepiece_model = model.ModelProto()
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
tokenizer = SentencePieceProcessor()
tokenizer.LoadFromFile(str(tokenizer_path))
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
for token_id in range(tokenizer.vocab_size()):
piece = tokenizer.IdToPiece(token_id)
text = piece.encode("utf-8")
score = tokenizer.GetScore(token_id)
toktype = SentencePieceTokenTypes.NORMAL
if tokenizer.IsUnknown(token_id):
toktype = SentencePieceTokenTypes.UNKNOWN
elif tokenizer.IsControl(token_id):
toktype = SentencePieceTokenTypes.CONTROL
elif tokenizer.IsUnused(token_id):
toktype = SentencePieceTokenTypes.UNUSED
elif tokenizer.IsByte(token_id):
toktype = SentencePieceTokenTypes.BYTE
tokens[token_id] = text
scores[token_id] = score
toktypes[token_id] = toktype
added_tokens_file = self.dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
added_tokens_json = json.load(f)
for key in added_tokens_json:
token_id = added_tokens_json[key]
if (token_id >= vocab_size):
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
tokens[token_id] = key.encode("utf-8")
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
if vocab_size > len(tokens):
pad_count = vocab_size - len(tokens)
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
for i in range(1, pad_count + 1):
tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.UNUSED)
self.gguf_writer.add_tokenizer_model("t5")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
self.gguf_writer.add_add_space_prefix(add_prefix)
self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
if precompiled_charsmap:
self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
self.gguf_writer.add_add_bos_token(False)
self.gguf_writer.add_add_eos_token(True)
def set_gguf_parameters(self):
self.gguf_writer.add_name("T5")
self.gguf_writer.add_context_length(self.hparams["n_positions"])
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
self.gguf_writer.add_block_count(self.hparams["num_layers"])
self.gguf_writer.add_head_count(self.hparams["num_heads"])
self.gguf_writer.add_key_length(self.hparams["d_kv"])
self.gguf_writer.add_value_length(self.hparams["d_kv"])
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
self.gguf_writer.add_file_type(self.ftype)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
# Sometimes T5 and Flan-T5 based models contain "encoder.embed_tokens.weight" tensor or
# "decoder.embed_tokens.weight" tensors that are duplicates of "shared.weight" tensor
# To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight".
if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight":
logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
return []
return [(self.map_tensor_name(name), data_torch)]
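For orientation (this is not part of the commit), the normalizer fields that `T5Model.set_vocab` reads can be inspected directly with the same `sentencepiece` APIs; the local `spiece.model` path below is an assumption:

```python
import os

# avoid the protobuf descriptor error mentioned in set_vocab above
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

from sentencepiece import SentencePieceProcessor
from sentencepiece import sentencepiece_model_pb2 as model

proto = model.ModelProto()
with open("spiece.model", "rb") as f:  # assumed local tokenizer file
    proto.ParseFromString(f.read())

print(proto.normalizer_spec.add_dummy_prefix)           # -> add_add_space_prefix(...)
print(proto.normalizer_spec.remove_extra_whitespaces)   # -> add_remove_extra_whitespaces(...)
print(len(proto.normalizer_spec.precompiled_charsmap))  # -> add_precompiled_charsmap(...)

sp = SentencePieceProcessor()
sp.LoadFromFile("spiece.model")
print(sp.vocab_size(), sp.IdToPiece(0), sp.GetScore(0))
```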
###### CONVERSION LOGIC ######


@@ -19,3 +19,43 @@ llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
```

The above command will output space-separated float values.
## extra parameters
### --embd-normalize $integer$
| $integer$ | description | formula |
|-----------|---------------------|---------|
| $-1$ | none |
| $0$ | max absolute int16 | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$
| $1$ | taxicab | $\Large{x_i \over\sum \lvert x_i\rvert}$
| $2$ | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$
| $>2$ | p-norm | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$
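As a reading aid for the table above (this sketch is not part of the README), the same formulas in NumPy; the function name and argument are made up:

```python
import numpy as np

def embd_normalize(x: np.ndarray, norm: int) -> np.ndarray:
    # norm follows the --embd-normalize convention documented above
    if norm < 0:   # -1: no normalization
        return x
    if norm == 0:  # 0: scale so the largest |x_i| maps to 32760 (int16 range)
        return x * (32760.0 / np.max(np.abs(x)))
    if norm == 1:  # 1: taxicab (L1) norm
        return x / np.sum(np.abs(x))
    if norm == 2:  # 2: euclidean (L2) norm, the default
        return x / np.sqrt(np.sum(x * x))
    return x / np.sum(np.abs(x) ** norm) ** (1.0 / norm)  # >2: p-norm

v = np.array([3.0, -4.0])
print(embd_normalize(v, 2))  # [ 0.6 -0.8]
```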
### --embd-output-format $'string'$
| $'string'$ | description | |
|------------|------------------------------|--|
| '' | same as before | (default)
| 'array' | single embeddings | $[[x_1,...,x_n]]$
| | multiple embeddings | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
| 'json' | openai style |
| 'json+' | add cosine similarity matrix |
### --embd-separator $"string"$
| $"string"$ | |
|--------------|-|
| "\n" | (default)
| "<#embSep#>" | for exemple
| "<#sep#>" | other exemple
## examples
### Unix-based systems (Linux, macOS, etc.):
```bash
./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
```
### Windows:
```powershell
embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
```


@@ -7,13 +7,19 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static std::vector<std::string> split_lines(const std::string & s) {
-    std::string line;
+static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
     std::vector<std::string> lines;
-    std::stringstream ss(s);
-    while (std::getline(ss, line)) {
-        lines.push_back(line);
+    size_t start = 0;
+    size_t end = s.find(separator);
+
+    while (end != std::string::npos) {
+        lines.push_back(s.substr(start, end - start));
+        start = end + separator.length();
+        end = s.find(separator, start);
     }
+
+    lines.push_back(s.substr(start)); // Add the last part
+
     return lines;
 }
@@ -24,7 +30,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
     }
 }

-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);
@@ -44,13 +50,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");

         float * out = output + batch.seq_id[i][0] * n_embd;
-        //TODO: I would also add a parameter here to enable normalization or not.
-        /*fprintf(stdout, "unnormalized_embedding:");
-        for (int hh = 0; hh < n_embd; hh++) {
-            fprintf(stdout, "%9.6f ", embd[hh]);
-        }
-        fprintf(stdout, "\n");*/
-        llama_embd_normalize(embd, out, n_embd);
+        llama_embd_normalize(embd, out, n_embd, embd_norm);
     }
 }
@@ -110,7 +110,7 @@ int main(int argc, char ** argv) {
     }

     // split the prompt into lines
-    std::vector<std::string> prompts = split_lines(params.prompt);
+    std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);

     // max batch size
     const uint64_t n_batch = params.n_batch;
@@ -170,7 +170,7 @@ int main(int argc, char ** argv) {
         // encode if at capacity
         if (batch.n_tokens + n_toks > n_batch) {
             float * out = emb + p * n_embd;
-            batch_decode(ctx, batch, out, s, n_embd);
+            batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
             llama_batch_clear(batch);
             p += s;
             s = 0;
@@ -183,15 +183,20 @@ int main(int argc, char ** argv) {

     // final batch
     float * out = emb + p * n_embd;
-    batch_decode(ctx, batch, out, s, n_embd);
+    batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);

+    if (params.embd_out.empty()) {
         // print the first part of the embeddings or for a single prompt, the full embedding
         fprintf(stdout, "\n");
         for (int j = 0; j < n_prompts; j++) {
             fprintf(stdout, "embedding %d: ", j);
             for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+                if (params.embd_normalize == 0) {
+                    fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+                } else {
                     fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+                }
             }
             fprintf(stdout, "\n");
         }
@@ -199,14 +204,58 @@ int main(int argc, char ** argv) {
         if (n_prompts > 1) {
             fprintf(stdout, "\n");
             printf("cosine similarity matrix:\n\n");
+            for (int i = 0; i < n_prompts; i++) {
+                fprintf(stdout, "%6.6s ", prompts[i].c_str());
+            }
+            fprintf(stdout, "\n");
             for (int i = 0; i < n_prompts; i++) {
                 for (int j = 0; j < n_prompts; j++) {
                     float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
                     fprintf(stdout, "%6.2f ", sim);
                 }
+                fprintf(stdout, "%1.10s", prompts[i].c_str());
                 fprintf(stdout, "\n");
             }
         }
+    }
if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
const bool notArray = params.embd_out != "array";
fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
for (int j = 0;;) { // at least one iteration (one prompt)
if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
fprintf(stdout, "[");
for (int i = 0;;) { // at least one iteration (n_embd > 0)
fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
i++;
if (i < n_embd) fprintf(stdout, ","); else break;
}
fprintf(stdout, notArray ? "]\n }" : "]");
j++;
if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
}
fprintf(stdout, notArray ? "\n ]" : "]\n");
if (params.embd_out == "json+" && n_prompts > 1) {
fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
for (int i = 0;;) { // at least two iteration (n_prompts > 1)
fprintf(stdout, " [");
for (int j = 0;;) { // at least two iteration (n_prompts > 1)
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
fprintf(stdout, "%6.2f", sim);
j++;
if (j < n_prompts) fprintf(stdout, ", "); else break;
}
fprintf(stdout, " ]");
i++;
if (i < n_prompts) fprintf(stdout, ",\n"); else break;
}
fprintf(stdout, "\n ]");
}
if (notArray) fprintf(stdout, "\n}\n");
}
     // clean up
     llama_print_timings(ctx);
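Purely for illustration of the new output modes (nothing below is in the diff; the binary name and flags follow the README section earlier, and it is assumed that only the JSON document reaches stdout), a client might consume `--embd-output-format json+` like this:

```python
import json
import subprocess

out = subprocess.run(
    ["./embedding", "-m", "./path/to/model.gguf", "--log-disable",
     "-p", "Castle<#sep#>Dog", "--embd-separator", "<#sep#>",
     "--embd-output-format", "json+"],
    capture_output=True, text=True, check=True,
).stdout

doc = json.loads(out)
vectors = [d["embedding"] for d in doc["data"]]  # one vector per prompt
sim = doc.get("cosineSimilarity")                # only present for "json+" with more than one prompt
print(len(vectors), len(vectors[0]), sim)
```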


@@ -634,12 +634,12 @@ return html`
           <div>
             <div class="grammar">
               <label for="template"></label>
-              <textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON-Scheme + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
+              <textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON Schema + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
             </div>
             <div class="grammar-columns">
               <div class="json-schema-controls">
                 <input type="text" name="prop-order" placeholder="Order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
-                <button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON-Scheme</button>
+                <button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
               </div>
             </div>
           </div>

File diff suppressed because it is too large

File diff suppressed because it is too large

ggml.c (1266 changed lines)

File diff suppressed because it is too large

ggml.h (35 changed lines)

@@ -591,11 +591,7 @@ extern "C" {
         struct ggml_tensor * grad;
         struct ggml_tensor * src[GGML_MAX_SRC];

-        // performance
-        int     perf_runs;
-        int64_t perf_cycles;
-        int64_t perf_time_us;
-
+        // source tensor and offset for views
         struct ggml_tensor * view_src;
         size_t               view_offs;

@@ -605,7 +601,7 @@ extern "C" {

         void * extra; // extra things e.g. for ggml-cuda.cu

-        char padding[8];
+        // char padding[4];
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

@@ -652,11 +648,6 @@ extern "C" {
         struct ggml_hash_set visited_hash_table;

         enum ggml_cgraph_eval_order order;
-
-        // performance
-        int     perf_runs;
-        int64_t perf_cycles;
-        int64_t perf_time_us;
     };

     // scratch buffer

@@ -673,28 +664,6 @@ extern "C" {
         bool no_alloc; // don't allocate memory for the tensor data
     };

-    // compute types
-
-    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
-    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
-
-    enum ggml_task_type {
-        GGML_TASK_TYPE_INIT = 0,
-        GGML_TASK_TYPE_COMPUTE,
-        GGML_TASK_TYPE_FINALIZE,
-    };
-
-    struct ggml_compute_params {
-        enum ggml_task_type type;
-
-        // ith = thread index, nth = number of threads
-        int ith, nth;
-
-        // work buffer for all threads
-        size_t wsize;
-        void * wdata;
-    };
-
     // numa strategies
     enum ggml_numa_strategy {
         GGML_NUMA_STRATEGY_DISABLED = 0,


@@ -49,6 +49,7 @@ class Keys:
         EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
         POOLING_TYPE = "{arch}.pooling_type"
         LOGIT_SCALE = "{arch}.logit_scale"
+        DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"

     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"

@@ -62,6 +63,7 @@ class Keys:
         CAUSAL = "{arch}.attention.causal"
         Q_LORA_RANK = "{arch}.attention.q_lora_rank"
         KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
+        REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"

     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"

@@ -97,6 +99,8 @@ class Keys:
         ADD_BOS = "tokenizer.ggml.add_bos_token"
         ADD_EOS = "tokenizer.ggml.add_eos_token"
        ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
+        REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
+        PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
         HF_JSON = "tokenizer.huggingface.json"
         RWKV = "tokenizer.rwkv.world"
         CHAT_TEMPLATE = "tokenizer.chat_template"

@@ -149,6 +153,8 @@ class MODEL_ARCH(IntEnum):
     OLMO = auto()
     ARCTIC = auto()
     DEEPSEEK2 = auto()
+    BITNET = auto()
+    T5 = auto()


 class MODEL_TENSOR(IntEnum):

@@ -200,6 +206,36 @@ class MODEL_TENSOR(IntEnum):
     ATTN_KV_B = auto()
     ATTN_Q_A_NORM = auto()
     ATTN_KV_A_NORM = auto()
FFN_SUB_NORM = auto()
ATTN_SUB_NORM = auto()
DEC_ATTN_NORM = auto()
DEC_ATTN_Q = auto()
DEC_ATTN_K = auto()
DEC_ATTN_V = auto()
DEC_ATTN_OUT = auto()
DEC_ATTN_REL_B = auto()
DEC_CROSS_ATTN_NORM = auto()
DEC_CROSS_ATTN_Q = auto()
DEC_CROSS_ATTN_K = auto()
DEC_CROSS_ATTN_V = auto()
DEC_CROSS_ATTN_OUT = auto()
DEC_CROSS_ATTN_REL_B = auto()
DEC_FFN_NORM = auto()
DEC_FFN_GATE = auto()
DEC_FFN_DOWN = auto()
DEC_FFN_UP = auto()
DEC_OUTPUT_NORM = auto()
ENC_ATTN_NORM = auto()
ENC_ATTN_Q = auto()
ENC_ATTN_K = auto()
ENC_ATTN_V = auto()
ENC_ATTN_OUT = auto()
ENC_ATTN_REL_B = auto()
ENC_FFN_NORM = auto()
ENC_FFN_GATE = auto()
ENC_FFN_DOWN = auto()
ENC_FFN_UP = auto()
ENC_OUTPUT_NORM = auto()
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {

@@ -237,6 +273,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.OLMO: "olmo",
     MODEL_ARCH.ARCTIC: "arctic",
     MODEL_ARCH.DEEPSEEK2: "deepseek2",
+    MODEL_ARCH.BITNET: "bitnet",
+    MODEL_ARCH.T5: "t5",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {

@@ -288,6 +326,36 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
     MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
     MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q",
MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k",
MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v",
MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o",
MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b",
MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm",
MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q",
MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k",
MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v",
MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o",
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm",
MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate",
MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down",
MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up",
MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm",
MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm",
MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q",
MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k",
MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v",
MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o",
MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b",
MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm",
MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate",
MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
 }

 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {

@@ -808,6 +876,53 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
MODEL_ARCH.BITNET: [
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_SUB_NORM,
MODEL_TENSOR.FFN_SUB_NORM,
],
MODEL_ARCH.T5: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.DEC_ATTN_NORM,
MODEL_TENSOR.DEC_ATTN_Q,
MODEL_TENSOR.DEC_ATTN_K,
MODEL_TENSOR.DEC_ATTN_V,
MODEL_TENSOR.DEC_ATTN_OUT,
MODEL_TENSOR.DEC_ATTN_REL_B,
MODEL_TENSOR.DEC_CROSS_ATTN_NORM,
MODEL_TENSOR.DEC_CROSS_ATTN_Q,
MODEL_TENSOR.DEC_CROSS_ATTN_K,
MODEL_TENSOR.DEC_CROSS_ATTN_V,
MODEL_TENSOR.DEC_CROSS_ATTN_OUT,
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B,
MODEL_TENSOR.DEC_FFN_NORM,
MODEL_TENSOR.DEC_FFN_GATE,
MODEL_TENSOR.DEC_FFN_DOWN,
MODEL_TENSOR.DEC_FFN_UP,
MODEL_TENSOR.DEC_OUTPUT_NORM,
MODEL_TENSOR.ENC_ATTN_NORM,
MODEL_TENSOR.ENC_ATTN_Q,
MODEL_TENSOR.ENC_ATTN_K,
MODEL_TENSOR.ENC_ATTN_V,
MODEL_TENSOR.ENC_ATTN_OUT,
MODEL_TENSOR.ENC_ATTN_REL_B,
MODEL_TENSOR.ENC_FFN_NORM,
MODEL_TENSOR.ENC_FFN_GATE,
MODEL_TENSOR.ENC_FFN_DOWN,
MODEL_TENSOR.ENC_FFN_UP,
MODEL_TENSOR.ENC_OUTPUT_NORM,
],
     # TODO
 }


@@ -400,6 +400,9 @@ class GGUFWriter:
     def add_parallel_residual(self, use: bool) -> None:
         self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

+    def add_decoder_start_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id)
+
     def add_head_count(self, count: int) -> None:
         self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)

@@ -448,6 +451,9 @@ class GGUFWriter:
     def add_kv_lora_rank(self, length: int) -> None:
         self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)

+    def add_relative_attn_buckets_count(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value)
+
     def add_pooling_type(self, value: PoolingType) -> None:
         self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)

@@ -538,6 +544,12 @@ class GGUFWriter:
     def add_add_space_prefix(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)

+    def add_remove_extra_whitespaces(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.REMOVE_EXTRA_WS, value)
+
+    def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
+        self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)
+
     def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
         if not isinstance(value, str):
             template_default = None

@@ -599,6 +611,9 @@ class GGUFWriter:
             kv_data += self._pack("Q", len(encoded_val))
             kv_data += encoded_val
         elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
-            ltype = GGUFValueType.get_type(val[0])
+            if isinstance(val, bytes):
+                ltype = GGUFValueType.UINT8
+            else:
+                ltype = GGUFValueType.get_type(val[0])
             if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
                 raise ValueError("All items in a GGUF array should be of the same type")


@@ -24,6 +24,7 @@ class TensorNameMap:
             "backbone.embedding",       # mamba
             "backbone.embeddings",      # mamba-hf
             "transformer.in_out_embed", # Grok
+            "shared",                   # t5
         ),

         # Token type embeddings

@@ -413,6 +414,128 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_KV_A_NORM: (
             "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
         ),
MODEL_TENSOR.ATTN_SUB_NORM: (
"model.layers.{bid}.self_attn.inner_attn_ln", # bitnet
),
MODEL_TENSOR.FFN_SUB_NORM: (
"model.layers.{bid}.mlp.ffn_layernorm", # bitnet
),
MODEL_TENSOR.DEC_ATTN_NORM: (
"decoder.block.{bid}.layer.0.layer_norm", # t5
),
MODEL_TENSOR.DEC_ATTN_Q: (
"decoder.block.{bid}.layer.0.SelfAttention.q", # t5
),
MODEL_TENSOR.DEC_ATTN_K: (
"decoder.block.{bid}.layer.0.SelfAttention.k", # t5
),
MODEL_TENSOR.DEC_ATTN_V: (
"decoder.block.{bid}.layer.0.SelfAttention.v", # t5
),
MODEL_TENSOR.DEC_ATTN_OUT: (
"decoder.block.{bid}.layer.0.SelfAttention.o", # t5
),
MODEL_TENSOR.DEC_ATTN_REL_B: (
"decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_NORM: (
"decoder.block.{bid}.layer.1.layer_norm", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_Q: (
"decoder.block.{bid}.layer.1.EncDecAttention.q", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_K: (
"decoder.block.{bid}.layer.1.EncDecAttention.k", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_V: (
"decoder.block.{bid}.layer.1.EncDecAttention.v", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_OUT: (
"decoder.block.{bid}.layer.1.EncDecAttention.o", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: (
"decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5
),
MODEL_TENSOR.DEC_FFN_NORM: (
"decoder.block.{bid}.layer.2.layer_norm", # t5
),
MODEL_TENSOR.DEC_FFN_GATE: (
"decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5
),
MODEL_TENSOR.DEC_FFN_UP: (
"decoder.block.{bid}.layer.2.DenseReluDense.wi", # t5
"decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5
),
MODEL_TENSOR.DEC_FFN_DOWN: (
"decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5
),
MODEL_TENSOR.DEC_OUTPUT_NORM: (
"decoder.final_layer_norm", # t5
),
MODEL_TENSOR.ENC_ATTN_NORM: (
"encoder.block.{bid}.layer.0.layer_norm", # t5
),
MODEL_TENSOR.ENC_ATTN_Q: (
"encoder.block.{bid}.layer.0.SelfAttention.q", # t5
),
MODEL_TENSOR.ENC_ATTN_K: (
"encoder.block.{bid}.layer.0.SelfAttention.k", # t5
),
MODEL_TENSOR.ENC_ATTN_V: (
"encoder.block.{bid}.layer.0.SelfAttention.v", # t5
),
MODEL_TENSOR.ENC_ATTN_OUT: (
"encoder.block.{bid}.layer.0.SelfAttention.o", # t5
),
MODEL_TENSOR.ENC_ATTN_REL_B: (
"encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
),
MODEL_TENSOR.ENC_FFN_NORM: (
"encoder.block.{bid}.layer.1.layer_norm", # t5
),
MODEL_TENSOR.ENC_FFN_GATE: (
"encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5
),
MODEL_TENSOR.ENC_FFN_UP: (
"encoder.block.{bid}.layer.1.DenseReluDense.wi", # t5
"encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5
),
MODEL_TENSOR.ENC_FFN_DOWN: (
"encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
),
MODEL_TENSOR.ENC_OUTPUT_NORM: (
"encoder.final_layer_norm", # t5
),
     }

     # architecture-specific block mappings

llama.cpp (241 changed lines)

@@ -225,6 +225,7 @@ enum llm_arch {
     LLM_ARCH_OLMO,
     LLM_ARCH_ARCTIC,
     LLM_ARCH_DEEPSEEK2,
+    LLM_ARCH_BITNET,
     LLM_ARCH_UNKNOWN,
 };

@@ -263,6 +264,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_OLMO,      "olmo"      },
     { LLM_ARCH_ARCTIC,    "arctic"    },
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
+    { LLM_ARCH_BITNET,    "bitnet"    },
     { LLM_ARCH_UNKNOWN,   "(unknown)" },
 };

@@ -500,6 +502,8 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_KV_B,
     LLM_TENSOR_ATTN_Q_A_NORM,
     LLM_TENSOR_ATTN_KV_A_NORM,
+    LLM_TENSOR_ATTN_SUB_NORM,
+    LLM_TENSOR_FFN_SUB_NORM,
 };

 static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {

@@ -1113,6 +1117,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
         },
     },
{
LLM_ARCH_BITNET,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_SUB_NORM, "blk.%d.attn_sub_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_SUB_NORM, "blk.%d.ffn_sub_norm" },
},
},
     {
         LLM_ARCH_UNKNOWN,
         {

@@ -2118,6 +2140,8 @@ struct llama_layer {
     struct ggml_tensor * attn_out_norm_b;
     struct ggml_tensor * attn_q_a_norm;
     struct ggml_tensor * attn_kv_a_norm;
+    struct ggml_tensor * attn_sub_norm;
+    struct ggml_tensor * ffn_sub_norm;

     // attention
     struct ggml_tensor * wq;
@@ -2185,6 +2209,15 @@ struct llama_layer {
     // long rope factors
     struct ggml_tensor * rope_long  = nullptr;
     struct ggml_tensor * rope_short = nullptr;
// bitnet scale
struct ggml_tensor * wq_scale;
struct ggml_tensor * wk_scale;
struct ggml_tensor * wv_scale;
struct ggml_tensor * wo_scale;
struct ggml_tensor * ffn_gate_scale;
struct ggml_tensor * ffn_up_scale;
struct ggml_tensor * ffn_down_scale;
 };

 struct llama_kv_cell {

@@ -4710,6 +4743,15 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
case LLM_ARCH_BITNET:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 26: model.type = e_model::MODEL_3B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
         default: (void)0;
     }

@@ -6655,6 +6697,44 @@ static bool llm_load_tensors(
                     }
                 }
             } break;
case LLM_ARCH_BITNET:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
// output
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
}
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.attn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd});
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1});
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff});
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1});
}
} break;
         default:
             throw std::runtime_error("unknown architecture");
     }

@@ -7295,7 +7375,10 @@ static struct ggml_tensor * llm_build_kqv(
     ggml_build_forward_expand(graph, cur);

+    if (wo) {
         cur = ggml_mul_mat(ctx, wo, cur);
+    }
+
     if (wo_b) {
         cb(cur, "kqv_wo", il);
     }
@@ -11709,6 +11792,153 @@ struct llm_build_context {
         return gf;
     }
struct ggml_cgraph * build_bitnet() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
// B1.K
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
// B1.V
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
nullptr, nullptr,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cur = llm_build_norm(ctx0, cur, hparams,
model.layers[il].attn_sub_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_sub_norm", il);
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
if (model.layers[il].bo) {
cur = ggml_add(ctx0, cur, model.layers[il].bo);
}
cb(cur, "attn_o_out", il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
if (model.layers[il].ffn_gate_inp == nullptr) {
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
struct ggml_tensor *tmp = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_up_scale);
cb(tmp, "ffn_up", il);
cur = ggml_mul_mat(ctx0, model.layers[il].ffn_gate, cur);
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_gate_scale);
cb(cur, "ffn_gate", il);
cur = ggml_silu(ctx0, cur);
cb(cur, "ffn_silu", il);
cur = ggml_mul(ctx0, cur, tmp);
cb(cur, "ffn_gate_par", il);
cur = llm_build_norm(ctx0, cur, hparams,
model.layers[il].ffn_sub_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_sub_norm", il);
cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
cb(cur, "ffn_down", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
 };

 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {

@@ -11932,6 +12162,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_deepseek2();
             } break;
case LLM_ARCH_BITNET:
{
result = llm.build_bitnet();
} break;
         default:
             GGML_ASSERT(false);
     }

@@ -12551,12 +12785,6 @@ static int llama_decode_internal(
         }
     }

-#ifdef GGML_PERF
-    // print timing information per ggml operation (for debugging purposes)
-    // requires GGML_PERF to be defined
-    ggml_graph_print(gf);
-#endif
-
     // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
     //    ggml_graph_dump_dot(gf, NULL, "llama.dot");

@@ -16760,6 +16988,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_STABLELM:
+        case LLM_ARCH_BITNET:
         case LLM_ARCH_QWEN:
         case LLM_ARCH_QWEN2:
         case LLM_ARCH_QWEN2MOE:
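To make the BitNet feed-forward path in `build_bitnet` above easier to follow, here is a hedged NumPy sketch of the same sequence of operations (shapes, scales and weights are invented; in the real graph the weights are the ternary tensors plus the `*_scale` tensors loaded earlier):

```python
import numpy as np

def rms_norm(x, w, eps=1e-5):
    return w * x / np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps)

def silu(x):
    return x / (1.0 + np.exp(-x))

rng = np.random.default_rng(0)
n_embd, n_ff = 8, 16
x = rng.standard_normal(n_embd)

ffn_up,   up_scale   = rng.integers(-1, 2, (n_ff, n_embd)).astype(np.float32), 0.2
ffn_gate, gate_scale = rng.integers(-1, 2, (n_ff, n_embd)).astype(np.float32), 0.3
ffn_down, down_scale = rng.integers(-1, 2, (n_embd, n_ff)).astype(np.float32), 0.1
ffn_sub_norm = np.ones(n_ff)

tmp = (ffn_up @ x) * up_scale            # ffn_up, then its scale
cur = silu((ffn_gate @ x) * gate_scale)  # ffn_gate, its scale, SiLU
cur = cur * tmp                          # gate * up
cur = rms_norm(cur, ffn_sub_norm)        # ffn_sub_norm (RMS) before the down projection
out = (ffn_down @ cur) * down_scale      # ffn_down, then its scale
print(out.shape)  # (8,)
```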


@@ -786,7 +786,7 @@ extern "C" {
     // Get the number of threads used for prompt and batch processing (multiple token).
     LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);

-    // Set whether the model is in embeddings model or not
+    // Set whether the model is in embeddings mode or not
     // If true, embeddings will be returned but logits will not
     LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);


@@ -249,8 +249,7 @@ class tinyBLAS {
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
     }

-    void matmul(int64_t m, int64_t n, int task) {
-        if (task == GGML_TASK_TYPE_COMPUTE)
-            mnpack(0, m, 0, n);
+    void matmul(int64_t m, int64_t n) {
+        mnpack(0, m, 0, n);
     }

@@ -458,8 +457,7 @@ class tinyBLAS_Q0_ARM {
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
     }

-    void matmul(int64_t m, int64_t n, int task) {
-        if (task == GGML_TASK_TYPE_COMPUTE)
-            mnpack(0, m, 0, n);
+    void matmul(int64_t m, int64_t n) {
+        mnpack(0, m, 0, n);
     }

@@ -596,8 +594,7 @@ class tinyBLAS_Q0_AVX {
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
     }

-    void matmul(int64_t m, int64_t n, int task) {
-        if (task == GGML_TASK_TYPE_COMPUTE)
-            mnpack(0, m, 0, n);
+    void matmul(int64_t m, int64_t n) {
+        mnpack(0, m, 0, n);
     }
@@ -829,7 +826,7 @@ class tinyBLAS_Q0_AVX {
  * For example, for single-threaded single-precision GEMM you can say
  *
  *     llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
- *                     0, 1, GGML_TASK_TYPE_COMPUTE,
+ *                     0, 1,
  *                     GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
  *
  * @param m is rows in `A` and `C`

@@ -843,14 +840,13 @@ class tinyBLAS_Q0_AVX {
  * @param ldc is row stride of `C`
  * @param ith is thread id (must be less than `nth`)
  * @param nth is number of threads (must be greater than zero)
- * @param task is GGML task type
  * @param Atype is GGML data type of `A`
  * @param Btype is GGML data type of `B`
  * @param Ctype is GGML data type of `C`
  * @return true if this function was able to service the matmul request
  */
 bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
-                     int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) {
+                     int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {

     assert(m >= 0);
     assert(n >= 0);
@ -877,7 +873,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
return true; return true;
#elif defined(__AVX__) || defined(__AVX2__) #elif defined(__AVX__) || defined(__AVX2__)
if (k % 8) if (k % 8)
@ -887,7 +883,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
return true; return true;
#elif defined(__ARM_NEON) #elif defined(__ARM_NEON)
if (n < 4) if (n < 4)
@ -899,7 +895,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
return true; return true;
#else #else
return false; return false;
@ -917,7 +913,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
return true; return true;
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
if (k % 8) if (k % 8)
@ -929,7 +925,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
return true; return true;
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
if (n < 8) if (n < 8)
@ -943,7 +939,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const ggml_fp16_t *)B, ldb, (const ggml_fp16_t *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
return true; return true;
#elif defined(__ARM_NEON) && !defined(_MSC_VER) #elif defined(__ARM_NEON) && !defined(_MSC_VER)
if (k % 4) if (k % 4)
@ -955,7 +951,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
return true; return true;
#else #else
return false; return false;
@ -971,7 +967,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
return true; return true;
#elif defined(__ARM_FEATURE_DOTPROD) #elif defined(__ARM_FEATURE_DOTPROD)
tinyBLAS_Q0_ARM<block_q8_0> tb{ tinyBLAS_Q0_ARM<block_q8_0> tb{
@ -979,7 +975,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
return true; return true;
#else #else
return false; return false;
@ -995,7 +991,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
return true; return true;
#elif defined(__ARM_FEATURE_DOTPROD) #elif defined(__ARM_FEATURE_DOTPROD)
tinyBLAS_Q0_ARM<block_q4_0> tb{ tinyBLAS_Q0_ARM<block_q4_0> tb{
@ -1003,7 +999,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
return true; return true;
#else #else
return false; return false;
@@ -1025,7 +1021,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
     (void)ldc;
     (void)ith;
     (void)nth;
-    (void)task;
     (void)Atype;
     (void)Btype;
     (void)Ctype;


@@ -7,7 +7,7 @@ extern "C" {

 bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
                      const void *, int64_t, void *, int64_t, int, int,
-                     int, int, int, int);
+                     int, int, int);

 #ifdef __cplusplus
 }


@@ -785,6 +785,10 @@ struct test_cpy : public test_case {
         return VARS_TO_STR3(type_src, type_dst, ne);
     }

+    double max_nmse_err() override {
+        return 1e-6;
+    }
+
     size_t op_size(ggml_tensor * t) override {
         return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
     }


@@ -13,7 +13,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32;
 shared FLOAT_TYPE tmp[BLOCK_SIZE];

 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
     const uint tid = gl_LocalInvocationID.x;

     uint a_offset, b_offset, d_offset;
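The same one-line change repeats in the mul_mat_vec shaders below. A plausible reading (not stated in the diff) is that the dispatch now spreads rows over both the x and z workgroup axes, and the shader folds them back into a single row index; a toy illustration of that mapping:

```python
# hypothetical values, for illustration only
num_workgroups_x = 4

def flat_row(wg_x: int, wg_z: int) -> int:
    # mirrors: row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z
    return wg_x + num_workgroups_x * wg_z

print([flat_row(x, z) for z in range(2) for x in range(num_workgroups_x)])  # 0..7, each row exactly once
```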


@@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 shared FLOAT_TYPE tmp[32];

 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);


@@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 shared FLOAT_TYPE tmp[32];

 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);


@@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 shared FLOAT_TYPE tmp[32];

 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);


@@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 shared FLOAT_TYPE tmp[32];

 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);


@@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 shared FLOAT_TYPE tmp[32];

 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);