diff --git a/Makefile b/Makefile index ba73f0637..0a2070b53 100644 --- a/Makefile +++ b/Makefile @@ -569,6 +569,14 @@ $(info I CC: $(shell $(CC) --version | head -n 1)) $(info I CXX: $(shell $(CXX) --version | head -n 1)) ifdef LLAMA_CUBLAS $(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) +CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])') +ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) +ifndef CUDA_DOCKER_ARCH +ifndef CUDA_POWER_ARCH +$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH) +endif # CUDA_POWER_ARCH +endif # CUDA_DOCKER_ARCH +endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1) endif # LLAMA_CUBLAS $(info ) diff --git a/ci/run.sh b/ci/run.sh index 82fe247a5..a4264d775 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -568,6 +568,50 @@ function gg_sum_open_llama_7b_v2 { #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)" } +# bge-small + +function gg_run_embd_bge_small { + cd ${SRC} + + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt + + path_models="../models-mnt/bge-small" + + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release + + set -e + + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + + python3 ../convert-hf-to-gguf.py ${path_models} + + model_f16="${path_models}/ggml-model-f16.gguf" + model_q8_0="${path_models}/ggml-model-q8_0.gguf" + + ./bin/quantize ${model_f16} ${model_q8_0} q8_0 + + (time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + + set +e +} + +function gg_sum_embd_bge_small { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'BGE Small (BERT):\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" + gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" +} + ## main if [ -z ${GG_BUILD_LOW_PERF} ]; then @@ -591,6 +635,8 @@ test $ret -eq 0 && gg_run ctest_debug test $ret -eq 0 && gg_run ctest_release if [ -z ${GG_BUILD_LOW_PERF} ]; then + test $ret -eq 0 && gg_run embd_bge_small + if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then if [ -z ${GG_BUILD_CUDA} ]; then test $ret -eq 0 && gg_run open_llama_3b_v2 diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index cae1551a2..ae471481d 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -10,7 +10,7 @@ import re import sys from enum import IntEnum from pathlib import Path -from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast +from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Sequence, cast import numpy as np import torch @@ -25,15 +25,6 @@ import gguf from convert import HfVocab -# check for any of the given keys in the dictionary and return the value of the first key found -def get_key_opts(d, keys): - for k in keys: - if k in d: - return d[k] - print(f"Could not find any of {keys}") - sys.exit() - - ###### MODEL DEFINITIONS ###### class SentencePieceTokenTypes(IntEnum): @@ -58,6 +49,15 @@ class Model: self.hparams = Model.load_hparams(self.dir_model) self.model_arch = self._get_model_architecture() self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) + self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) + + def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any: + key = next((k for k in keys if k in self.hparams), None) + if key is not None: + return self.hparams[key] + if optional: + return None + raise KeyError(f"could not find any of: {keys}") def set_vocab(self): self._set_vocab_gpt2() @@ -79,28 +79,33 @@ class Model: def set_gguf_parameters(self): self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_block_count(self.hparams.get( - "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), - )) - if (n_ctx := self.hparams.get("max_position_embeddings")) is not None: + self.gguf_writer.add_block_count(self.block_count) + + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: self.gguf_writer.add_context_length(n_ctx) - if (n_embd := self.hparams.get("hidden_size")) is not None: - self.gguf_writer.add_embedding_length(n_embd) - if (n_ff := self.hparams.get("intermediate_size")) is not None: + + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + self.gguf_writer.add_embedding_length(n_embd) + + if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: self.gguf_writer.add_feed_forward_length(n_ff) - if (n_head := self.hparams.get("num_attention_heads")) is not None: - self.gguf_writer.add_head_count(n_head) + + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + self.gguf_writer.add_head_count(n_head) + if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) - if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None: - self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps) + if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: + self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon"], optional=True)) is not None: + self.gguf_writer.add_layer_norm_eps(f_norm_eps) if (n_experts := self.hparams.get("num_local_experts")) is not None: self.gguf_writer.add_expert_count(n_experts) if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: self.gguf_writer.add_expert_used_count(n_experts_used) - self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) + self.gguf_writer.add_file_type(self.ftype) def write_tensors(self): block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) @@ -211,6 +216,8 @@ class Model: return MiniCPMModel if model_architecture == "BertModel": return BertModel + if model_architecture == "NomicBertModel": + return NomicBertModel return Model def _is_model_safetensors(self) -> bool: @@ -268,6 +275,8 @@ class Model: return gguf.MODEL_ARCH.MINICPM if arch == "BertModel": return gguf.MODEL_ARCH.BERT + if arch == "NomicBertModel": + return gguf.MODEL_ARCH.NOMIC_BERT raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -1297,21 +1306,21 @@ class GPT2Model(Model): class Phi2Model(Model): def set_gguf_parameters(self): - block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"]) + block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) - rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"]) - n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"]) - n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"]) + rot_pct = self.find_hparam(["partial_rotary_factor"]) + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) self.gguf_writer.add_name("Phi2") - self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"])) + self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) self.gguf_writer.add_embedding_length(n_embd) self.gguf_writer.add_feed_forward_length(4 * n_embd) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"])) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_add_bos_token(False) @@ -1636,19 +1645,12 @@ in chat mode so that the conversation can end normally.") class BertModel(Model): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.block_count = self.hparams["num_hidden_layers"] + self.vocab_size = None def set_gguf_parameters(self): - # TODO(cebtenzzre): merge with parent class - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) + super().set_gguf_parameters() self.gguf_writer.add_causal_attention(False) - self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_pooling_layer(True) def set_vocab(self): path = self.dir_model @@ -1658,6 +1660,7 @@ class BertModel(Model): vocab = HfVocab(path, added_tokens_path) tokens, scores, toktypes = zip(*vocab.all_tokens()) assert len(tokens) == vocab.vocab_size + self.vocab_size = vocab.vocab_size # we need this to validate the size of the token_type embeddings # though currently we are passing all zeros to the token_type embeddings @@ -1671,7 +1674,7 @@ class BertModel(Model): if tok.startswith(b"##"): return tok[2:] return b"\xe2\x96\x81" + tok - tokens = [phantom(t, y) for t, y in zip(tokens, toktypes)] + tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes)) # set up bos and eos tokens (cls and sep) self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id) @@ -1723,6 +1726,43 @@ class BertModel(Model): self.gguf_writer.add_tensor(new_name, data) +class NomicBertModel(BertModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # the HF config claims n_ctx=8192, but it uses RoPE scaling + self.hparams["n_ctx"] = 2048 + + # SwigLU activation + assert self.hparams["activation_function"] == "swiglu" + # this doesn't do anything in the HF version + assert self.hparams["causal"] is False + # no bias tensors + assert self.hparams["qkv_proj_bias"] is False + assert self.hparams["mlp_fc1_bias"] is False + assert self.hparams["mlp_fc2_bias"] is False + # norm at end of layer + assert self.hparams["prenorm"] is False + # standard RoPE + assert self.hparams["rotary_emb_fraction"] == 1.0 + assert self.hparams["rotary_emb_interleaved"] is False + assert self.hparams["rotary_emb_scale_base"] is None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) + + def get_tensors(self): + assert self.vocab_size is not None + for name, data in super().get_tensors(): + # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly. + if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size: + rounded_vocab_size = (self.vocab_size + 63) // 64 * 64 + assert data.shape == (rounded_vocab_size, self.hparams["n_embd"]) + data = data[:self.vocab_size, :] + yield name, data + + ###### CONVERSION LOGIC ###### diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 68ad89964..653abc73a 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -38,6 +38,7 @@ else() add_subdirectory(speculative) add_subdirectory(lookahead) add_subdirectory(lookup) + add_subdirectory(gguf) add_subdirectory(train-text-from-scratch) add_subdirectory(imatrix) if (LLAMA_BUILD_SERVER) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 27376c8f0..b4688cf51 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -7,6 +7,51 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +static std::vector split_lines(const std::string & s) { + std::string line; + std::vector lines; + std::stringstream ss(s); + while (std::getline(ss, line)) { + lines.push_back(line); + } + return lines; +} + +static void batch_add_seq(llama_batch & batch, const std::vector & tokens, int seq_id) { + for (size_t i = 0; i < tokens.size(); i++) { + llama_batch_add(batch, tokens[i], i, { seq_id }, false); + } +} + +static void normalize(float * vec, float * out, int n) { + float norm = 0; + for (int i = 0; i < n; i++) { + norm += vec[i] * vec[i]; + } + norm = sqrt(norm); + for (int i = 0; i < n; i++) { + out[i] = vec[i] / norm; + } +} + +static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { + // clear previous kv_cache values (irrelevant for embeddings) + llama_kv_cache_clear(ctx); + + // run model + fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); + if (llama_decode(ctx, batch) < 0) { + fprintf(stderr, "%s : failed to decode\n", __func__); + } + + // normalize on copy + for (int k = 0; k < n_seq; k++) { + float * emb = llama_get_embeddings_ith(ctx, k); + float * out = output + k * n_embd; + normalize(emb, out, n_embd); + } +} + int main(int argc, char ** argv) { gpt_params params; @@ -55,59 +100,84 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s\n", get_system_info(params).c_str()); } - int n_past = 0; + // split the prompt into lines + std::vector prompts = split_lines(params.prompt); - // tokenize the prompt - auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); + // max batch size + const uint64_t n_batch = params.n_batch; + GGML_ASSERT(params.n_batch == params.n_ctx); + // tokenize the prompts and trim + std::vector> inputs; + for (const auto & prompt : prompts) { + auto inp = ::llama_tokenize(ctx, prompt, true); + if (inp.size() > n_batch) { + inp.resize(n_batch); + } + inputs.push_back(inp); + } + + // tokenization stats if (params.verbose_prompt) { - fprintf(stderr, "\n"); - fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + for (int i = 0; i < (int) inputs.size(); i++) { + fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); + for (int j = 0; j < (int) inputs[i].size(); j++) { + fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); + } + fprintf(stderr, "\n\n"); } - fprintf(stderr, "\n"); } - if (embd_inp.size() > (size_t)n_ctx) { - fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n", - __func__, embd_inp.size(), n_ctx); - return 1; - } - - while (!embd_inp.empty()) { - int n_tokens = std::min(params.n_batch, (int) embd_inp.size()); - if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return 1; - } - n_past += n_tokens; - embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens); - } + // initialize batch + const int n_prompts = prompts.size(); + struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts); + // allocate output const int n_embd = llama_n_embd(model); - auto * embeddings = llama_get_embeddings(ctx); + std::vector embeddings(n_prompts * n_embd, 0); + float * emb = embeddings.data(); - // l2-normalize embeddings - float norm = 0; - for (int i = 0; i < n_embd; i++) { - norm += embeddings[i] * embeddings[i]; - } - norm = sqrt(norm); - for (int i = 0; i < n_embd; i++) { - embeddings[i] /= norm; + // break into batches + int p = 0; // number of prompts processed already + int s = 0; // number of prompts in current batch + for (int k = 0; k < n_prompts; k++) { + // clamp to n_batch tokens + auto & inp = inputs[k]; + const uint64_t n_toks = inp.size(); + + // encode if at capacity + if (batch.n_tokens + n_toks > n_batch) { + float * out = emb + p * n_embd; + batch_decode(ctx, batch, out, s, n_embd); + llama_batch_clear(batch); + p += s; + s = 0; + } + + // add to batch + batch_add_seq(batch, inp, s); + s += 1; } - for (int i = 0; i < n_embd; i++) { - printf("%f ", embeddings[i]); - } - printf("\n"); + // final batch + float * out = emb + p * n_embd; + batch_decode(ctx, batch, out, s, n_embd); + // print first 3 embeddings + for (int j = 0; j < std::min(3, n_prompts); j++) { + fprintf(stderr, "embedding %d: ", j); + for (int i = 0; i < n_embd; i++) { + fprintf(stderr, "%f ", emb[j * n_embd + i]); + } + fprintf(stderr, "\n\n"); + } + fprintf(stderr, "\n"); + + // clean up llama_print_timings(ctx); llama_free(ctx); llama_free_model(model); - llama_backend_free(); return 0; diff --git a/examples/finetune/README.md b/examples/finetune/README.md index a884706c5..2fafd505e 100644 --- a/examples/finetune/README.md +++ b/examples/finetune/README.md @@ -80,9 +80,9 @@ The LORA rank can be configured for each model tensor type separately with these --rank-wk N LORA rank for wk tensor (default 4) --rank-wv N LORA rank for wv tensor (default 4) --rank-wo N LORA rank for wo tensor (default 4) - --rank-w1 N LORA rank for w1 tensor (default 4) - --rank-w2 N LORA rank for w2 tensor (default 4) - --rank-w3 N LORA rank for w3 tensor (default 4) + --rank-ffn_gate N LORA rank for ffn_gate tensor (default 4) + --rank-ffn_down N LORA rank for ffn_down tensor (default 4) + --rank-ffn_up N LORA rank for ffn_up tensor (default 4) ``` The LORA rank of 'norm' tensors should always be 1. diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index b11c56020..98bf5a07a 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -60,9 +60,9 @@ struct my_llama_layer { struct ggml_tensor * ffn_norm; // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; + struct ggml_tensor * ffn_gate; // w1 + struct ggml_tensor * ffn_down; // w2 + struct ggml_tensor * ffn_up; // w3 }; struct my_llama_model { @@ -85,9 +85,9 @@ struct my_llama_lora_hparams { uint32_t n_rank_wv = 4; uint32_t n_rank_wo = 4; uint32_t n_rank_ffn_norm = 1; - uint32_t n_rank_w1 = 4; - uint32_t n_rank_w2 = 4; - uint32_t n_rank_w3 = 4; + uint32_t n_rank_ffn_gate = 4; + uint32_t n_rank_ffn_down = 4; + uint32_t n_rank_ffn_up = 4; uint32_t n_rank_tok_embeddings = 4; uint32_t n_rank_norm = 1; uint32_t n_rank_output = 4; @@ -117,12 +117,12 @@ struct my_llama_lora_layer { struct ggml_tensor * ffn_norm_b; // ff - struct ggml_tensor * w1_a; - struct ggml_tensor * w1_b; - struct ggml_tensor * w2_a; - struct ggml_tensor * w2_b; - struct ggml_tensor * w3_a; - struct ggml_tensor * w3_b; + struct ggml_tensor * ffn_gate_a; + struct ggml_tensor * ffn_gate_b; + struct ggml_tensor * ffn_down_a; + struct ggml_tensor * ffn_down_b; + struct ggml_tensor * ffn_up_a; + struct ggml_tensor * ffn_up_b; }; struct my_llama_lora { @@ -208,9 +208,9 @@ static void print_lora_params(struct my_llama_lora_hparams * params) { printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv); printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo); printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm); - printf("%s: n_rank_w1 : %u\n", __func__, params->n_rank_w1); - printf("%s: n_rank_w2 : %u\n", __func__, params->n_rank_w2); - printf("%s: n_rank_w3 : %u\n", __func__, params->n_rank_w3); + printf("%s: n_rank_ffn_gate : %u\n", __func__, params->n_rank_ffn_gate); + printf("%s: n_rank_ffn_down : %u\n", __func__, params->n_rank_ffn_down); + printf("%s: n_rank_ffn_up : %u\n", __func__, params->n_rank_ffn_up); printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings); printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm); printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output); @@ -319,9 +319,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i)); layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i)); layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i)); - layer.w1 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i)); - layer.w2 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i)); - layer.w3 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i)); + layer.ffn_gate = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i)); + layer.ffn_down = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i)); + layer.ffn_up = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i)); assert_shape_1d(layer.attention_norm, hparams.n_embd); assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd); @@ -329,9 +329,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa()); assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd); assert_shape_1d(layer.ffn_norm, hparams.n_embd); - assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff); - assert_shape_2d(layer.w2, hparams.n_ff, hparams.n_embd); - assert_shape_2d(layer.w3, hparams.n_embd, hparams.n_ff); + assert_shape_2d(layer.ffn_gate, hparams.n_embd, hparams.n_ff); + assert_shape_2d(layer.ffn_down, hparams.n_ff, hparams.n_embd); + assert_shape_2d(layer.ffn_up, hparams.n_embd, hparams.n_ff); } } @@ -362,12 +362,12 @@ static void set_param_lora(struct my_llama_lora * lora) { ggml_set_param(ctx, layer.wo_b); ggml_set_param(ctx, layer.ffn_norm_a); ggml_set_param(ctx, layer.ffn_norm_b); - ggml_set_param(ctx, layer.w1_a); - ggml_set_param(ctx, layer.w1_b); - ggml_set_param(ctx, layer.w2_a); - ggml_set_param(ctx, layer.w2_b); - ggml_set_param(ctx, layer.w3_a); - ggml_set_param(ctx, layer.w3_b); + ggml_set_param(ctx, layer.ffn_gate_a); + ggml_set_param(ctx, layer.ffn_gate_b); + ggml_set_param(ctx, layer.ffn_down_a); + ggml_set_param(ctx, layer.ffn_down_b); + ggml_set_param(ctx, layer.ffn_up_a); + ggml_set_param(ctx, layer.ffn_up_b); } } @@ -435,12 +435,12 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd); layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1); - layer.w1_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_embd); - layer.w1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_ff); - layer.w2_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_ff); - layer.w2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_embd); - layer.w3_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_embd); - layer.w3_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_ff); + layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd); + layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff); + layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff); + layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd); + layer.ffn_up_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_embd); + layer.ffn_up_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_ff); ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i)); ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i)); @@ -454,12 +454,12 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i)); ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i)); ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i)); - ggml_set_name(layer.w1_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i)); - ggml_set_name(layer.w1_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i)); - ggml_set_name(layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i)); - ggml_set_name(layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i)); - ggml_set_name(layer.w3_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i)); - ggml_set_name(layer.w3_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i)); } set_param_lora(lora); @@ -497,12 +497,12 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl randomize_tensor_normal(layer.ffn_norm_a, rnd); ggml_set_zero(layer.ffn_norm_b); - randomize_tensor_normal(layer.w1_a, rnd); - ggml_set_zero(layer.w1_b); - randomize_tensor_normal(layer.w2_a, rnd); - ggml_set_zero(layer.w2_b); - randomize_tensor_normal(layer.w3_a, rnd); - ggml_set_zero(layer.w3_b); + randomize_tensor_normal(layer.ffn_gate_a, rnd); + ggml_set_zero(layer.ffn_gate_b); + randomize_tensor_normal(layer.ffn_down_a, rnd); + ggml_set_zero(layer.ffn_down_b); + randomize_tensor_normal(layer.ffn_up_a, rnd); + ggml_set_zero(layer.ffn_up_b); } free_random_normal_distribution(rnd); @@ -610,13 +610,13 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b)); struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b)); - struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b)); - struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b)); - struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b)); - struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b)); - struct ggml_tensor * w1 = add_to_f32(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b)); - struct ggml_tensor * w2 = add_to_f32(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b)); - struct ggml_tensor * w3 = add_to_f32(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b)); + struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b)); + struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b)); + struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b)); + struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b)); + struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b)); + struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b)); + struct ggml_tensor * ffn_up = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b)); struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); @@ -659,11 +659,11 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); - struct ggml_tensor * t25 = ggml_mul_mat (ctx, w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); - struct ggml_tensor * t26 = ggml_mul_mat (ctx, w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); + struct ggml_tensor * t25 = ggml_mul_mat (ctx, ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); + struct ggml_tensor * t26 = ggml_mul_mat (ctx, ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); - struct ggml_tensor * t29 = ggml_mul_mat (ctx, w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); + struct ggml_tensor * t29 = ggml_mul_mat (ctx, ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); cur = t30; if (enable_checkpointing) { @@ -723,9 +723,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f)); } // allocating checkpoints in one block to reduce memory fragmentation @@ -798,9 +798,9 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V); GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT); GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_w1, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_w2, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_w3, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP); init_lora(model, lora); @@ -825,12 +825,12 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b)); copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a)); copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b)); - copy_tensor_by_name(layer.w1_a, f_ggml_ctx, ggml_get_name(layer.w1_a)); - copy_tensor_by_name(layer.w1_b, f_ggml_ctx, ggml_get_name(layer.w1_b)); - copy_tensor_by_name(layer.w2_a, f_ggml_ctx, ggml_get_name(layer.w2_a)); - copy_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b)); - copy_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a)); - copy_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b)); + copy_tensor_by_name(layer.ffn_gate_a, f_ggml_ctx, ggml_get_name(layer.ffn_gate_a)); + copy_tensor_by_name(layer.ffn_gate_b, f_ggml_ctx, ggml_get_name(layer.ffn_gate_b)); + copy_tensor_by_name(layer.ffn_down_a, f_ggml_ctx, ggml_get_name(layer.ffn_down_a)); + copy_tensor_by_name(layer.ffn_down_b, f_ggml_ctx, ggml_get_name(layer.ffn_down_b)); + copy_tensor_by_name(layer.ffn_up_a, f_ggml_ctx, ggml_get_name(layer.ffn_up_a)); + copy_tensor_by_name(layer.ffn_up_b, f_ggml_ctx, ggml_get_name(layer.ffn_up_b)); } } @@ -868,9 +868,9 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_w1); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_w2); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_w3); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_ffn_gate); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_ffn_down); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_ffn_up); gguf_add_tensor(fctx, lora->tok_embeddings_a); gguf_add_tensor(fctx, lora->tok_embeddings_b); @@ -894,12 +894,12 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod gguf_add_tensor(fctx, layer.wo_b); gguf_add_tensor(fctx, layer.ffn_norm_a); gguf_add_tensor(fctx, layer.ffn_norm_b); - gguf_add_tensor(fctx, layer.w1_a); - gguf_add_tensor(fctx, layer.w1_b); - gguf_add_tensor(fctx, layer.w2_a); - gguf_add_tensor(fctx, layer.w2_b); - gguf_add_tensor(fctx, layer.w3_a); - gguf_add_tensor(fctx, layer.w3_b); + gguf_add_tensor(fctx, layer.ffn_gate_a); + gguf_add_tensor(fctx, layer.ffn_gate_b); + gguf_add_tensor(fctx, layer.ffn_down_a); + gguf_add_tensor(fctx, layer.ffn_down_b); + gguf_add_tensor(fctx, layer.ffn_up_a); + gguf_add_tensor(fctx, layer.ffn_up_b); } } @@ -1104,12 +1104,12 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB")); write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA")); write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB")); - write_tensor(&file, layer.w1_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA")); - write_tensor(&file, layer.w1_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB")); - write_tensor(&file, layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA")); - write_tensor(&file, layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB")); - write_tensor(&file, layer.w3_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA")); - write_tensor(&file, layer.w3_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB")); } } @@ -1139,9 +1139,9 @@ struct train_params { uint32_t n_rank_wv; uint32_t n_rank_wo; uint32_t n_rank_ffn_norm; - uint32_t n_rank_w1; - uint32_t n_rank_w2; - uint32_t n_rank_w3; + uint32_t n_rank_ffn_gate; + uint32_t n_rank_ffn_down; + uint32_t n_rank_ffn_up; uint32_t n_rank_tok_embeddings; uint32_t n_rank_norm; uint32_t n_rank_output; @@ -1152,9 +1152,9 @@ struct train_params { bool custom_n_rank_wv; bool custom_n_rank_wo; bool custom_n_rank_ffn_norm; - bool custom_n_rank_w1; - bool custom_n_rank_w2; - bool custom_n_rank_w3; + bool custom_n_rank_ffn_gate; + bool custom_n_rank_ffn_down; + bool custom_n_rank_ffn_up; bool custom_n_rank_tok_embeddings; bool custom_n_rank_norm; bool custom_n_rank_output; @@ -1186,9 +1186,9 @@ static struct train_params get_default_train_params() { params.n_rank_wv = 4; params.n_rank_wo = 4; params.n_rank_ffn_norm = 1; - params.n_rank_w1 = 4; - params.n_rank_w2 = 4; - params.n_rank_w3 = 4; + params.n_rank_ffn_gate = 4; + params.n_rank_ffn_down = 4; + params.n_rank_ffn_up = 4; params.n_rank_tok_embeddings = 4; params.n_rank_norm = 1; params.n_rank_output = 4; @@ -1199,9 +1199,9 @@ static struct train_params get_default_train_params() { params.custom_n_rank_wv = false; params.custom_n_rank_wo = false; params.custom_n_rank_ffn_norm = false; - params.custom_n_rank_w1 = false; - params.custom_n_rank_w2 = false; - params.custom_n_rank_w3 = false; + params.custom_n_rank_ffn_gate = false; + params.custom_n_rank_ffn_down = false; + params.custom_n_rank_ffn_up = false; params.custom_n_rank_tok_embeddings = false; params.custom_n_rank_norm = false; params.custom_n_rank_output = false; @@ -1232,9 +1232,9 @@ static void train_print_usage(int argc, char ** argv, const struct train_params fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n"); fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n"); fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-ffn_gate N LORA rank for ffn_gate tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-ffn_down N LORA rank for ffn_down tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-ffn_up N LORA rank for ffn_up tensor, overrides default rank.\n"); print_common_train_usage(argc, argv, ¶ms->common); } @@ -1369,27 +1369,27 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par } params->n_rank_wo = std::stoi(argv[i]); params->custom_n_rank_wo = true; - } else if (arg == "--rank-w1") { + } else if (arg == "--rank-ffn_gate") { if (++i >= argc) { invalid_param = true; break; } - params->n_rank_w1 = std::stoi(argv[i]); - params->custom_n_rank_w1 = true; - } else if (arg == "--rank-w2") { + params->n_rank_ffn_gate = std::stoi(argv[i]); + params->custom_n_rank_ffn_gate = true; + } else if (arg == "--rank-ffn_down") { if (++i >= argc) { invalid_param = true; break; } - params->n_rank_w2 = std::stoi(argv[i]); - params->custom_n_rank_w2 = true; - } else if (arg == "--rank-w3") { + params->n_rank_ffn_down = std::stoi(argv[i]); + params->custom_n_rank_ffn_down = true; + } else if (arg == "--rank-ffn_up") { if (++i >= argc) { invalid_param = true; break; } - params->n_rank_w3 = std::stoi(argv[i]); - params->custom_n_rank_w3 = true; + params->n_rank_ffn_up = std::stoi(argv[i]); + params->custom_n_rank_ffn_up = true; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); train_print_usage(argc, argv, &default_params); @@ -1452,12 +1452,12 @@ static int64_t get_parameter_count(struct my_llama_lora* lora) { nx += ggml_nelements(layer.wo_b); nx += ggml_nelements(layer.ffn_norm_a); nx += ggml_nelements(layer.ffn_norm_b); - nx += ggml_nelements(layer.w1_a); - nx += ggml_nelements(layer.w1_b); - nx += ggml_nelements(layer.w2_a); - nx += ggml_nelements(layer.w2_b); - nx += ggml_nelements(layer.w3_a); - nx += ggml_nelements(layer.w3_b); + nx += ggml_nelements(layer.ffn_gate_a); + nx += ggml_nelements(layer.ffn_gate_b); + nx += ggml_nelements(layer.ffn_down_a); + nx += ggml_nelements(layer.ffn_down_b); + nx += ggml_nelements(layer.ffn_up_a); + nx += ggml_nelements(layer.ffn_up_b); } return nx; } @@ -1511,9 +1511,9 @@ int main(int argc, char ** argv) { uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r; uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r; uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1; - uint32_t n_rank_w1 = params.custom_n_rank_w1 ? params.n_rank_w1 : params.lora_r; - uint32_t n_rank_w2 = params.custom_n_rank_w2 ? params.n_rank_w2 : params.lora_r; - uint32_t n_rank_w3 = params.custom_n_rank_w3 ? params.n_rank_w3 : params.lora_r; + uint32_t n_rank_ffn_gate = params.custom_n_rank_ffn_gate ? params.n_rank_ffn_gate : params.lora_r; + uint32_t n_rank_ffn_down = params.custom_n_rank_ffn_down ? params.n_rank_ffn_down : params.lora_r; + uint32_t n_rank_ffn_up = params.custom_n_rank_ffn_up ? params.n_rank_ffn_up : params.lora_r; uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r; uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1; uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r; @@ -1523,9 +1523,9 @@ int main(int argc, char ** argv) { lora.hparams.n_rank_wv = n_rank_wv; lora.hparams.n_rank_wo = n_rank_wo; lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm; - lora.hparams.n_rank_w1 = n_rank_w1; - lora.hparams.n_rank_w2 = n_rank_w2; - lora.hparams.n_rank_w3 = n_rank_w3; + lora.hparams.n_rank_ffn_gate = n_rank_ffn_gate; + lora.hparams.n_rank_ffn_down = n_rank_ffn_down; + lora.hparams.n_rank_ffn_up = n_rank_ffn_up; lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings; lora.hparams.n_rank_norm = n_rank_norm; lora.hparams.n_rank_output = n_rank_output; @@ -1566,9 +1566,9 @@ int main(int argc, char ** argv) { || (lora.hparams.n_rank_wv != n_rank_wv) || (lora.hparams.n_rank_wo != n_rank_wo) || (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm) - || (lora.hparams.n_rank_w1 != n_rank_w1) - || (lora.hparams.n_rank_w2 != n_rank_w2) - || (lora.hparams.n_rank_w3 != n_rank_w3) + || (lora.hparams.n_rank_ffn_gate != n_rank_ffn_gate) + || (lora.hparams.n_rank_ffn_down != n_rank_ffn_down) + || (lora.hparams.n_rank_ffn_up != n_rank_ffn_up) || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings) || (lora.hparams.n_rank_norm != n_rank_norm) || (lora.hparams.n_rank_output != n_rank_output) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 2e2a8ce08..bfdf124d7 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -50,9 +50,9 @@ struct my_llama_layer { struct ggml_tensor * ffn_norm; // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; + struct ggml_tensor * ffn_gate; // w1 + struct ggml_tensor * ffn_down; // w2 + struct ggml_tensor * ffn_up; // w3 }; struct my_llama_model { @@ -140,9 +140,9 @@ static void set_param_model(struct my_llama_model * model) { ggml_set_param(ctx, layer.wv); ggml_set_param(ctx, layer.wo); ggml_set_param(ctx, layer.ffn_norm); - ggml_set_param(ctx, layer.w1); - ggml_set_param(ctx, layer.w2); - ggml_set_param(ctx, layer.w3); + ggml_set_param(ctx, layer.ffn_gate); + ggml_set_param(ctx, layer.ffn_down); + ggml_set_param(ctx, layer.ffn_up); } } @@ -198,9 +198,9 @@ static void init_model(struct my_llama_model * model) { layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); - layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); - layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); + layer.ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); + layer.ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); + layer.ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i)); @@ -211,9 +211,9 @@ static void init_model(struct my_llama_model * model) { ggml_set_name(layer.ffn_norm, tni(LLM_TENSOR_FFN_NORM, i)); - ggml_set_name(layer.w1, tni(LLM_TENSOR_FFN_GATE, i)); - ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i)); - ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i)); + ggml_set_name(layer.ffn_gate, tni(LLM_TENSOR_FFN_GATE, i)); + ggml_set_name(layer.ffn_down, tni(LLM_TENSOR_FFN_DOWN, i)); + ggml_set_name(layer.ffn_up, tni(LLM_TENSOR_FFN_UP, i)); } set_param_model(model); @@ -244,9 +244,9 @@ static void randomize_model(struct my_llama_model * model, int seed, float mean, randomize_tensor_normal(layer.ffn_norm, rnd); - randomize_tensor_normal(layer.w1, rnd); - randomize_tensor_normal(layer.w2, rnd); - randomize_tensor_normal(layer.w3, rnd); + randomize_tensor_normal(layer.ffn_gate, rnd); + randomize_tensor_normal(layer.ffn_down, rnd); + randomize_tensor_normal(layer.ffn_up, rnd); } free_random_normal_distribution(rnd); @@ -356,11 +356,11 @@ static struct ggml_tensor * llama_build_train_graphs( struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, f_norm_rms_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); - struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); - struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); + struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); + struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); - struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); + struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); cur = t30; checkpoints.push_back(cur); @@ -521,9 +521,9 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i)); copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i)); copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i)); - copy_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); - copy_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); - copy_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); + copy_tensor_by_name(layer.ffn_gate, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); + copy_tensor_by_name(layer.ffn_down, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); + copy_tensor_by_name(layer.ffn_up, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); } } @@ -664,9 +664,9 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo gguf_add_tensor(fctx, layer.wv); gguf_add_tensor(fctx, layer.wo); gguf_add_tensor(fctx, layer.ffn_norm); - gguf_add_tensor(fctx, layer.w1); - gguf_add_tensor(fctx, layer.w2); - gguf_add_tensor(fctx, layer.w3); + gguf_add_tensor(fctx, layer.ffn_gate); + gguf_add_tensor(fctx, layer.ffn_down); + gguf_add_tensor(fctx, layer.ffn_up); } } @@ -915,9 +915,9 @@ static int64_t get_parameter_count(struct my_llama_model* model) { nx += ggml_nelements(layer.wv); nx += ggml_nelements(layer.wo); nx += ggml_nelements(layer.ffn_norm); - nx += ggml_nelements(layer.w1); - nx += ggml_nelements(layer.w2); - nx += ggml_nelements(layer.w3); + nx += ggml_nelements(layer.ffn_gate); + nx += ggml_nelements(layer.ffn_down); + nx += ggml_nelements(layer.ffn_up); } return nx; } diff --git a/ggml-backend.c b/ggml-backend.c index 9ee81b766..87eea8440 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -219,6 +219,10 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * GGML_ASSERT(buf != NULL && "tensor buffer not set"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + if (!size) { + return; + } + tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size); } @@ -229,6 +233,10 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); + if (!size) { + return; + } + tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size); } diff --git a/gguf-py/examples/reader.py b/gguf-py/examples/reader.py new file mode 100644 index 000000000..62e0769da --- /dev/null +++ b/gguf-py/examples/reader.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +import sys +from pathlib import Path +from gguf.gguf_reader import GGUFReader + + +sys.path.insert(0, str(Path(__file__).parent.parent)) + + +def read_gguf_file(gguf_file_path): + """ + Reads and prints key-value pairs and tensor information from a GGUF file in an improved format. + + Parameters: + - gguf_file_path: Path to the GGUF file. + """ + + reader = GGUFReader(gguf_file_path) + + # List all key-value pairs in a columnized format + print("Key-Value Pairs:") + max_key_length = max(len(key) for key in reader.fields.keys()) + for key, field in reader.fields.items(): + value = field.parts[field.data[0]] + print(f"{key:{max_key_length}} : {value}") + print("----") + + # List all tensors + print("Tensors:") + tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}" + print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) + print("-" * 80) + for tensor in reader.tensors: + shape_str = "x".join(map(str, tensor.shape)) + size_str = str(tensor.n_elements) + quantization_str = tensor.tensor_type.name + print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) + + +if __name__ == '__main__': + if len(sys.argv) < 2: + print("Usage: reader.py ") + sys.exit(1) + gguf_file_path = sys.argv[1] + read_gguf_file(gguf_file_path) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index a9c13dd38..5fba01714 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -40,6 +40,7 @@ class Keys: TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" EXPERT_COUNT = "{arch}.expert_count" EXPERT_USED_COUNT = "{arch}.expert_used_count" + POOLING_LAYER = "{arch}.pooling_layer" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -86,27 +87,28 @@ class Keys: class MODEL_ARCH(IntEnum): - LLAMA = auto() - FALCON = auto() - BAICHUAN = auto() - GPT2 = auto() - GPTJ = auto() - GPTNEOX = auto() - MPT = auto() - STARCODER = auto() - PERSIMMON = auto() - REFACT = auto() - BERT = auto() - BLOOM = auto() - STABLELM = auto() - QWEN = auto() - QWEN2 = auto() - PHI2 = auto() - PLAMO = auto() - CODESHELL = auto() - ORION = auto() + LLAMA = auto() + FALCON = auto() + BAICHUAN = auto() + GPT2 = auto() + GPTJ = auto() + GPTNEOX = auto() + MPT = auto() + STARCODER = auto() + PERSIMMON = auto() + REFACT = auto() + BERT = auto() + NOMIC_BERT = auto() + BLOOM = auto() + STABLELM = auto() + QWEN = auto() + QWEN2 = auto() + PHI2 = auto() + PLAMO = auto() + CODESHELL = auto() + ORION = auto() INTERNLM2 = auto() - MINICPM = auto() + MINICPM = auto() class MODEL_TENSOR(IntEnum): @@ -152,6 +154,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.PERSIMMON: "persimmon", MODEL_ARCH.REFACT: "refact", MODEL_ARCH.BERT: "bert", + MODEL_ARCH.NOMIC_BERT: "nomic-bert", MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.QWEN: "qwen", @@ -281,6 +284,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_UP, MODEL_TENSOR.LAYER_OUT_NORM, ], + MODEL_ARCH.NOMIC_BERT: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.TOKEN_TYPES, + MODEL_TENSOR.POS_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_OUT_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.LAYER_OUT_NORM, + ], MODEL_ARCH.MPT: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 7af58a46c..d87bd8e88 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -360,6 +360,9 @@ class GGUFWriter: def add_causal_attention(self, value: bool) -> None: self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) + def add_pooling_layer(self, value: bool) -> None: + self.add_bool(Keys.LLM.POOLING_LAYER.format(arch=self.arch), value) + def add_rope_dimension_count(self, count: int) -> None: self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index c7ba1420e..861003776 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -15,7 +15,7 @@ class TensorNameMap: "word_embeddings", # bloom "model.embed_tokens", # llama-hf "tok_embeddings", # llama-pth - "embeddings.word_embeddings", # bert + "embeddings.word_embeddings", # bert nomic-bert "language_model.embedding.word_embeddings", # persimmon "wte", # gpt2 "transformer.embd.wte", # phi2 @@ -24,13 +24,14 @@ class TensorNameMap: # Token type embeddings MODEL_TENSOR.TOKEN_TYPES: ( - "embeddings.token_type_embeddings", # bert + "embeddings.token_type_embeddings", # bert nomic-bert ), # Normalization of token embeddings MODEL_TENSOR.TOKEN_EMBD_NORM: ( "word_embeddings_layernorm", # bloom "embeddings.LayerNorm", # bert + "emb_ln", # nomic-bert ), # Position embeddings @@ -103,6 +104,7 @@ class TensorNameMap: "model.layers.{bid}.self_attn.query_key_value", # persimmon "h.{bid}.attn.c_attn", # gpt2 "transformer.h.{bid}.mixer.Wqkv", # phi2 + "encoder.layers.{bid}.attn.Wqkv", # nomic-bert ), # Attention query @@ -152,11 +154,13 @@ class TensorNameMap: "transformer.h.{bid}.mixer.out_proj", # phi2 "model.layers.layers.{bid}.self_attn.o_proj", # plamo "model.layers.{bid}.attention.wo", # internlm2 + "encoder.layers.{bid}.attn.out_proj", # nomic-bert ), # Attention output norm MODEL_TENSOR.ATTN_OUT_NORM: ( "encoder.layer.{bid}.attention.output.LayerNorm", # bert + "encoder.layers.{bid}.norm1", # nomic-bert ), # Rotary embeddings @@ -205,6 +209,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.fc1", # phi2 "model.layers.layers.{bid}.mlp.up_proj", # plamo "model.layers.{bid}.feed_forward.w3", # internlm2 + "encoder.layers.{bid}.mlp.fc11", # nomic-bert ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -224,6 +229,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.w2", # qwen "model.layers.layers.{bid}.mlp.gate_proj", # plamo "model.layers.{bid}.feed_forward.w1", # internlm2 + "encoder.layers.{bid}.mlp.fc12", # nomic-bert ), MODEL_TENSOR.FFN_GATE_EXP: ( @@ -249,6 +255,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.fc2", # phi2 "model.layers.layers.{bid}.mlp.down_proj", # plamo "model.layers.{bid}.feed_forward.w2", # internlm2 + "encoder.layers.{bid}.mlp.fc2", # nomic-bert ), MODEL_TENSOR.FFN_DOWN_EXP: ( @@ -272,6 +279,7 @@ class TensorNameMap: MODEL_TENSOR.LAYER_OUT_NORM: ( "encoder.layer.{bid}.output.LayerNorm", # bert + "encoder.layers.{bid}.norm2", # nomic-bert ) } diff --git a/llama.cpp b/llama.cpp index bc0d62bb3..81f414d3c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -197,6 +197,7 @@ enum llm_arch { LLM_ARCH_PERSIMMON, LLM_ARCH_REFACT, LLM_ARCH_BERT, + LLM_ARCH_NOMIC_BERT, LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, LLM_ARCH_QWEN, @@ -211,27 +212,28 @@ enum llm_arch { }; static std::map LLM_ARCH_NAMES = { - { LLM_ARCH_LLAMA, "llama" }, - { LLM_ARCH_FALCON, "falcon" }, - { LLM_ARCH_GPT2, "gpt2" }, - { LLM_ARCH_GPTJ, "gptj" }, - { LLM_ARCH_GPTNEOX, "gptneox" }, - { LLM_ARCH_MPT, "mpt" }, - { LLM_ARCH_BAICHUAN, "baichuan" }, - { LLM_ARCH_STARCODER, "starcoder" }, - { LLM_ARCH_PERSIMMON, "persimmon" }, - { LLM_ARCH_REFACT, "refact" }, - { LLM_ARCH_BERT, "bert" }, - { LLM_ARCH_BLOOM, "bloom" }, - { LLM_ARCH_STABLELM, "stablelm" }, - { LLM_ARCH_QWEN, "qwen" }, - { LLM_ARCH_QWEN2, "qwen2" }, - { LLM_ARCH_PHI2, "phi2" }, - { LLM_ARCH_PLAMO, "plamo" }, - { LLM_ARCH_CODESHELL, "codeshell" }, - { LLM_ARCH_ORION, "orion" }, - { LLM_ARCH_INTERNLM2, "internlm2" }, - { LLM_ARCH_MINICPM, "minicpm" }, + { LLM_ARCH_LLAMA, "llama" }, + { LLM_ARCH_FALCON, "falcon" }, + { LLM_ARCH_GPT2, "gpt2" }, + { LLM_ARCH_GPTJ, "gptj" }, + { LLM_ARCH_GPTNEOX, "gptneox" }, + { LLM_ARCH_MPT, "mpt" }, + { LLM_ARCH_BAICHUAN, "baichuan" }, + { LLM_ARCH_STARCODER, "starcoder" }, + { LLM_ARCH_PERSIMMON, "persimmon" }, + { LLM_ARCH_REFACT, "refact" }, + { LLM_ARCH_BERT, "bert" }, + { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, + { LLM_ARCH_BLOOM, "bloom" }, + { LLM_ARCH_STABLELM, "stablelm" }, + { LLM_ARCH_QWEN, "qwen" }, + { LLM_ARCH_QWEN2, "qwen2" }, + { LLM_ARCH_PHI2, "phi2" }, + { LLM_ARCH_PLAMO, "plamo" }, + { LLM_ARCH_CODESHELL, "codeshell" }, + { LLM_ARCH_ORION, "orion" }, + { LLM_ARCH_INTERNLM2, "internlm2" }, + { LLM_ARCH_MINICPM, "minicpm" }, }; enum llm_kv { @@ -254,6 +256,7 @@ enum llm_kv { LLM_KV_TENSOR_DATA_LAYOUT, LLM_KV_EXPERT_COUNT, LLM_KV_EXPERT_USED_COUNT, + LLM_KV_POOLING_LAYER, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -311,6 +314,7 @@ static std::map LLM_KV_NAMES = { { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" }, { LLM_KV_EXPERT_COUNT, "%s.expert_count" }, { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" }, + { LLM_KV_POOLING_LAYER, "%s.pooling_layer" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -373,6 +377,7 @@ enum llm_tensor { LLM_TENSOR_ATTN_OUT, LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_NORM_2, + LLM_TENSOR_ATTN_OUT_NORM, LLM_TENSOR_ATTN_ROT_EMBD, LLM_TENSOR_FFN_GATE_INP, LLM_TENSOR_FFN_NORM, @@ -385,6 +390,7 @@ enum llm_tensor { LLM_TENSOR_FFN_UP_EXP, LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_LAYER_OUT_NORM, }; static std::map> LLM_TENSOR_NAMES = { @@ -550,12 +556,27 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, { LLM_TENSOR_TOKEN_TYPES, "token_types" }, { LLM_TENSOR_POS_EMBD, "position_embd" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_NOMIC_BERT, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_TOKEN_TYPES, "token_types" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, @@ -1483,6 +1504,7 @@ enum e_model { MODEL_22M, MODEL_33M, MODEL_109M, + MODEL_137M, MODEL_335M, MODEL_0_5B, MODEL_1B, @@ -1539,6 +1561,7 @@ struct llama_hparams { float f_max_alibi_bias; bool causal_attn = true; + bool pooling_layer = false; bool operator!=(const llama_hparams & other) const { @@ -1601,6 +1624,7 @@ struct llama_cparams { bool mul_mat_q; bool offload_kqv; + bool do_pooling; ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; @@ -1616,6 +1640,8 @@ struct llama_layer { struct ggml_tensor * attn_q_norm_b; struct ggml_tensor * attn_k_norm; struct ggml_tensor * attn_k_norm_b; + struct ggml_tensor * attn_out_norm; + struct ggml_tensor * attn_out_norm_b; // attention struct ggml_tensor * wq; @@ -1634,6 +1660,8 @@ struct llama_layer { // normalization struct ggml_tensor * ffn_norm; struct ggml_tensor * ffn_norm_b; + struct ggml_tensor * layer_out_norm; + struct ggml_tensor * layer_out_norm_b; // ff struct ggml_tensor * ffn_gate; // w1 @@ -1896,7 +1924,7 @@ struct llama_context { struct ggml_tensor * inp_pos; // I32 [n_batch] struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch] struct ggml_tensor * inp_K_shift; // I32 [n_ctx] - struct ggml_tensor * inp_sum; // F32 [1, n_batch] + struct ggml_tensor * inp_sum; // F32 [n_batch, n_batch] #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; @@ -2851,6 +2879,11 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { static const char * llama_model_type_name(e_model type) { switch (type) { + case MODEL_22M: return "22M"; + case MODEL_33M: return "33M"; + case MODEL_109M: return "109M"; + case MODEL_137M: return "137M"; + case MODEL_0_5B: return "0.5B"; case MODEL_1B: return "1B"; case MODEL_2B: return "2B"; case MODEL_3B: return "3B"; @@ -3053,6 +3086,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); + ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer); switch (hparams.n_layer) { case 3: @@ -3068,6 +3102,17 @@ static void llm_load_hparams( model.type = e_model::MODEL_335M; break; // bge-large } } break; + case LLM_ARCH_NOMIC_BERT: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); + ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer); + + if (hparams.n_layer == 12 && hparams.n_embd == 768) { + model.type = e_model::MODEL_137M; + } + } break; case LLM_ARCH_BLOOM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -3309,7 +3354,12 @@ static void llm_load_vocab( // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { - vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); + try { + vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); + } catch (const std::exception & e) { + LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what()); + vocab.linefeed_id = vocab.special_pad_id; + } } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) { vocab.linefeed_id = vocab.special_pad_id; } else { @@ -3865,10 +3915,14 @@ static bool llm_load_tensors( } } break; case LLM_ARCH_BERT: + case LLM_ARCH_NOMIC_BERT: { - model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); - model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); + if (model.arch == LLM_ARCH_BERT) { + model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); + } + model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); @@ -3878,29 +3932,38 @@ static bool llm_load_tensors( auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + if (model.arch == LLM_ARCH_BERT) { + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}); - layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); - layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); + } else { + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + } - layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); - layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); - layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); + layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); + layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}); - layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); - layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); - layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); + if (model.arch == LLM_ARCH_BERT) { + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); - layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); - layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + } else { + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + } + + layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}); + layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}); } } break; case LLM_ARCH_BLOOM: @@ -4379,9 +4442,21 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam model.hparams.vocab_only = params.vocab_only; - llm_load_arch (ml, model); - llm_load_hparams(ml, model); - llm_load_vocab (ml, model); + try { + llm_load_arch(ml, model); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model architecture: " + std::string(e.what())); + } + try { + llm_load_hparams(ml, model); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); + } + try { + llm_load_vocab(ml, model); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model vocabulary: " + std::string(e.what())); + } llm_load_print_meta(ml, model); @@ -4859,7 +4934,7 @@ struct llm_build_context { const int32_t n_orig_ctx; const bool do_rope_shift; - const bool causal_attn; + const bool do_pooling; const llm_build_cb & cb; @@ -4903,7 +4978,7 @@ struct llm_build_context { kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), do_rope_shift (worst_case || kv_self.has_shift), - causal_attn (hparams.causal_attn), + do_pooling (hparams.pooling_layer && cparams.do_pooling), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { // all initializations should be done in init() @@ -5751,22 +5826,26 @@ struct llm_build_context { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); struct ggml_tensor * cur; struct ggml_tensor * inpL; // get input vectors with right size + const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type); struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - struct ggml_tensor * inp_sum = ggml_view_1d(ctx0, lctx.inp_sum, n_tokens, 0); + struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0); // construct input embeddings (token, type, position) inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + // token types are hardcoded to zero ("Sentence A") struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); inpL = ggml_add(ctx0, inpL, type_row0); - inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL); + if (model.arch == LLM_ARCH_BERT) { + inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL); + } cb(inpL, "inp_embd", -1); // embed layer norm @@ -5782,7 +5861,7 @@ struct llm_build_context { struct ggml_tensor * cur = inpL; // self-attention - { + if (model.arch == LLM_ARCH_BERT) { struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq); cb(Qcur, "Qcur", il); @@ -5795,6 +5874,37 @@ struct llm_build_context { // seems like we just need to do this for Q? Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } else { + // compute Q and K and RoPE them + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); @@ -5805,25 +5915,34 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); // attention layer norm - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il); + cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il); struct ggml_tensor * ffn_inp = cur; cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + if (model.arch == LLM_ARCH_BERT) { + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + } else { + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + } cb(cur, "ffn_out", il); // attentions bypass the intermediate layer cur = ggml_add(ctx0, cur, ffn_inp); // output layer norm - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); + cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il); // input for next layer inpL = cur; @@ -5832,9 +5951,11 @@ struct llm_build_context { // final output cur = inpL; - // pooling - cur = ggml_mul_mat(ctx0, inp_sum, ggml_cont(ctx0, ggml_transpose(ctx0, cur))); - cb(cur, "result_embed", -1); + // pooling layer + if (do_pooling) { + cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum); + } + cb(cur, "result_embd", -1); ggml_build_forward_expand(gf, cur); @@ -7264,6 +7385,7 @@ static struct ggml_cgraph * llama_build_graph( result = llm.build_refact(); } break; case LLM_ARCH_BERT: + case LLM_ARCH_NOMIC_BERT: { result = llm.build_bert(); } break; @@ -7367,7 +7489,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { for (int i = 0; i < n_kv; ++i) { float f; - if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { + if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || + (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) { f = -INFINITY; } else { f = 0; @@ -7378,7 +7501,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - { assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer)); float * data = (float *) lctx.inp_sum->data; @@ -7399,6 +7521,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { data[i] = lctx.kv_self.cells[i].delta; } } + + if (hparams.pooling_layer && cparams.do_pooling) { + const int64_t n_tokens = batch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer)); + float * data = (float *) lctx.inp_sum->data; + + memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum)); + + for (int i = 0; i < n_tokens; ++i) { + const llama_seq_id seq_id = batch.seq_id[i][0]; + data[seq_id*n_tokens + i] = 1.0f; + } + } } // decode a batch of tokens by evaluating the transformer @@ -7510,7 +7646,7 @@ static int llama_decode_internal( embeddings = gf->nodes[gf->n_nodes - 3]; GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); } - } else if (strcmp(res->name, "result_embed") == 0) { + } else if (strcmp(res->name, "result_embd") == 0) { embeddings = res; res = nullptr; } else { @@ -7630,11 +7766,12 @@ static int llama_decode_internal( if (!lctx.embedding.empty()) { auto & embedding_out = lctx.embedding; - const int64_t embed_pos = res ? n_embd * (n_tokens-1) : 0; + const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0; + const int64_t embd_size = res ? n_embd : n_embd * n_tokens; - embedding_out.resize(n_embd); + embedding_out.resize(embd_size); ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings); - ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embed_pos*sizeof(float), n_embd*sizeof(float)); + ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float)); ggml_backend_synchronize(embeddings_backend); } @@ -7711,7 +7848,13 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { switch (llama_vocab_get_type(vocab)) { case LLAMA_VOCAB_TYPE_SPM: { const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; - return vocab.token_to_id.at(buf); + auto token = vocab.token_to_id.find(buf); + if (token != vocab.token_to_id.end()) { + return (*token).second; + } + // Try to fall back to just the byte as a string + const char buf2[2] = { (char)ch, 0 }; + return vocab.token_to_id.at(buf2); } case LLAMA_VOCAB_TYPE_WPM: case LLAMA_VOCAB_TYPE_BPE: { @@ -7759,7 +7902,7 @@ struct llm_bigram_spm { }; struct llm_tokenizer_spm { - llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {} + llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {} void tokenize(const std::string & text, std::vector & output) { // split string into utf8 chars @@ -7834,6 +7977,7 @@ private: if (p == rev_merge.end()) { // output any symbols that did not form tokens as bytes. + output.reserve(output.size() + symbol.n); for (int j = 0; j < (int)symbol.n; ++j) { llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]); output.push_back(token_id); @@ -8396,17 +8540,18 @@ struct fragment_buffer_variant { token(_token), raw_text(_dummy), offset(0), - length(0){} + length(0) {} + fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length) : type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT), - token((llama_vocab::id)-1), + token((llama_vocab::id) - 1), raw_text(_raw_text), offset(_offset), length(_length){ - GGML_ASSERT( _offset >= 0 ); - GGML_ASSERT( _length >= 1 ); - GGML_ASSERT( offset + length <= raw_text.length() ); + GGML_ASSERT(_offset >= 0); + GGML_ASSERT(_length >= 1); + GGML_ASSERT(offset + length <= raw_text.length()); } const FRAGMENT_BUFFER_VARIANT_TYPE type; @@ -8530,14 +8675,14 @@ static std::vector llama_tokenize_internal(const llama_vocab & } std::forward_list fragment_buffer; - fragment_buffer.emplace_front( raw_text, 0, raw_text.length() ); + fragment_buffer.emplace_front(raw_text, 0, raw_text.length()); - if (special) tokenizer_st_partition( vocab, fragment_buffer ); + if (special) tokenizer_st_partition(vocab, fragment_buffer); switch (vocab.type) { case LLAMA_VOCAB_TYPE_SPM: { - for (const auto & fragment: fragment_buffer) { + for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { // without adding this leading whitespace, we do not get the same results as the original tokenizer @@ -8565,7 +8710,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & } break; case LLAMA_VOCAB_TYPE_BPE: { - for (const auto & fragment: fragment_buffer) { + for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); @@ -8581,7 +8726,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & } break; case LLAMA_VOCAB_TYPE_WPM: { - for (const auto & fragment: fragment_buffer) { + for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); @@ -10444,7 +10589,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s quantize &= !params->only_copy; // do not quantize expert gating tensors - quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; + quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight"); + + // do not quantize positional embeddings and token types (BERT) + quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); + quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); enum ggml_type new_type; void * new_data; @@ -10946,6 +11095,7 @@ struct llama_context_params llama_context_default_params() { /*.logits_all =*/ false, /*.embedding =*/ false, /*.offload_kqv =*/ true, + /*.do_pooling =*/ true, }; return result; @@ -11101,6 +11251,7 @@ struct llama_context * llama_new_context_with_model( cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.mul_mat_q = params.mul_mat_q; cparams.offload_kqv = params.offload_kqv; + cparams.do_pooling = params.do_pooling; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; @@ -11248,7 +11399,7 @@ struct llama_context * llama_new_context_with_model( // resized during inference, reserve maximum ctx->logits.reserve(hparams.n_vocab*cparams.n_batch); - if (params.embedding){ + if (params.embedding) { ctx->embedding.resize(hparams.n_embd); } @@ -11266,7 +11417,7 @@ struct llama_context * llama_new_context_with_model( ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch); ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx); - ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, 1, cparams.n_batch); + ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch); ggml_set_name(ctx->inp_tokens, "inp_tokens"); ggml_set_name(ctx->inp_embd, "inp_embd"); @@ -12124,6 +12275,10 @@ float * llama_get_embeddings(struct llama_context * ctx) { return ctx->embedding.data(); } +float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) { + return ctx->embedding.data() + i*ctx->model.hparams.n_embd; +} + const char * llama_token_get_text(const struct llama_model * model, llama_token token) { return model->vocab.id_to_token[token].text.c_str(); } diff --git a/llama.h b/llama.h index ae134a365..fe9e05f1d 100644 --- a/llama.h +++ b/llama.h @@ -236,6 +236,7 @@ extern "C" { bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool embedding; // embedding mode only bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU + bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) }; // model quantization parameters @@ -628,6 +629,10 @@ extern "C" { // shape: [n_embd] (1-dimensional) LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); + // Get the embeddings for the ith sequence + // llama_get_embeddings(ctx) + i*n_embd + LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); + // // Vocab // diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index eb06123d2..9af8517d9 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2129,14 +2129,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_pad()); test_cases.emplace_back(new test_leaky_relu()); + // these tests are disabled to save execution time, but they can be handy for debugging +#if 0 #if !defined(__SANITIZE_THREAD__) // FIXME: these tests use too much memory with thread sanitizer test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024)); //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336)); #endif - - // these tests are disabled to save execution time, but they can be handy for debugging -#if 0 test_cases.emplace_back(new test_llama(1)); test_cases.emplace_back(new test_llama(2)); test_cases.emplace_back(new test_falcon(1)); diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp index ed4841198..dcbfbdd3e 100644 --- a/tests/test-tokenizer-1-bpe.cpp +++ b/tests/test-tokenizer-1-bpe.cpp @@ -4,13 +4,13 @@ #include "console.h" #include +#include #include #include -#include -#include -#include -#include #include +#include +#include +#include int main(int argc, char **argv) { if (argc < 2) { @@ -74,45 +74,46 @@ int main(int argc, char **argv) { } } catch (const std::invalid_argument &) { - fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str()); + //fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str()); } } - for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) { - // NOTE: these exceptions seem to be necessary, because the GPT2 tokenizer doesn't want to interfere with some ASCII control characters - if ((cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 && (cp < 0x13 || cp > 0x17) && cp != 0x19 && (cp < 0x1c || cp > 0x1e) && (cp < 0xd800 || cp > 0xdfff)) { - std::string str = " " + codepoint_to_utf8(cp); - std::vector tokens = llama_tokenize(ctx, str, false); - std::string check = llama_detokenize_bpe(ctx, tokens); - if (str != check) { - fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", - __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); - return 3; - } - } - } - // Restrict to assigned unicode planes - // for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) { - for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) { - std::string str = codepoint_to_utf8(cp); - std::vector tokens = llama_tokenize(ctx, str, false); - std::string check = llama_detokenize_bpe(ctx, tokens); - if (str != check) { - fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", - __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); - return 4; - } - } - for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) { - std::string str = codepoint_to_utf8(cp); - std::vector tokens = llama_tokenize(ctx, str, false); - std::string check = llama_detokenize_bpe(ctx, tokens); - if (str != check) { - fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", - __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); - return 4; + // unicode + { + const int nthread = std::thread::hardware_concurrency(); + + std::vector threads(nthread); + + for (int i = 0; i < nthread; ++i) { + threads[i] = std::thread([i, nthread, ctx]() { + for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) { + if (!( // NOLINT + (cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 && + (cp < 0x13 || cp > 0x17) && cp != 0x19 && + (cp < 0x1c || cp > 0x1e) && + (cp < 0xd800 || cp > 0xdfff) && + (cp < 0x00040000 || cp >= 0x000e0000) + )) { + continue; + } + + std::string str = codepoint_to_utf8(cp); + std::vector tokens = llama_tokenize(ctx, str, false); + std::string check = llama_detokenize_bpe(ctx, tokens); + if (cp != 9601 && str != check) { + fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", + cp, check.c_str(), check.length(), str.c_str(), str.length()); + std::exit(3); + } + } + }); + } + + for (auto & t : threads) { + t.join(); } } + llama_free_model(model); llama_free(ctx); diff --git a/tests/test-tokenizer-1-llama.cpp b/tests/test-tokenizer-1-llama.cpp index 0d4f54d92..9a36c9e79 100644 --- a/tests/test-tokenizer-1-llama.cpp +++ b/tests/test-tokenizer-1-llama.cpp @@ -4,13 +4,13 @@ #include "console.h" #include +#include #include #include -#include -#include -#include -#include #include +#include +#include +#include int main(int argc, char **argv) { if (argc < 2) { @@ -72,26 +72,33 @@ int main(int argc, char **argv) { } } - for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) { - if (cp < 0xd800 || cp > 0xdfff) { - std::string str = codepoint_to_utf8(cp); - std::vector tokens = llama_tokenize(ctx, str, false); - std::string check = llama_detokenize_spm(ctx, tokens); - if (cp != 9601 && str != check) { - fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", - __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); - return 3; - } + // unicode + { + const int nthread = std::thread::hardware_concurrency(); + + std::vector threads(nthread); + + for (int i = 0; i < nthread; ++i) { + threads[i] = std::thread([i, nthread, ctx]() { + for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) { + if (cp >= 0xd800 && cp <= 0xdfff) { + continue; + } + + std::string str = codepoint_to_utf8(cp); + std::vector tokens = llama_tokenize(ctx, str, false); + std::string check = llama_detokenize_spm(ctx, tokens); + if (cp != 9601 && str != check) { + fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", + cp, check.c_str(), check.length(), str.c_str(), str.length()); + std::exit(3); + } + } + }); } - } - for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) { - std::string str = codepoint_to_utf8(cp); - std::vector tokens = llama_tokenize(ctx, str, false); - std::string check = llama_detokenize_spm(ctx, tokens); - if (str != check) { - fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", - __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); - return 4; + + for (auto & t : threads) { + t.join(); } } diff --git a/unicode.h b/unicode.h index 844eff3da..263260702 100644 --- a/unicode.h +++ b/unicode.h @@ -264,26 +264,29 @@ static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) { offset += 1; return result; } - else if (!(utf8[offset + 0] & 0x40)) { + if (!(utf8[offset + 0] & 0x40)) { throw std::invalid_argument("invalid character"); } - else if (!(utf8[offset + 0] & 0x20)) { - if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) + if (!(utf8[offset + 0] & 0x20)) { + if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) { throw std::invalid_argument("invalid character"); + } auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f); offset += 2; return result; } - else if (!(utf8[offset + 0] & 0x10)) { - if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) + if (!(utf8[offset + 0] & 0x10)) { + if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) { throw std::invalid_argument("invalid character"); + } auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f); offset += 3; return result; } - else if (!(utf8[offset + 0] & 0x08)) { - if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) + if (!(utf8[offset + 0] & 0x08)) { + if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) { throw std::invalid_argument("invalid character"); + } auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f); offset += 4; return result; @@ -331,21 +334,22 @@ static uint32_t codepoint_from_utf16(const std::vector & utf16, size_t offset += 1; return result; } - else { - if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) - throw std::invalid_argument("invalid character"); - auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff)); - offset += 2; - return result; + + if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) { + throw std::invalid_argument("invalid character"); } - throw std::invalid_argument("invalid string"); + + auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff)); + offset += 2; + return result; } static std::vector codepoints_from_utf16(const std::vector & utf16) { std::vector result; size_t offset = 0; - while (offset < utf16.size()) + while (offset < utf16.size()) { result.push_back(codepoint_from_utf16(utf16, offset)); + } return result; } @@ -361,44 +365,52 @@ static std::vector codepoints_from_utf16(const std::vector & static std::unordered_map codepoint_type_map() { std::unordered_map codepoint_types; for (auto p : digit_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_DIGIT; + } } - for(auto p : letter_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto p : letter_ranges) { + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_LETTER; + } } - for(auto p : whitespace_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto p : whitespace_ranges) { + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE; + } } - for(auto p : accent_mark_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto p : accent_mark_ranges) { + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK; + } } - for(auto p : punctuation_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto p : punctuation_ranges) { + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION; + } } - for (auto p : symbol_ranges) { - for (auto i = p.first; i <= p.second; ++i) + for (auto p : symbol_ranges) { + for (auto i = p.first; i <= p.second; ++i) { codepoint_types[i] = CODEPOINT_TYPE_SYMBOL; + } } - for(auto p : control_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto p : control_ranges) { + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_CONTROL; + } } return codepoint_types; } static int codepoint_type(uint32_t cp) { static std::unordered_map codepoint_types = codepoint_type_map(); - return codepoint_types[cp]; + return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp); } static int codepoint_type(const std::string & utf8) { - if (utf8.length() == 0) + if (utf8.length() == 0) { return CODEPOINT_TYPE_UNIDENTIFIED; + } size_t offset = 0; return codepoint_type(codepoint_from_utf8(utf8, offset)); }