From 99b8b43d7b185a6483f28cf798a2d968b2e16ca7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Feb 2024 11:20:24 +0200 Subject: [PATCH 01/11] tests : disable moe test (#5473) --- tests/test-backend-ops.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index eb06123d2..9af8517d9 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2129,14 +2129,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_pad()); test_cases.emplace_back(new test_leaky_relu()); + // these tests are disabled to save execution time, but they can be handy for debugging +#if 0 #if !defined(__SANITIZE_THREAD__) // FIXME: these tests use too much memory with thread sanitizer test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024)); //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336)); #endif - - // these tests are disabled to save execution time, but they can be handy for debugging -#if 0 test_cases.emplace_back(new test_llama(1)); test_cases.emplace_back(new test_llama(2)); test_cases.emplace_back(new test_falcon(1)); From 49cc1f7d67de2da99f3ac185f9ff1319b7bf35f8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Feb 2024 13:01:29 +0200 Subject: [PATCH 02/11] bert : add tests + fix quantization (#5475) * llama : do not quantize pos embd and token type tensors * ci : add BERT tests ggml-ci * ci : do not do BERT tests on low-perf nodes ggml-ci --- ci/run.sh | 46 ++++++++++++++++++++++++++++++++++++++++++++++ llama.cpp | 6 +++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/ci/run.sh b/ci/run.sh index 82fe247a5..a4264d775 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -568,6 +568,50 @@ function gg_sum_open_llama_7b_v2 { #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)" } +# bge-small + +function gg_run_embd_bge_small { + cd ${SRC} + + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt + + path_models="../models-mnt/bge-small" + + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release + + set -e + + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + + python3 ../convert-hf-to-gguf.py ${path_models} + + model_f16="${path_models}/ggml-model-f16.gguf" + model_q8_0="${path_models}/ggml-model-q8_0.gguf" + + ./bin/quantize ${model_f16} ${model_q8_0} q8_0 + + (time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + + set +e +} + +function gg_sum_embd_bge_small { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'BGE Small (BERT):\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" + gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" +} + ## main if [ -z ${GG_BUILD_LOW_PERF} ]; then @@ -591,6 +635,8 @@ test $ret -eq 0 && gg_run ctest_debug test $ret -eq 0 && gg_run ctest_release if [ -z ${GG_BUILD_LOW_PERF} ]; then + test $ret -eq 0 && gg_run embd_bge_small + if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then if [ -z ${GG_BUILD_CUDA} ]; then test $ret -eq 0 && gg_run open_llama_3b_v2 diff --git a/llama.cpp b/llama.cpp index d316d067b..6dce392df 100644 --- a/llama.cpp +++ b/llama.cpp @@ -10444,7 +10444,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s quantize &= !params->only_copy; // do not quantize expert gating tensors - quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; + quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight"); + + // do not quantize positional embeddings and token types (BERT) + quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); + quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); enum ggml_type new_type; void * new_data; From ad014bba97ef6ef6c3e2f78b2fc463e91ae94579 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 13 Feb 2024 12:38:37 +0100 Subject: [PATCH 03/11] make: add error message for bad CUDA version (#5444) * make: add error message for bad CUDA version * Update Makefile Co-authored-by: Jared Van Bortel --------- Co-authored-by: Jared Van Bortel --- Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Makefile b/Makefile index ba73f0637..0a2070b53 100644 --- a/Makefile +++ b/Makefile @@ -569,6 +569,14 @@ $(info I CC: $(shell $(CC) --version | head -n 1)) $(info I CXX: $(shell $(CXX) --version | head -n 1)) ifdef LLAMA_CUBLAS $(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) +CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])') +ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) +ifndef CUDA_DOCKER_ARCH +ifndef CUDA_POWER_ARCH +$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH) +endif # CUDA_POWER_ARCH +endif # CUDA_DOCKER_ARCH +endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1) endif # LLAMA_CUBLAS $(info ) From 03bf161eb6dea6400ee49c6dc6b69bdcfa9fd3fc Mon Sep 17 00:00:00 2001 From: Douglas Hanley Date: Tue, 13 Feb 2024 06:06:58 -0600 Subject: [PATCH 04/11] llama : support batched embeddings (#5466) * batched embedding: pool outputs by sequence id. updated embedding example * bring back non-causal attention * embd : minor improvements * llama : minor --------- Co-authored-by: Georgi Gerganov --- convert-hf-to-gguf.py | 1 + examples/embedding/embedding.cpp | 146 +++++++++++++++++++++++-------- gguf-py/gguf/constants.py | 1 + gguf-py/gguf/gguf_writer.py | 3 + llama.cpp | 61 +++++++++---- llama.h | 5 ++ 6 files changed, 163 insertions(+), 54 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index cae1551a2..5adfdc143 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1648,6 +1648,7 @@ class BertModel(Model): self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) self.gguf_writer.add_causal_attention(False) + self.gguf_writer.add_pooling_layer(True) self.gguf_writer.add_file_type(self.ftype) def set_vocab(self): diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 27376c8f0..b4688cf51 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -7,6 +7,51 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +static std::vector split_lines(const std::string & s) { + std::string line; + std::vector lines; + std::stringstream ss(s); + while (std::getline(ss, line)) { + lines.push_back(line); + } + return lines; +} + +static void batch_add_seq(llama_batch & batch, const std::vector & tokens, int seq_id) { + for (size_t i = 0; i < tokens.size(); i++) { + llama_batch_add(batch, tokens[i], i, { seq_id }, false); + } +} + +static void normalize(float * vec, float * out, int n) { + float norm = 0; + for (int i = 0; i < n; i++) { + norm += vec[i] * vec[i]; + } + norm = sqrt(norm); + for (int i = 0; i < n; i++) { + out[i] = vec[i] / norm; + } +} + +static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { + // clear previous kv_cache values (irrelevant for embeddings) + llama_kv_cache_clear(ctx); + + // run model + fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); + if (llama_decode(ctx, batch) < 0) { + fprintf(stderr, "%s : failed to decode\n", __func__); + } + + // normalize on copy + for (int k = 0; k < n_seq; k++) { + float * emb = llama_get_embeddings_ith(ctx, k); + float * out = output + k * n_embd; + normalize(emb, out, n_embd); + } +} + int main(int argc, char ** argv) { gpt_params params; @@ -55,59 +100,84 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s\n", get_system_info(params).c_str()); } - int n_past = 0; + // split the prompt into lines + std::vector prompts = split_lines(params.prompt); - // tokenize the prompt - auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); + // max batch size + const uint64_t n_batch = params.n_batch; + GGML_ASSERT(params.n_batch == params.n_ctx); + // tokenize the prompts and trim + std::vector> inputs; + for (const auto & prompt : prompts) { + auto inp = ::llama_tokenize(ctx, prompt, true); + if (inp.size() > n_batch) { + inp.resize(n_batch); + } + inputs.push_back(inp); + } + + // tokenization stats if (params.verbose_prompt) { - fprintf(stderr, "\n"); - fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + for (int i = 0; i < (int) inputs.size(); i++) { + fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); + for (int j = 0; j < (int) inputs[i].size(); j++) { + fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); + } + fprintf(stderr, "\n\n"); } - fprintf(stderr, "\n"); } - if (embd_inp.size() > (size_t)n_ctx) { - fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n", - __func__, embd_inp.size(), n_ctx); - return 1; - } - - while (!embd_inp.empty()) { - int n_tokens = std::min(params.n_batch, (int) embd_inp.size()); - if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return 1; - } - n_past += n_tokens; - embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens); - } + // initialize batch + const int n_prompts = prompts.size(); + struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts); + // allocate output const int n_embd = llama_n_embd(model); - auto * embeddings = llama_get_embeddings(ctx); + std::vector embeddings(n_prompts * n_embd, 0); + float * emb = embeddings.data(); - // l2-normalize embeddings - float norm = 0; - for (int i = 0; i < n_embd; i++) { - norm += embeddings[i] * embeddings[i]; - } - norm = sqrt(norm); - for (int i = 0; i < n_embd; i++) { - embeddings[i] /= norm; + // break into batches + int p = 0; // number of prompts processed already + int s = 0; // number of prompts in current batch + for (int k = 0; k < n_prompts; k++) { + // clamp to n_batch tokens + auto & inp = inputs[k]; + const uint64_t n_toks = inp.size(); + + // encode if at capacity + if (batch.n_tokens + n_toks > n_batch) { + float * out = emb + p * n_embd; + batch_decode(ctx, batch, out, s, n_embd); + llama_batch_clear(batch); + p += s; + s = 0; + } + + // add to batch + batch_add_seq(batch, inp, s); + s += 1; } - for (int i = 0; i < n_embd; i++) { - printf("%f ", embeddings[i]); - } - printf("\n"); + // final batch + float * out = emb + p * n_embd; + batch_decode(ctx, batch, out, s, n_embd); + // print first 3 embeddings + for (int j = 0; j < std::min(3, n_prompts); j++) { + fprintf(stderr, "embedding %d: ", j); + for (int i = 0; i < n_embd; i++) { + fprintf(stderr, "%f ", emb[j * n_embd + i]); + } + fprintf(stderr, "\n\n"); + } + fprintf(stderr, "\n"); + + // clean up llama_print_timings(ctx); llama_free(ctx); llama_free_model(model); - llama_backend_free(); return 0; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index a9c13dd38..644e1589c 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -40,6 +40,7 @@ class Keys: TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" EXPERT_COUNT = "{arch}.expert_count" EXPERT_USED_COUNT = "{arch}.expert_used_count" + POOLING_LAYER = "{arch}.pooling_layer" class Attention: HEAD_COUNT = "{arch}.attention.head_count" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 7af58a46c..d87bd8e88 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -360,6 +360,9 @@ class GGUFWriter: def add_causal_attention(self, value: bool) -> None: self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) + def add_pooling_layer(self, value: bool) -> None: + self.add_bool(Keys.LLM.POOLING_LAYER.format(arch=self.arch), value) + def add_rope_dimension_count(self, count: int) -> None: self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) diff --git a/llama.cpp b/llama.cpp index 6dce392df..eb6c46f36 100644 --- a/llama.cpp +++ b/llama.cpp @@ -254,6 +254,7 @@ enum llm_kv { LLM_KV_TENSOR_DATA_LAYOUT, LLM_KV_EXPERT_COUNT, LLM_KV_EXPERT_USED_COUNT, + LLM_KV_POOLING_LAYER, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -311,6 +312,7 @@ static std::map LLM_KV_NAMES = { { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" }, { LLM_KV_EXPERT_COUNT, "%s.expert_count" }, { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" }, + { LLM_KV_POOLING_LAYER, "%s.pooling_layer" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -1539,6 +1541,7 @@ struct llama_hparams { float f_max_alibi_bias; bool causal_attn = true; + bool pooling_layer = false; bool operator!=(const llama_hparams & other) const { @@ -1601,6 +1604,7 @@ struct llama_cparams { bool mul_mat_q; bool offload_kqv; + bool do_pooling; ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; @@ -1896,7 +1900,7 @@ struct llama_context { struct ggml_tensor * inp_pos; // I32 [n_batch] struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch] struct ggml_tensor * inp_K_shift; // I32 [n_ctx] - struct ggml_tensor * inp_sum; // F32 [1, n_batch] + struct ggml_tensor * inp_sum; // F32 [n_batch, n_batch] #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; @@ -3053,6 +3057,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); + ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer); switch (hparams.n_layer) { case 3: @@ -4859,7 +4864,7 @@ struct llm_build_context { const int32_t n_orig_ctx; const bool do_rope_shift; - const bool causal_attn; + const bool do_pooling; const llm_build_cb & cb; @@ -4903,7 +4908,7 @@ struct llm_build_context { kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), do_rope_shift (worst_case || kv_self.has_shift), - causal_attn (hparams.causal_attn), + do_pooling (hparams.pooling_layer && cparams.do_pooling), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { // all initializations should be done in init() @@ -5752,17 +5757,18 @@ struct llm_build_context { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); struct ggml_tensor * cur; struct ggml_tensor * inpL; // get input vectors with right size + const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type); struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - struct ggml_tensor * inp_sum = ggml_view_1d(ctx0, lctx.inp_sum, n_tokens, 0); + struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0); // construct input embeddings (token, type, position) inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + // token types are hardcoded to zero ("Sentence A") struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); inpL = ggml_add(ctx0, inpL, type_row0); @@ -5832,9 +5838,11 @@ struct llm_build_context { // final output cur = inpL; - // pooling - cur = ggml_mul_mat(ctx0, inp_sum, ggml_cont(ctx0, ggml_transpose(ctx0, cur))); - cb(cur, "result_embed", -1); + // pooling layer + if (do_pooling) { + cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum); + } + cb(cur, "result_embd", -1); ggml_build_forward_expand(gf, cur); @@ -7367,7 +7375,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { for (int i = 0; i < n_kv; ++i) { float f; - if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { + if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || + (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) { f = -INFINITY; } else { f = 0; @@ -7378,7 +7387,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - { assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer)); float * data = (float *) lctx.inp_sum->data; @@ -7399,6 +7407,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { data[i] = lctx.kv_self.cells[i].delta; } } + + if (hparams.pooling_layer && cparams.do_pooling) { + const int64_t n_tokens = batch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer)); + float * data = (float *) lctx.inp_sum->data; + + memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum)); + + for (int i = 0; i < n_tokens; ++i) { + const llama_seq_id seq_id = batch.seq_id[i][0]; + data[seq_id*n_tokens + i] = 1.0f; + } + } } // decode a batch of tokens by evaluating the transformer @@ -7510,7 +7532,7 @@ static int llama_decode_internal( embeddings = gf->nodes[gf->n_nodes - 3]; GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); } - } else if (strcmp(res->name, "result_embed") == 0) { + } else if (strcmp(res->name, "result_embd") == 0) { embeddings = res; res = nullptr; } else { @@ -7630,11 +7652,12 @@ static int llama_decode_internal( if (!lctx.embedding.empty()) { auto & embedding_out = lctx.embedding; - const int64_t embed_pos = res ? n_embd * (n_tokens-1) : 0; + const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0; + const int64_t embd_size = res ? n_embd : n_embd * n_tokens; - embedding_out.resize(n_embd); + embedding_out.resize(embd_size); ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings); - ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embed_pos*sizeof(float), n_embd*sizeof(float)); + ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float)); ggml_backend_synchronize(embeddings_backend); } @@ -10950,6 +10973,7 @@ struct llama_context_params llama_context_default_params() { /*.logits_all =*/ false, /*.embedding =*/ false, /*.offload_kqv =*/ true, + /*.do_pooling =*/ true, }; return result; @@ -11105,6 +11129,7 @@ struct llama_context * llama_new_context_with_model( cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.mul_mat_q = params.mul_mat_q; cparams.offload_kqv = params.offload_kqv; + cparams.do_pooling = params.do_pooling; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; @@ -11252,7 +11277,7 @@ struct llama_context * llama_new_context_with_model( // resized during inference, reserve maximum ctx->logits.reserve(hparams.n_vocab*cparams.n_batch); - if (params.embedding){ + if (params.embedding) { ctx->embedding.resize(hparams.n_embd); } @@ -11270,7 +11295,7 @@ struct llama_context * llama_new_context_with_model( ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch); ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx); - ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, 1, cparams.n_batch); + ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch); ggml_set_name(ctx->inp_tokens, "inp_tokens"); ggml_set_name(ctx->inp_embd, "inp_embd"); @@ -12128,6 +12153,10 @@ float * llama_get_embeddings(struct llama_context * ctx) { return ctx->embedding.data(); } +float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) { + return ctx->embedding.data() + i*ctx->model.hparams.n_embd; +} + const char * llama_token_get_text(const struct llama_model * model, llama_token token) { return model->vocab.id_to_token[token].text.c_str(); } diff --git a/llama.h b/llama.h index 367e8f1a1..5ef78ec96 100644 --- a/llama.h +++ b/llama.h @@ -236,6 +236,7 @@ extern "C" { bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool embedding; // embedding mode only bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU + bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) }; // model quantization parameters @@ -628,6 +629,10 @@ extern "C" { // shape: [n_embd] (1-dimensional) LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); + // Get the embeddings for the ith sequence + // llama_get_embeddings(ctx) + i*n_embd + LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); + // // Vocab // From cf45252a7cfcb998bade46a886e20477cecc538a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Feb 2024 15:14:22 +0200 Subject: [PATCH 05/11] tests : multi-thread the tokenizer tests (#5474) * tests : multi-thread the tokenizer tests ggml-ci * unicode : fix data race for unidentified codepoints ggml-ci * unicode : minor style fixes ggml-ci --- llama.cpp | 24 +++++----- tests/test-tokenizer-1-bpe.cpp | 77 ++++++++++++++++---------------- tests/test-tokenizer-1-llama.cpp | 53 ++++++++++++---------- unicode.h | 72 ++++++++++++++++------------- 4 files changed, 124 insertions(+), 102 deletions(-) diff --git a/llama.cpp b/llama.cpp index eb6c46f36..381a03068 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7782,7 +7782,7 @@ struct llm_bigram_spm { }; struct llm_tokenizer_spm { - llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {} + llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {} void tokenize(const std::string & text, std::vector & output) { // split string into utf8 chars @@ -7857,6 +7857,7 @@ private: if (p == rev_merge.end()) { // output any symbols that did not form tokens as bytes. + output.reserve(output.size() + symbol.n); for (int j = 0; j < (int)symbol.n; ++j) { llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]); output.push_back(token_id); @@ -8419,17 +8420,18 @@ struct fragment_buffer_variant { token(_token), raw_text(_dummy), offset(0), - length(0){} + length(0) {} + fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length) : type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT), - token((llama_vocab::id)-1), + token((llama_vocab::id) - 1), raw_text(_raw_text), offset(_offset), length(_length){ - GGML_ASSERT( _offset >= 0 ); - GGML_ASSERT( _length >= 1 ); - GGML_ASSERT( offset + length <= raw_text.length() ); + GGML_ASSERT(_offset >= 0); + GGML_ASSERT(_length >= 1); + GGML_ASSERT(offset + length <= raw_text.length()); } const FRAGMENT_BUFFER_VARIANT_TYPE type; @@ -8553,14 +8555,14 @@ static std::vector llama_tokenize_internal(const llama_vocab & } std::forward_list fragment_buffer; - fragment_buffer.emplace_front( raw_text, 0, raw_text.length() ); + fragment_buffer.emplace_front(raw_text, 0, raw_text.length()); - if (special) tokenizer_st_partition( vocab, fragment_buffer ); + if (special) tokenizer_st_partition(vocab, fragment_buffer); switch (vocab.type) { case LLAMA_VOCAB_TYPE_SPM: { - for (const auto & fragment: fragment_buffer) { + for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { // without adding this leading whitespace, we do not get the same results as the original tokenizer @@ -8588,7 +8590,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & } break; case LLAMA_VOCAB_TYPE_BPE: { - for (const auto & fragment: fragment_buffer) { + for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); @@ -8604,7 +8606,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & } break; case LLAMA_VOCAB_TYPE_WPM: { - for (const auto & fragment: fragment_buffer) { + for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp index 386530f23..3bb629561 100644 --- a/tests/test-tokenizer-1-bpe.cpp +++ b/tests/test-tokenizer-1-bpe.cpp @@ -4,13 +4,13 @@ #include "console.h" #include +#include #include #include -#include -#include -#include -#include #include +#include +#include +#include int main(int argc, char **argv) { if (argc < 2) { @@ -74,45 +74,46 @@ int main(int argc, char **argv) { } } catch (const std::invalid_argument &) { - fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str()); + //fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str()); } } - for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) { - // NOTE: these exceptions seem to be necessary, because the GPT2 tokenizer doesn't want to interfere with some ASCII control characters - if ((cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 && (cp < 0x13 || cp > 0x17) && cp != 0x19 && (cp < 0x1c || cp > 0x1e) && (cp < 0xd800 || cp > 0xdfff)) { - std::string str = " " + codepoint_to_utf8(cp); - std::vector tokens = llama_tokenize(ctx, str, false); - std::string check = llama_detokenize_bpe(ctx, tokens); - if (str != check) { - fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", - __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); - return 3; - } - } - } - // Restrict to assigned unicode planes - // for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) { - for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) { - std::string str = codepoint_to_utf8(cp); - std::vector tokens = llama_tokenize(ctx, str, false); - std::string check = llama_detokenize_bpe(ctx, tokens); - if (str != check) { - fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", - __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); - return 4; - } - } - for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) { - std::string str = codepoint_to_utf8(cp); - std::vector tokens = llama_tokenize(ctx, str, false); - std::string check = llama_detokenize_bpe(ctx, tokens); - if (str != check) { - fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", - __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); - return 4; + // unicode + { + const int nthread = std::thread::hardware_concurrency(); + + std::vector threads(nthread); + + for (int i = 0; i < nthread; ++i) { + threads[i] = std::thread([i, nthread, ctx]() { + for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) { + if (!( // NOLINT + (cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 && + (cp < 0x13 || cp > 0x17) && cp != 0x19 && + (cp < 0x1c || cp > 0x1e) && + (cp < 0xd800 || cp > 0xdfff) && + (cp < 0x00040000 || cp >= 0x000e0000) + )) { + continue; + } + + std::string str = codepoint_to_utf8(cp); + std::vector tokens = llama_tokenize(ctx, str, false); + std::string check = llama_detokenize_bpe(ctx, tokens); + if (cp != 9601 && str != check) { + fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", + cp, check.c_str(), check.length(), str.c_str(), str.length()); + std::exit(3); + } + } + }); + } + + for (auto & t : threads) { + t.join(); } } + llama_free_model(model); llama_free(ctx); diff --git a/tests/test-tokenizer-1-llama.cpp b/tests/test-tokenizer-1-llama.cpp index 4b58fe495..b0d814a41 100644 --- a/tests/test-tokenizer-1-llama.cpp +++ b/tests/test-tokenizer-1-llama.cpp @@ -4,13 +4,13 @@ #include "console.h" #include +#include #include #include -#include -#include -#include -#include #include +#include +#include +#include int main(int argc, char **argv) { if (argc < 2) { @@ -72,26 +72,33 @@ int main(int argc, char **argv) { } } - for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) { - if (cp < 0xd800 || cp > 0xdfff) { - std::string str = codepoint_to_utf8(cp); - std::vector tokens = llama_tokenize(ctx, str, false); - std::string check = llama_detokenize_spm(ctx, tokens); - if (cp != 9601 && str != check) { - fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", - __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); - return 3; - } + // unicode + { + const int nthread = std::thread::hardware_concurrency(); + + std::vector threads(nthread); + + for (int i = 0; i < nthread; ++i) { + threads[i] = std::thread([i, nthread, ctx]() { + for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) { + if (cp >= 0xd800 && cp <= 0xdfff) { + continue; + } + + std::string str = codepoint_to_utf8(cp); + std::vector tokens = llama_tokenize(ctx, str, false); + std::string check = llama_detokenize_spm(ctx, tokens); + if (cp != 9601 && str != check) { + fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", + cp, check.c_str(), check.length(), str.c_str(), str.length()); + std::exit(3); + } + } + }); } - } - for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) { - std::string str = codepoint_to_utf8(cp); - std::vector tokens = llama_tokenize(ctx, str, false); - std::string check = llama_detokenize_spm(ctx, tokens); - if (str != check) { - fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", - __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); - return 4; + + for (auto & t : threads) { + t.join(); } } diff --git a/unicode.h b/unicode.h index 844eff3da..263260702 100644 --- a/unicode.h +++ b/unicode.h @@ -264,26 +264,29 @@ static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) { offset += 1; return result; } - else if (!(utf8[offset + 0] & 0x40)) { + if (!(utf8[offset + 0] & 0x40)) { throw std::invalid_argument("invalid character"); } - else if (!(utf8[offset + 0] & 0x20)) { - if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) + if (!(utf8[offset + 0] & 0x20)) { + if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) { throw std::invalid_argument("invalid character"); + } auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f); offset += 2; return result; } - else if (!(utf8[offset + 0] & 0x10)) { - if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) + if (!(utf8[offset + 0] & 0x10)) { + if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) { throw std::invalid_argument("invalid character"); + } auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f); offset += 3; return result; } - else if (!(utf8[offset + 0] & 0x08)) { - if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) + if (!(utf8[offset + 0] & 0x08)) { + if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) { throw std::invalid_argument("invalid character"); + } auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f); offset += 4; return result; @@ -331,21 +334,22 @@ static uint32_t codepoint_from_utf16(const std::vector & utf16, size_t offset += 1; return result; } - else { - if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) - throw std::invalid_argument("invalid character"); - auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff)); - offset += 2; - return result; + + if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) { + throw std::invalid_argument("invalid character"); } - throw std::invalid_argument("invalid string"); + + auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff)); + offset += 2; + return result; } static std::vector codepoints_from_utf16(const std::vector & utf16) { std::vector result; size_t offset = 0; - while (offset < utf16.size()) + while (offset < utf16.size()) { result.push_back(codepoint_from_utf16(utf16, offset)); + } return result; } @@ -361,44 +365,52 @@ static std::vector codepoints_from_utf16(const std::vector & static std::unordered_map codepoint_type_map() { std::unordered_map codepoint_types; for (auto p : digit_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_DIGIT; + } } - for(auto p : letter_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto p : letter_ranges) { + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_LETTER; + } } - for(auto p : whitespace_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto p : whitespace_ranges) { + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE; + } } - for(auto p : accent_mark_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto p : accent_mark_ranges) { + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK; + } } - for(auto p : punctuation_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto p : punctuation_ranges) { + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION; + } } - for (auto p : symbol_ranges) { - for (auto i = p.first; i <= p.second; ++i) + for (auto p : symbol_ranges) { + for (auto i = p.first; i <= p.second; ++i) { codepoint_types[i] = CODEPOINT_TYPE_SYMBOL; + } } - for(auto p : control_ranges) { - for(auto i = p.first; i <= p.second; ++ i) + for (auto p : control_ranges) { + for (auto i = p.first; i <= p.second; ++ i) { codepoint_types[i] = CODEPOINT_TYPE_CONTROL; + } } return codepoint_types; } static int codepoint_type(uint32_t cp) { static std::unordered_map codepoint_types = codepoint_type_map(); - return codepoint_types[cp]; + return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp); } static int codepoint_type(const std::string & utf8) { - if (utf8.length() == 0) + if (utf8.length() == 0) { return CODEPOINT_TYPE_UNIDENTIFIED; + } size_t offset = 0; return codepoint_type(codepoint_from_utf8(utf8, offset)); } From 263978904c7472db1865409a7ff1129599f6a40b Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 13 Feb 2024 14:15:42 +0100 Subject: [PATCH 06/11] finetune : rename feed-forward tensors (w1/w2/w3) (#4839) * finetune: rename feed-forward tensors (w1/w2/w3) This commit renames the feed-forward tensors w1, w2 and w3 to ffn_gate, ffn_down and ffn_up respectively. The motivation for this change is to make it easier to understand the purpose of the tensors. This also seems to be inline with the names used in the llama_layer struct in llama.cpp. Signed-off-by: Daniel Bevenius * train-text-from-scratch: rename ff tensors This commit renames the feed-forward tensors w1, w2 and w3 to ffn_gate, ffn_down and ffn_up respectively. The motivation for this change is to make it easier to understand the purpose of the tensors. This also seems to be inline with the names used in the llama_layer struct in llama.cpp Signed-off-by: Daniel Bevenius --------- Signed-off-by: Daniel Bevenius --- examples/finetune/README.md | 6 +- examples/finetune/finetune.cpp | 242 +++++++++--------- .../train-text-from-scratch.cpp | 54 ++-- 3 files changed, 151 insertions(+), 151 deletions(-) diff --git a/examples/finetune/README.md b/examples/finetune/README.md index a884706c5..2fafd505e 100644 --- a/examples/finetune/README.md +++ b/examples/finetune/README.md @@ -80,9 +80,9 @@ The LORA rank can be configured for each model tensor type separately with these --rank-wk N LORA rank for wk tensor (default 4) --rank-wv N LORA rank for wv tensor (default 4) --rank-wo N LORA rank for wo tensor (default 4) - --rank-w1 N LORA rank for w1 tensor (default 4) - --rank-w2 N LORA rank for w2 tensor (default 4) - --rank-w3 N LORA rank for w3 tensor (default 4) + --rank-ffn_gate N LORA rank for ffn_gate tensor (default 4) + --rank-ffn_down N LORA rank for ffn_down tensor (default 4) + --rank-ffn_up N LORA rank for ffn_up tensor (default 4) ``` The LORA rank of 'norm' tensors should always be 1. diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index b11c56020..98bf5a07a 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -60,9 +60,9 @@ struct my_llama_layer { struct ggml_tensor * ffn_norm; // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; + struct ggml_tensor * ffn_gate; // w1 + struct ggml_tensor * ffn_down; // w2 + struct ggml_tensor * ffn_up; // w3 }; struct my_llama_model { @@ -85,9 +85,9 @@ struct my_llama_lora_hparams { uint32_t n_rank_wv = 4; uint32_t n_rank_wo = 4; uint32_t n_rank_ffn_norm = 1; - uint32_t n_rank_w1 = 4; - uint32_t n_rank_w2 = 4; - uint32_t n_rank_w3 = 4; + uint32_t n_rank_ffn_gate = 4; + uint32_t n_rank_ffn_down = 4; + uint32_t n_rank_ffn_up = 4; uint32_t n_rank_tok_embeddings = 4; uint32_t n_rank_norm = 1; uint32_t n_rank_output = 4; @@ -117,12 +117,12 @@ struct my_llama_lora_layer { struct ggml_tensor * ffn_norm_b; // ff - struct ggml_tensor * w1_a; - struct ggml_tensor * w1_b; - struct ggml_tensor * w2_a; - struct ggml_tensor * w2_b; - struct ggml_tensor * w3_a; - struct ggml_tensor * w3_b; + struct ggml_tensor * ffn_gate_a; + struct ggml_tensor * ffn_gate_b; + struct ggml_tensor * ffn_down_a; + struct ggml_tensor * ffn_down_b; + struct ggml_tensor * ffn_up_a; + struct ggml_tensor * ffn_up_b; }; struct my_llama_lora { @@ -208,9 +208,9 @@ static void print_lora_params(struct my_llama_lora_hparams * params) { printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv); printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo); printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm); - printf("%s: n_rank_w1 : %u\n", __func__, params->n_rank_w1); - printf("%s: n_rank_w2 : %u\n", __func__, params->n_rank_w2); - printf("%s: n_rank_w3 : %u\n", __func__, params->n_rank_w3); + printf("%s: n_rank_ffn_gate : %u\n", __func__, params->n_rank_ffn_gate); + printf("%s: n_rank_ffn_down : %u\n", __func__, params->n_rank_ffn_down); + printf("%s: n_rank_ffn_up : %u\n", __func__, params->n_rank_ffn_up); printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings); printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm); printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output); @@ -319,9 +319,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i)); layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i)); layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i)); - layer.w1 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i)); - layer.w2 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i)); - layer.w3 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i)); + layer.ffn_gate = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i)); + layer.ffn_down = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i)); + layer.ffn_up = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i)); assert_shape_1d(layer.attention_norm, hparams.n_embd); assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd); @@ -329,9 +329,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa()); assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd); assert_shape_1d(layer.ffn_norm, hparams.n_embd); - assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff); - assert_shape_2d(layer.w2, hparams.n_ff, hparams.n_embd); - assert_shape_2d(layer.w3, hparams.n_embd, hparams.n_ff); + assert_shape_2d(layer.ffn_gate, hparams.n_embd, hparams.n_ff); + assert_shape_2d(layer.ffn_down, hparams.n_ff, hparams.n_embd); + assert_shape_2d(layer.ffn_up, hparams.n_embd, hparams.n_ff); } } @@ -362,12 +362,12 @@ static void set_param_lora(struct my_llama_lora * lora) { ggml_set_param(ctx, layer.wo_b); ggml_set_param(ctx, layer.ffn_norm_a); ggml_set_param(ctx, layer.ffn_norm_b); - ggml_set_param(ctx, layer.w1_a); - ggml_set_param(ctx, layer.w1_b); - ggml_set_param(ctx, layer.w2_a); - ggml_set_param(ctx, layer.w2_b); - ggml_set_param(ctx, layer.w3_a); - ggml_set_param(ctx, layer.w3_b); + ggml_set_param(ctx, layer.ffn_gate_a); + ggml_set_param(ctx, layer.ffn_gate_b); + ggml_set_param(ctx, layer.ffn_down_a); + ggml_set_param(ctx, layer.ffn_down_b); + ggml_set_param(ctx, layer.ffn_up_a); + ggml_set_param(ctx, layer.ffn_up_b); } } @@ -435,12 +435,12 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd); layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1); - layer.w1_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_embd); - layer.w1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_ff); - layer.w2_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_ff); - layer.w2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_embd); - layer.w3_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_embd); - layer.w3_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_ff); + layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd); + layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff); + layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff); + layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd); + layer.ffn_up_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_embd); + layer.ffn_up_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_ff); ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i)); ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i)); @@ -454,12 +454,12 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i)); ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i)); ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i)); - ggml_set_name(layer.w1_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i)); - ggml_set_name(layer.w1_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i)); - ggml_set_name(layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i)); - ggml_set_name(layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i)); - ggml_set_name(layer.w3_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i)); - ggml_set_name(layer.w3_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i)); } set_param_lora(lora); @@ -497,12 +497,12 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl randomize_tensor_normal(layer.ffn_norm_a, rnd); ggml_set_zero(layer.ffn_norm_b); - randomize_tensor_normal(layer.w1_a, rnd); - ggml_set_zero(layer.w1_b); - randomize_tensor_normal(layer.w2_a, rnd); - ggml_set_zero(layer.w2_b); - randomize_tensor_normal(layer.w3_a, rnd); - ggml_set_zero(layer.w3_b); + randomize_tensor_normal(layer.ffn_gate_a, rnd); + ggml_set_zero(layer.ffn_gate_b); + randomize_tensor_normal(layer.ffn_down_a, rnd); + ggml_set_zero(layer.ffn_down_b); + randomize_tensor_normal(layer.ffn_up_a, rnd); + ggml_set_zero(layer.ffn_up_b); } free_random_normal_distribution(rnd); @@ -610,13 +610,13 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b)); struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b)); - struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b)); - struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b)); - struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b)); - struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b)); - struct ggml_tensor * w1 = add_to_f32(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b)); - struct ggml_tensor * w2 = add_to_f32(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b)); - struct ggml_tensor * w3 = add_to_f32(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b)); + struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b)); + struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b)); + struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b)); + struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b)); + struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b)); + struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b)); + struct ggml_tensor * ffn_up = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b)); struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); @@ -659,11 +659,11 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); - struct ggml_tensor * t25 = ggml_mul_mat (ctx, w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); - struct ggml_tensor * t26 = ggml_mul_mat (ctx, w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); + struct ggml_tensor * t25 = ggml_mul_mat (ctx, ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); + struct ggml_tensor * t26 = ggml_mul_mat (ctx, ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); - struct ggml_tensor * t29 = ggml_mul_mat (ctx, w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); + struct ggml_tensor * t29 = ggml_mul_mat (ctx, ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); cur = t30; if (enable_checkpointing) { @@ -723,9 +723,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f)); } // allocating checkpoints in one block to reduce memory fragmentation @@ -798,9 +798,9 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V); GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT); GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_w1, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_w2, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_w3, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP); init_lora(model, lora); @@ -825,12 +825,12 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b)); copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a)); copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b)); - copy_tensor_by_name(layer.w1_a, f_ggml_ctx, ggml_get_name(layer.w1_a)); - copy_tensor_by_name(layer.w1_b, f_ggml_ctx, ggml_get_name(layer.w1_b)); - copy_tensor_by_name(layer.w2_a, f_ggml_ctx, ggml_get_name(layer.w2_a)); - copy_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b)); - copy_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a)); - copy_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b)); + copy_tensor_by_name(layer.ffn_gate_a, f_ggml_ctx, ggml_get_name(layer.ffn_gate_a)); + copy_tensor_by_name(layer.ffn_gate_b, f_ggml_ctx, ggml_get_name(layer.ffn_gate_b)); + copy_tensor_by_name(layer.ffn_down_a, f_ggml_ctx, ggml_get_name(layer.ffn_down_a)); + copy_tensor_by_name(layer.ffn_down_b, f_ggml_ctx, ggml_get_name(layer.ffn_down_b)); + copy_tensor_by_name(layer.ffn_up_a, f_ggml_ctx, ggml_get_name(layer.ffn_up_a)); + copy_tensor_by_name(layer.ffn_up_b, f_ggml_ctx, ggml_get_name(layer.ffn_up_b)); } } @@ -868,9 +868,9 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_w1); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_w2); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_w3); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_ffn_gate); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_ffn_down); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_ffn_up); gguf_add_tensor(fctx, lora->tok_embeddings_a); gguf_add_tensor(fctx, lora->tok_embeddings_b); @@ -894,12 +894,12 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod gguf_add_tensor(fctx, layer.wo_b); gguf_add_tensor(fctx, layer.ffn_norm_a); gguf_add_tensor(fctx, layer.ffn_norm_b); - gguf_add_tensor(fctx, layer.w1_a); - gguf_add_tensor(fctx, layer.w1_b); - gguf_add_tensor(fctx, layer.w2_a); - gguf_add_tensor(fctx, layer.w2_b); - gguf_add_tensor(fctx, layer.w3_a); - gguf_add_tensor(fctx, layer.w3_b); + gguf_add_tensor(fctx, layer.ffn_gate_a); + gguf_add_tensor(fctx, layer.ffn_gate_b); + gguf_add_tensor(fctx, layer.ffn_down_a); + gguf_add_tensor(fctx, layer.ffn_down_b); + gguf_add_tensor(fctx, layer.ffn_up_a); + gguf_add_tensor(fctx, layer.ffn_up_b); } } @@ -1104,12 +1104,12 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB")); write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA")); write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB")); - write_tensor(&file, layer.w1_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA")); - write_tensor(&file, layer.w1_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB")); - write_tensor(&file, layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA")); - write_tensor(&file, layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB")); - write_tensor(&file, layer.w3_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA")); - write_tensor(&file, layer.w3_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB")); } } @@ -1139,9 +1139,9 @@ struct train_params { uint32_t n_rank_wv; uint32_t n_rank_wo; uint32_t n_rank_ffn_norm; - uint32_t n_rank_w1; - uint32_t n_rank_w2; - uint32_t n_rank_w3; + uint32_t n_rank_ffn_gate; + uint32_t n_rank_ffn_down; + uint32_t n_rank_ffn_up; uint32_t n_rank_tok_embeddings; uint32_t n_rank_norm; uint32_t n_rank_output; @@ -1152,9 +1152,9 @@ struct train_params { bool custom_n_rank_wv; bool custom_n_rank_wo; bool custom_n_rank_ffn_norm; - bool custom_n_rank_w1; - bool custom_n_rank_w2; - bool custom_n_rank_w3; + bool custom_n_rank_ffn_gate; + bool custom_n_rank_ffn_down; + bool custom_n_rank_ffn_up; bool custom_n_rank_tok_embeddings; bool custom_n_rank_norm; bool custom_n_rank_output; @@ -1186,9 +1186,9 @@ static struct train_params get_default_train_params() { params.n_rank_wv = 4; params.n_rank_wo = 4; params.n_rank_ffn_norm = 1; - params.n_rank_w1 = 4; - params.n_rank_w2 = 4; - params.n_rank_w3 = 4; + params.n_rank_ffn_gate = 4; + params.n_rank_ffn_down = 4; + params.n_rank_ffn_up = 4; params.n_rank_tok_embeddings = 4; params.n_rank_norm = 1; params.n_rank_output = 4; @@ -1199,9 +1199,9 @@ static struct train_params get_default_train_params() { params.custom_n_rank_wv = false; params.custom_n_rank_wo = false; params.custom_n_rank_ffn_norm = false; - params.custom_n_rank_w1 = false; - params.custom_n_rank_w2 = false; - params.custom_n_rank_w3 = false; + params.custom_n_rank_ffn_gate = false; + params.custom_n_rank_ffn_down = false; + params.custom_n_rank_ffn_up = false; params.custom_n_rank_tok_embeddings = false; params.custom_n_rank_norm = false; params.custom_n_rank_output = false; @@ -1232,9 +1232,9 @@ static void train_print_usage(int argc, char ** argv, const struct train_params fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n"); fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n"); fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-ffn_gate N LORA rank for ffn_gate tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-ffn_down N LORA rank for ffn_down tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-ffn_up N LORA rank for ffn_up tensor, overrides default rank.\n"); print_common_train_usage(argc, argv, ¶ms->common); } @@ -1369,27 +1369,27 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par } params->n_rank_wo = std::stoi(argv[i]); params->custom_n_rank_wo = true; - } else if (arg == "--rank-w1") { + } else if (arg == "--rank-ffn_gate") { if (++i >= argc) { invalid_param = true; break; } - params->n_rank_w1 = std::stoi(argv[i]); - params->custom_n_rank_w1 = true; - } else if (arg == "--rank-w2") { + params->n_rank_ffn_gate = std::stoi(argv[i]); + params->custom_n_rank_ffn_gate = true; + } else if (arg == "--rank-ffn_down") { if (++i >= argc) { invalid_param = true; break; } - params->n_rank_w2 = std::stoi(argv[i]); - params->custom_n_rank_w2 = true; - } else if (arg == "--rank-w3") { + params->n_rank_ffn_down = std::stoi(argv[i]); + params->custom_n_rank_ffn_down = true; + } else if (arg == "--rank-ffn_up") { if (++i >= argc) { invalid_param = true; break; } - params->n_rank_w3 = std::stoi(argv[i]); - params->custom_n_rank_w3 = true; + params->n_rank_ffn_up = std::stoi(argv[i]); + params->custom_n_rank_ffn_up = true; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); train_print_usage(argc, argv, &default_params); @@ -1452,12 +1452,12 @@ static int64_t get_parameter_count(struct my_llama_lora* lora) { nx += ggml_nelements(layer.wo_b); nx += ggml_nelements(layer.ffn_norm_a); nx += ggml_nelements(layer.ffn_norm_b); - nx += ggml_nelements(layer.w1_a); - nx += ggml_nelements(layer.w1_b); - nx += ggml_nelements(layer.w2_a); - nx += ggml_nelements(layer.w2_b); - nx += ggml_nelements(layer.w3_a); - nx += ggml_nelements(layer.w3_b); + nx += ggml_nelements(layer.ffn_gate_a); + nx += ggml_nelements(layer.ffn_gate_b); + nx += ggml_nelements(layer.ffn_down_a); + nx += ggml_nelements(layer.ffn_down_b); + nx += ggml_nelements(layer.ffn_up_a); + nx += ggml_nelements(layer.ffn_up_b); } return nx; } @@ -1511,9 +1511,9 @@ int main(int argc, char ** argv) { uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r; uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r; uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1; - uint32_t n_rank_w1 = params.custom_n_rank_w1 ? params.n_rank_w1 : params.lora_r; - uint32_t n_rank_w2 = params.custom_n_rank_w2 ? params.n_rank_w2 : params.lora_r; - uint32_t n_rank_w3 = params.custom_n_rank_w3 ? params.n_rank_w3 : params.lora_r; + uint32_t n_rank_ffn_gate = params.custom_n_rank_ffn_gate ? params.n_rank_ffn_gate : params.lora_r; + uint32_t n_rank_ffn_down = params.custom_n_rank_ffn_down ? params.n_rank_ffn_down : params.lora_r; + uint32_t n_rank_ffn_up = params.custom_n_rank_ffn_up ? params.n_rank_ffn_up : params.lora_r; uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r; uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1; uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r; @@ -1523,9 +1523,9 @@ int main(int argc, char ** argv) { lora.hparams.n_rank_wv = n_rank_wv; lora.hparams.n_rank_wo = n_rank_wo; lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm; - lora.hparams.n_rank_w1 = n_rank_w1; - lora.hparams.n_rank_w2 = n_rank_w2; - lora.hparams.n_rank_w3 = n_rank_w3; + lora.hparams.n_rank_ffn_gate = n_rank_ffn_gate; + lora.hparams.n_rank_ffn_down = n_rank_ffn_down; + lora.hparams.n_rank_ffn_up = n_rank_ffn_up; lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings; lora.hparams.n_rank_norm = n_rank_norm; lora.hparams.n_rank_output = n_rank_output; @@ -1566,9 +1566,9 @@ int main(int argc, char ** argv) { || (lora.hparams.n_rank_wv != n_rank_wv) || (lora.hparams.n_rank_wo != n_rank_wo) || (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm) - || (lora.hparams.n_rank_w1 != n_rank_w1) - || (lora.hparams.n_rank_w2 != n_rank_w2) - || (lora.hparams.n_rank_w3 != n_rank_w3) + || (lora.hparams.n_rank_ffn_gate != n_rank_ffn_gate) + || (lora.hparams.n_rank_ffn_down != n_rank_ffn_down) + || (lora.hparams.n_rank_ffn_up != n_rank_ffn_up) || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings) || (lora.hparams.n_rank_norm != n_rank_norm) || (lora.hparams.n_rank_output != n_rank_output) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 2e2a8ce08..bfdf124d7 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -50,9 +50,9 @@ struct my_llama_layer { struct ggml_tensor * ffn_norm; // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; + struct ggml_tensor * ffn_gate; // w1 + struct ggml_tensor * ffn_down; // w2 + struct ggml_tensor * ffn_up; // w3 }; struct my_llama_model { @@ -140,9 +140,9 @@ static void set_param_model(struct my_llama_model * model) { ggml_set_param(ctx, layer.wv); ggml_set_param(ctx, layer.wo); ggml_set_param(ctx, layer.ffn_norm); - ggml_set_param(ctx, layer.w1); - ggml_set_param(ctx, layer.w2); - ggml_set_param(ctx, layer.w3); + ggml_set_param(ctx, layer.ffn_gate); + ggml_set_param(ctx, layer.ffn_down); + ggml_set_param(ctx, layer.ffn_up); } } @@ -198,9 +198,9 @@ static void init_model(struct my_llama_model * model) { layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); - layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); - layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); + layer.ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); + layer.ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); + layer.ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i)); @@ -211,9 +211,9 @@ static void init_model(struct my_llama_model * model) { ggml_set_name(layer.ffn_norm, tni(LLM_TENSOR_FFN_NORM, i)); - ggml_set_name(layer.w1, tni(LLM_TENSOR_FFN_GATE, i)); - ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i)); - ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i)); + ggml_set_name(layer.ffn_gate, tni(LLM_TENSOR_FFN_GATE, i)); + ggml_set_name(layer.ffn_down, tni(LLM_TENSOR_FFN_DOWN, i)); + ggml_set_name(layer.ffn_up, tni(LLM_TENSOR_FFN_UP, i)); } set_param_model(model); @@ -244,9 +244,9 @@ static void randomize_model(struct my_llama_model * model, int seed, float mean, randomize_tensor_normal(layer.ffn_norm, rnd); - randomize_tensor_normal(layer.w1, rnd); - randomize_tensor_normal(layer.w2, rnd); - randomize_tensor_normal(layer.w3, rnd); + randomize_tensor_normal(layer.ffn_gate, rnd); + randomize_tensor_normal(layer.ffn_down, rnd); + randomize_tensor_normal(layer.ffn_up, rnd); } free_random_normal_distribution(rnd); @@ -356,11 +356,11 @@ static struct ggml_tensor * llama_build_train_graphs( struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, f_norm_rms_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); - struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); - struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); + struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); + struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); - struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); + struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); cur = t30; checkpoints.push_back(cur); @@ -521,9 +521,9 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i)); copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i)); copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i)); - copy_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); - copy_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); - copy_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); + copy_tensor_by_name(layer.ffn_gate, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); + copy_tensor_by_name(layer.ffn_down, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); + copy_tensor_by_name(layer.ffn_up, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); } } @@ -664,9 +664,9 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo gguf_add_tensor(fctx, layer.wv); gguf_add_tensor(fctx, layer.wo); gguf_add_tensor(fctx, layer.ffn_norm); - gguf_add_tensor(fctx, layer.w1); - gguf_add_tensor(fctx, layer.w2); - gguf_add_tensor(fctx, layer.w3); + gguf_add_tensor(fctx, layer.ffn_gate); + gguf_add_tensor(fctx, layer.ffn_down); + gguf_add_tensor(fctx, layer.ffn_up); } } @@ -915,9 +915,9 @@ static int64_t get_parameter_count(struct my_llama_model* model) { nx += ggml_nelements(layer.wv); nx += ggml_nelements(layer.wo); nx += ggml_nelements(layer.ffn_norm); - nx += ggml_nelements(layer.w1); - nx += ggml_nelements(layer.w2); - nx += ggml_nelements(layer.w3); + nx += ggml_nelements(layer.ffn_gate); + nx += ggml_nelements(layer.ffn_down); + nx += ggml_nelements(layer.ffn_up); } return nx; } From 037259be689353081e7bae3c1ab4ab18e7fbe8c9 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 13 Feb 2024 15:24:50 +0200 Subject: [PATCH 07/11] llama : make load error reporting more granular (#5477) Makes it easier to pinpoint where e.g. `unordered_map::at: key not found` comes from. --- llama.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 381a03068..61c695187 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4384,9 +4384,21 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam model.hparams.vocab_only = params.vocab_only; - llm_load_arch (ml, model); - llm_load_hparams(ml, model); - llm_load_vocab (ml, model); + try { + llm_load_arch(ml, model); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model architecture: " + std::string(e.what())); + } + try { + llm_load_hparams(ml, model); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); + } + try { + llm_load_vocab(ml, model); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model vocabulary: " + std::string(e.what())); + } llm_load_print_meta(ml, model); From c4e6dd59e45ef7b14f7763fb073b517395dc176c Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 13 Feb 2024 18:18:16 +0200 Subject: [PATCH 08/11] llama : allow raw byte in SPM vocabs; don't crash on nl 404 (#5478) * common : don't crash if newline token is not found * common : llama_byte_to_token: allow falling back to finding just the token byte in SPM vocabs --- llama.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 61c695187..8ebbf7628 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3314,7 +3314,12 @@ static void llm_load_vocab( // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { - vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); + try { + vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); + } catch (const std::exception & e) { + LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what()); + vocab.linefeed_id = vocab.special_pad_id; + } } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) { vocab.linefeed_id = vocab.special_pad_id; } else { @@ -7746,7 +7751,13 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { switch (llama_vocab_get_type(vocab)) { case LLAMA_VOCAB_TYPE_SPM: { const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; - return vocab.token_to_id.at(buf); + auto token = vocab.token_to_id.find(buf); + if (token != vocab.token_to_id.end()) { + return (*token).second; + } + // Try to fall back to just the byte as a string + const char buf2[2] = { (char)ch, 0 }; + return vocab.token_to_id.at(buf2); } case LLAMA_VOCAB_TYPE_WPM: case LLAMA_VOCAB_TYPE_BPE: { From ea9c8e11436ad50719987fa23a289c74b7b40d40 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Tue, 13 Feb 2024 12:03:53 -0500 Subject: [PATCH 09/11] llama : add support for Nomic Embed (#5468) --- convert-hf-to-gguf.py | 117 ++++++++++++------- gguf-py/gguf/constants.py | 56 +++++---- gguf-py/gguf/tensor_mapping.py | 12 +- llama.cpp | 201 ++++++++++++++++++++++++--------- 4 files changed, 273 insertions(+), 113 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5adfdc143..ae471481d 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -10,7 +10,7 @@ import re import sys from enum import IntEnum from pathlib import Path -from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast +from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Sequence, cast import numpy as np import torch @@ -25,15 +25,6 @@ import gguf from convert import HfVocab -# check for any of the given keys in the dictionary and return the value of the first key found -def get_key_opts(d, keys): - for k in keys: - if k in d: - return d[k] - print(f"Could not find any of {keys}") - sys.exit() - - ###### MODEL DEFINITIONS ###### class SentencePieceTokenTypes(IntEnum): @@ -58,6 +49,15 @@ class Model: self.hparams = Model.load_hparams(self.dir_model) self.model_arch = self._get_model_architecture() self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) + self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) + + def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any: + key = next((k for k in keys if k in self.hparams), None) + if key is not None: + return self.hparams[key] + if optional: + return None + raise KeyError(f"could not find any of: {keys}") def set_vocab(self): self._set_vocab_gpt2() @@ -79,28 +79,33 @@ class Model: def set_gguf_parameters(self): self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_block_count(self.hparams.get( - "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), - )) - if (n_ctx := self.hparams.get("max_position_embeddings")) is not None: + self.gguf_writer.add_block_count(self.block_count) + + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: self.gguf_writer.add_context_length(n_ctx) - if (n_embd := self.hparams.get("hidden_size")) is not None: - self.gguf_writer.add_embedding_length(n_embd) - if (n_ff := self.hparams.get("intermediate_size")) is not None: + + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + self.gguf_writer.add_embedding_length(n_embd) + + if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: self.gguf_writer.add_feed_forward_length(n_ff) - if (n_head := self.hparams.get("num_attention_heads")) is not None: - self.gguf_writer.add_head_count(n_head) + + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + self.gguf_writer.add_head_count(n_head) + if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) - if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None: - self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps) + if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: + self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon"], optional=True)) is not None: + self.gguf_writer.add_layer_norm_eps(f_norm_eps) if (n_experts := self.hparams.get("num_local_experts")) is not None: self.gguf_writer.add_expert_count(n_experts) if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: self.gguf_writer.add_expert_used_count(n_experts_used) - self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) + self.gguf_writer.add_file_type(self.ftype) def write_tensors(self): block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) @@ -211,6 +216,8 @@ class Model: return MiniCPMModel if model_architecture == "BertModel": return BertModel + if model_architecture == "NomicBertModel": + return NomicBertModel return Model def _is_model_safetensors(self) -> bool: @@ -268,6 +275,8 @@ class Model: return gguf.MODEL_ARCH.MINICPM if arch == "BertModel": return gguf.MODEL_ARCH.BERT + if arch == "NomicBertModel": + return gguf.MODEL_ARCH.NOMIC_BERT raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -1297,21 +1306,21 @@ class GPT2Model(Model): class Phi2Model(Model): def set_gguf_parameters(self): - block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"]) + block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) - rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"]) - n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"]) - n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"]) + rot_pct = self.find_hparam(["partial_rotary_factor"]) + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) self.gguf_writer.add_name("Phi2") - self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"])) + self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) self.gguf_writer.add_embedding_length(n_embd) self.gguf_writer.add_feed_forward_length(4 * n_embd) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"])) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_add_bos_token(False) @@ -1636,20 +1645,12 @@ in chat mode so that the conversation can end normally.") class BertModel(Model): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.block_count = self.hparams["num_hidden_layers"] + self.vocab_size = None def set_gguf_parameters(self): - # TODO(cebtenzzre): merge with parent class - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) + super().set_gguf_parameters() self.gguf_writer.add_causal_attention(False) self.gguf_writer.add_pooling_layer(True) - self.gguf_writer.add_file_type(self.ftype) def set_vocab(self): path = self.dir_model @@ -1659,6 +1660,7 @@ class BertModel(Model): vocab = HfVocab(path, added_tokens_path) tokens, scores, toktypes = zip(*vocab.all_tokens()) assert len(tokens) == vocab.vocab_size + self.vocab_size = vocab.vocab_size # we need this to validate the size of the token_type embeddings # though currently we are passing all zeros to the token_type embeddings @@ -1672,7 +1674,7 @@ class BertModel(Model): if tok.startswith(b"##"): return tok[2:] return b"\xe2\x96\x81" + tok - tokens = [phantom(t, y) for t, y in zip(tokens, toktypes)] + tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes)) # set up bos and eos tokens (cls and sep) self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id) @@ -1724,6 +1726,43 @@ class BertModel(Model): self.gguf_writer.add_tensor(new_name, data) +class NomicBertModel(BertModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # the HF config claims n_ctx=8192, but it uses RoPE scaling + self.hparams["n_ctx"] = 2048 + + # SwigLU activation + assert self.hparams["activation_function"] == "swiglu" + # this doesn't do anything in the HF version + assert self.hparams["causal"] is False + # no bias tensors + assert self.hparams["qkv_proj_bias"] is False + assert self.hparams["mlp_fc1_bias"] is False + assert self.hparams["mlp_fc2_bias"] is False + # norm at end of layer + assert self.hparams["prenorm"] is False + # standard RoPE + assert self.hparams["rotary_emb_fraction"] == 1.0 + assert self.hparams["rotary_emb_interleaved"] is False + assert self.hparams["rotary_emb_scale_base"] is None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) + + def get_tensors(self): + assert self.vocab_size is not None + for name, data in super().get_tensors(): + # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly. + if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size: + rounded_vocab_size = (self.vocab_size + 63) // 64 * 64 + assert data.shape == (rounded_vocab_size, self.hparams["n_embd"]) + data = data[:self.vocab_size, :] + yield name, data + + ###### CONVERSION LOGIC ###### diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 644e1589c..5fba01714 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -87,27 +87,28 @@ class Keys: class MODEL_ARCH(IntEnum): - LLAMA = auto() - FALCON = auto() - BAICHUAN = auto() - GPT2 = auto() - GPTJ = auto() - GPTNEOX = auto() - MPT = auto() - STARCODER = auto() - PERSIMMON = auto() - REFACT = auto() - BERT = auto() - BLOOM = auto() - STABLELM = auto() - QWEN = auto() - QWEN2 = auto() - PHI2 = auto() - PLAMO = auto() - CODESHELL = auto() - ORION = auto() + LLAMA = auto() + FALCON = auto() + BAICHUAN = auto() + GPT2 = auto() + GPTJ = auto() + GPTNEOX = auto() + MPT = auto() + STARCODER = auto() + PERSIMMON = auto() + REFACT = auto() + BERT = auto() + NOMIC_BERT = auto() + BLOOM = auto() + STABLELM = auto() + QWEN = auto() + QWEN2 = auto() + PHI2 = auto() + PLAMO = auto() + CODESHELL = auto() + ORION = auto() INTERNLM2 = auto() - MINICPM = auto() + MINICPM = auto() class MODEL_TENSOR(IntEnum): @@ -153,6 +154,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.PERSIMMON: "persimmon", MODEL_ARCH.REFACT: "refact", MODEL_ARCH.BERT: "bert", + MODEL_ARCH.NOMIC_BERT: "nomic-bert", MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.QWEN: "qwen", @@ -282,6 +284,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_UP, MODEL_TENSOR.LAYER_OUT_NORM, ], + MODEL_ARCH.NOMIC_BERT: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.TOKEN_TYPES, + MODEL_TENSOR.POS_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_OUT_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.LAYER_OUT_NORM, + ], MODEL_ARCH.MPT: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index c7ba1420e..861003776 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -15,7 +15,7 @@ class TensorNameMap: "word_embeddings", # bloom "model.embed_tokens", # llama-hf "tok_embeddings", # llama-pth - "embeddings.word_embeddings", # bert + "embeddings.word_embeddings", # bert nomic-bert "language_model.embedding.word_embeddings", # persimmon "wte", # gpt2 "transformer.embd.wte", # phi2 @@ -24,13 +24,14 @@ class TensorNameMap: # Token type embeddings MODEL_TENSOR.TOKEN_TYPES: ( - "embeddings.token_type_embeddings", # bert + "embeddings.token_type_embeddings", # bert nomic-bert ), # Normalization of token embeddings MODEL_TENSOR.TOKEN_EMBD_NORM: ( "word_embeddings_layernorm", # bloom "embeddings.LayerNorm", # bert + "emb_ln", # nomic-bert ), # Position embeddings @@ -103,6 +104,7 @@ class TensorNameMap: "model.layers.{bid}.self_attn.query_key_value", # persimmon "h.{bid}.attn.c_attn", # gpt2 "transformer.h.{bid}.mixer.Wqkv", # phi2 + "encoder.layers.{bid}.attn.Wqkv", # nomic-bert ), # Attention query @@ -152,11 +154,13 @@ class TensorNameMap: "transformer.h.{bid}.mixer.out_proj", # phi2 "model.layers.layers.{bid}.self_attn.o_proj", # plamo "model.layers.{bid}.attention.wo", # internlm2 + "encoder.layers.{bid}.attn.out_proj", # nomic-bert ), # Attention output norm MODEL_TENSOR.ATTN_OUT_NORM: ( "encoder.layer.{bid}.attention.output.LayerNorm", # bert + "encoder.layers.{bid}.norm1", # nomic-bert ), # Rotary embeddings @@ -205,6 +209,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.fc1", # phi2 "model.layers.layers.{bid}.mlp.up_proj", # plamo "model.layers.{bid}.feed_forward.w3", # internlm2 + "encoder.layers.{bid}.mlp.fc11", # nomic-bert ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -224,6 +229,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.w2", # qwen "model.layers.layers.{bid}.mlp.gate_proj", # plamo "model.layers.{bid}.feed_forward.w1", # internlm2 + "encoder.layers.{bid}.mlp.fc12", # nomic-bert ), MODEL_TENSOR.FFN_GATE_EXP: ( @@ -249,6 +255,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.fc2", # phi2 "model.layers.layers.{bid}.mlp.down_proj", # plamo "model.layers.{bid}.feed_forward.w2", # internlm2 + "encoder.layers.{bid}.mlp.fc2", # nomic-bert ), MODEL_TENSOR.FFN_DOWN_EXP: ( @@ -272,6 +279,7 @@ class TensorNameMap: MODEL_TENSOR.LAYER_OUT_NORM: ( "encoder.layer.{bid}.output.LayerNorm", # bert + "encoder.layers.{bid}.norm2", # nomic-bert ) } diff --git a/llama.cpp b/llama.cpp index 8ebbf7628..14e8821cd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -197,6 +197,7 @@ enum llm_arch { LLM_ARCH_PERSIMMON, LLM_ARCH_REFACT, LLM_ARCH_BERT, + LLM_ARCH_NOMIC_BERT, LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, LLM_ARCH_QWEN, @@ -211,27 +212,28 @@ enum llm_arch { }; static std::map LLM_ARCH_NAMES = { - { LLM_ARCH_LLAMA, "llama" }, - { LLM_ARCH_FALCON, "falcon" }, - { LLM_ARCH_GPT2, "gpt2" }, - { LLM_ARCH_GPTJ, "gptj" }, - { LLM_ARCH_GPTNEOX, "gptneox" }, - { LLM_ARCH_MPT, "mpt" }, - { LLM_ARCH_BAICHUAN, "baichuan" }, - { LLM_ARCH_STARCODER, "starcoder" }, - { LLM_ARCH_PERSIMMON, "persimmon" }, - { LLM_ARCH_REFACT, "refact" }, - { LLM_ARCH_BERT, "bert" }, - { LLM_ARCH_BLOOM, "bloom" }, - { LLM_ARCH_STABLELM, "stablelm" }, - { LLM_ARCH_QWEN, "qwen" }, - { LLM_ARCH_QWEN2, "qwen2" }, - { LLM_ARCH_PHI2, "phi2" }, - { LLM_ARCH_PLAMO, "plamo" }, - { LLM_ARCH_CODESHELL, "codeshell" }, - { LLM_ARCH_ORION, "orion" }, - { LLM_ARCH_INTERNLM2, "internlm2" }, - { LLM_ARCH_MINICPM, "minicpm" }, + { LLM_ARCH_LLAMA, "llama" }, + { LLM_ARCH_FALCON, "falcon" }, + { LLM_ARCH_GPT2, "gpt2" }, + { LLM_ARCH_GPTJ, "gptj" }, + { LLM_ARCH_GPTNEOX, "gptneox" }, + { LLM_ARCH_MPT, "mpt" }, + { LLM_ARCH_BAICHUAN, "baichuan" }, + { LLM_ARCH_STARCODER, "starcoder" }, + { LLM_ARCH_PERSIMMON, "persimmon" }, + { LLM_ARCH_REFACT, "refact" }, + { LLM_ARCH_BERT, "bert" }, + { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, + { LLM_ARCH_BLOOM, "bloom" }, + { LLM_ARCH_STABLELM, "stablelm" }, + { LLM_ARCH_QWEN, "qwen" }, + { LLM_ARCH_QWEN2, "qwen2" }, + { LLM_ARCH_PHI2, "phi2" }, + { LLM_ARCH_PLAMO, "plamo" }, + { LLM_ARCH_CODESHELL, "codeshell" }, + { LLM_ARCH_ORION, "orion" }, + { LLM_ARCH_INTERNLM2, "internlm2" }, + { LLM_ARCH_MINICPM, "minicpm" }, }; enum llm_kv { @@ -375,6 +377,7 @@ enum llm_tensor { LLM_TENSOR_ATTN_OUT, LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_NORM_2, + LLM_TENSOR_ATTN_OUT_NORM, LLM_TENSOR_ATTN_ROT_EMBD, LLM_TENSOR_FFN_GATE_INP, LLM_TENSOR_FFN_NORM, @@ -387,6 +390,7 @@ enum llm_tensor { LLM_TENSOR_FFN_UP_EXP, LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_LAYER_OUT_NORM, }; static std::map> LLM_TENSOR_NAMES = { @@ -552,12 +556,27 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, { LLM_TENSOR_TOKEN_TYPES, "token_types" }, { LLM_TENSOR_POS_EMBD, "position_embd" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_NOMIC_BERT, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_TOKEN_TYPES, "token_types" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, @@ -1485,6 +1504,7 @@ enum e_model { MODEL_22M, MODEL_33M, MODEL_109M, + MODEL_137M, MODEL_335M, MODEL_0_5B, MODEL_1B, @@ -1620,6 +1640,8 @@ struct llama_layer { struct ggml_tensor * attn_q_norm_b; struct ggml_tensor * attn_k_norm; struct ggml_tensor * attn_k_norm_b; + struct ggml_tensor * attn_out_norm; + struct ggml_tensor * attn_out_norm_b; // attention struct ggml_tensor * wq; @@ -1638,6 +1660,8 @@ struct llama_layer { // normalization struct ggml_tensor * ffn_norm; struct ggml_tensor * ffn_norm_b; + struct ggml_tensor * layer_out_norm; + struct ggml_tensor * layer_out_norm_b; // ff struct ggml_tensor * ffn_gate; // w1 @@ -2855,6 +2879,11 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { static const char * llama_model_type_name(e_model type) { switch (type) { + case MODEL_22M: return "22M"; + case MODEL_33M: return "33M"; + case MODEL_109M: return "109M"; + case MODEL_137M: return "137M"; + case MODEL_0_5B: return "0.5B"; case MODEL_1B: return "1B"; case MODEL_2B: return "2B"; case MODEL_3B: return "3B"; @@ -3073,6 +3102,17 @@ static void llm_load_hparams( model.type = e_model::MODEL_335M; break; // bge-large } } break; + case LLM_ARCH_NOMIC_BERT: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); + ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer); + + if (hparams.n_layer == 12 && hparams.n_embd == 768) { + model.type = e_model::MODEL_137M; + } + } break; case LLM_ARCH_BLOOM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -3875,10 +3915,14 @@ static bool llm_load_tensors( } } break; case LLM_ARCH_BERT: + case LLM_ARCH_NOMIC_BERT: { - model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); - model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); + if (model.arch == LLM_ARCH_BERT) { + model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); + } + model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); @@ -3888,29 +3932,38 @@ static bool llm_load_tensors( auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + if (model.arch == LLM_ARCH_BERT) { + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}); - layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); - layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); + } else { + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + } - layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); - layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); - layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); + layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); + layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}); - layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); - layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); - layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); + if (model.arch == LLM_ARCH_BERT) { + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); - layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); - layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + } else { + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + } + + layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}); + layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}); } } break; case LLM_ARCH_BLOOM: @@ -5773,6 +5826,7 @@ struct llm_build_context { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -5789,7 +5843,9 @@ struct llm_build_context { // token types are hardcoded to zero ("Sentence A") struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); inpL = ggml_add(ctx0, inpL, type_row0); - inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL); + if (model.arch == LLM_ARCH_BERT) { + inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL); + } cb(inpL, "inp_embd", -1); // embed layer norm @@ -5805,7 +5861,7 @@ struct llm_build_context { struct ggml_tensor * cur = inpL; // self-attention - { + if (model.arch == LLM_ARCH_BERT) { struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq); cb(Qcur, "Qcur", il); @@ -5818,6 +5874,37 @@ struct llm_build_context { // seems like we just need to do this for Q? Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } else { + // compute Q and K and RoPE them + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); @@ -5828,25 +5915,34 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); // attention layer norm - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il); + cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il); struct ggml_tensor * ffn_inp = cur; cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, - NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + if (model.arch == LLM_ARCH_BERT) { + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + } else { + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + } cb(cur, "ffn_out", il); // attentions bypass the intermediate layer cur = ggml_add(ctx0, cur, ffn_inp); // output layer norm - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il); + cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il); // input for next layer inpL = cur; @@ -7289,6 +7385,7 @@ static struct ggml_cgraph * llama_build_graph( result = llm.build_refact(); } break; case LLM_ARCH_BERT: + case LLM_ARCH_NOMIC_BERT: { result = llm.build_bert(); } break; From 6c00a066928b0475b865a2e3e709e2166e02d548 Mon Sep 17 00:00:00 2001 From: John <78893154+cmp-nct@users.noreply.github.com> Date: Tue, 13 Feb 2024 18:56:38 +0100 Subject: [PATCH 10/11] gguf : add python reader example (#5216) * Update CMakeLists.txt * Create reader.py * Update reader.py * Update reader.py another whitespace :| * Update reader.py * lintlintlint --- examples/CMakeLists.txt | 1 + gguf-py/examples/reader.py | 45 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 gguf-py/examples/reader.py diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 68ad89964..653abc73a 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -38,6 +38,7 @@ else() add_subdirectory(speculative) add_subdirectory(lookahead) add_subdirectory(lookup) + add_subdirectory(gguf) add_subdirectory(train-text-from-scratch) add_subdirectory(imatrix) if (LLAMA_BUILD_SERVER) diff --git a/gguf-py/examples/reader.py b/gguf-py/examples/reader.py new file mode 100644 index 000000000..62e0769da --- /dev/null +++ b/gguf-py/examples/reader.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +import sys +from pathlib import Path +from gguf.gguf_reader import GGUFReader + + +sys.path.insert(0, str(Path(__file__).parent.parent)) + + +def read_gguf_file(gguf_file_path): + """ + Reads and prints key-value pairs and tensor information from a GGUF file in an improved format. + + Parameters: + - gguf_file_path: Path to the GGUF file. + """ + + reader = GGUFReader(gguf_file_path) + + # List all key-value pairs in a columnized format + print("Key-Value Pairs:") + max_key_length = max(len(key) for key in reader.fields.keys()) + for key, field in reader.fields.items(): + value = field.parts[field.data[0]] + print(f"{key:{max_key_length}} : {value}") + print("----") + + # List all tensors + print("Tensors:") + tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}" + print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) + print("-" * 80) + for tensor in reader.tensors: + shape_str = "x".join(map(str, tensor.shape)) + size_str = str(tensor.n_elements) + quantization_str = tensor.tensor_type.name + print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) + + +if __name__ == '__main__': + if len(sys.argv) < 2: + print("Usage: reader.py ") + sys.exit(1) + gguf_file_path = sys.argv[1] + read_gguf_file(gguf_file_path) From f5ca054855dea83f424003162f26de376e5643f6 Mon Sep 17 00:00:00 2001 From: AT Date: Tue, 13 Feb 2024 15:44:25 -0600 Subject: [PATCH 11/11] Early return for zero size calls to get_tensor. (#5482) * Early return for zero size calls to get_tensor. Signed-off-by: Adam Treat * Update ggml-kompute.cpp Co-authored-by: Georgi Gerganov * Update ggml-kompute.cpp Co-authored-by: Georgi Gerganov * Add an early return to the get/set tensor when the size is null. Signed-off-by: Adam Treat * Early return after the assertions. Signed-off-by: Adam Treat * Since we do the early return in the generic backend now no reason to do so here as well. Signed-off-by: Adam Treat --------- Signed-off-by: Adam Treat Co-authored-by: Georgi Gerganov --- ggml-backend.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ggml-backend.c b/ggml-backend.c index 9ee81b766..87eea8440 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -219,6 +219,10 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * GGML_ASSERT(buf != NULL && "tensor buffer not set"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + if (!size) { + return; + } + tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size); } @@ -229,6 +233,10 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); + if (!size) { + return; + } + tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size); }