From 0c5d4d87b0badd1b9b6b2d8ac89a453c353a023c Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 10:38:46 +0800 Subject: [PATCH 01/20] add placeholder of starcoder in gguf / llama.cpp --- gguf-py/gguf/gguf.py | 32 ++++++++++++++++++-------------- llama.cpp | 16 +++++++++------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 7f7204ea1..97792943c 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -77,13 +77,14 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world" class MODEL_ARCH(IntEnum): - LLAMA : int = auto() - FALCON : int = auto() - BAICHUAN:int = auto() - GPT2 : int = auto() - GPTJ : int = auto() - GPTNEOX: int = auto() - MPT : int = auto() + LLAMA : int = auto() + FALCON : int = auto() + BAICHUAN : int = auto() + GPT2 : int = auto() + GPTJ : int = auto() + GPTNEOX : int = auto() + MPT : int = auto() + STARCODER : int = auto() class MODEL_TENSOR(IntEnum): @@ -107,13 +108,14 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { - MODEL_ARCH.LLAMA: "llama", - MODEL_ARCH.FALCON: "falcon", - MODEL_ARCH.BAICHUAN:"baichuan", - MODEL_ARCH.GPT2: "gpt2", - MODEL_ARCH.GPTJ: "gptj", - MODEL_ARCH.GPTNEOX: "gptneox", - MODEL_ARCH.MPT: "mpt", + MODEL_ARCH.LLAMA: "llama", + MODEL_ARCH.FALCON: "falcon", + MODEL_ARCH.BAICHUAN: "baichuan", + MODEL_ARCH.GPT2: "gpt2", + MODEL_ARCH.GPTJ: "gptj", + MODEL_ARCH.GPTNEOX: "gptneox", + MODEL_ARCH.MPT: "mpt", + MODEL_ARCH.STARCODER: "starcoder", } MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = { @@ -171,6 +173,8 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = { MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", }, + MODEL_ARCH.STARCODER: { + }, MODEL_ARCH.GPT2: { # TODO }, diff --git a/llama.cpp b/llama.cpp index 30728b7cb..2b535a881 100644 --- a/llama.cpp +++ b/llama.cpp @@ -160,17 +160,19 @@ enum llm_arch { LLM_ARCH_GPTJ, LLM_ARCH_GPTNEOX, LLM_ARCH_MPT, + LLM_ARCH_STARCODER, LLM_ARCH_UNKNOWN, }; static std::map LLM_ARCH_NAMES = { - { LLM_ARCH_LLAMA, "llama" }, - { LLM_ARCH_FALCON, "falcon" }, - { LLM_ARCH_GPT2, "gpt2" }, - { LLM_ARCH_GPTJ, "gptj" }, - { LLM_ARCH_GPTNEOX, "gptneox" }, - { LLM_ARCH_MPT, "mpt" }, - { LLM_ARCH_BAICHUAN,"baichuan" }, + { LLM_ARCH_LLAMA, "llama" }, + { LLM_ARCH_FALCON, "falcon" }, + { LLM_ARCH_GPT2, "gpt2" }, + { LLM_ARCH_GPTJ, "gptj" }, + { LLM_ARCH_GPTNEOX, "gptneox" }, + { LLM_ARCH_MPT, "mpt" }, + { LLM_ARCH_BAICHUAN, "baichuan" }, + { LLM_ARCH_STARCODER, "starcoder" }, }; enum llm_kv { From eb7f0eba3e95e5ce6211c5621695e98a803ce5ce Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 11:24:24 +0800 Subject: [PATCH 02/20] support convert starcoder weights to gguf --- convert-starcoder-hf-to-gguf.py | 252 ++++++++++++++++++++++++++++++++ gguf-py/gguf/gguf.py | 10 ++ 2 files changed, 262 insertions(+) create mode 100755 convert-starcoder-hf-to-gguf.py diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py new file mode 100755 index 000000000..4416b5d9e --- /dev/null +++ b/convert-starcoder-hf-to-gguf.py @@ -0,0 +1,252 @@ +#!/usr/bin/env python3 +# HF falcon--> gguf conversion + +from __future__ import annotations + +import argparse +import json +import os +import struct +import sys +from pathlib import Path +from typing import Any + +import numpy as np +import torch +from transformers import AutoTokenizer # type: ignore[import] + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + + +def bytes_to_unicode(): + # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + return dict(zip(bs, (chr(n) for n in cs))) + + +def count_model_parts(dir_model: Path) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.startswith("pytorch_model-"): + num_parts += 1 + + if num_parts > 0: + print("gguf: found " + str(num_parts) + " model parts") + return num_parts + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") + parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1) + return parser.parse_args() + +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) + sys.exit(1) + +# possible tensor data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 + +# map from ftype to string +ftype_str = ["f32", "f16"] + +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' + +print("gguf: loading model "+dir_model.name) + +with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +if hparams["architectures"][0] != "GPTBigCodeForCausalLM": + print("Model architecture not supported: " + hparams["architectures"][0]) + + sys.exit(1) + +# get number of model parts +num_parts = count_model_parts(dir_model) + +ARCH=gguf.MODEL_ARCH.STARCODER +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + +print("gguf: get model metadata") + +block_count = hparams["n_layer"] + +gguf_writer.add_name("StarCoder") +gguf_writer.add_context_length(2048) # not in config.json +gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform +gguf_writer.add_embedding_length(hparams["n_embd"]) +gguf_writer.add_feed_forward_length(4 * hparams["n_embd"]) +gguf_writer.add_block_count(block_count) +gguf_writer.add_head_count(hparams["n_head"]) +if "n_head_kv" in hparams: + gguf_writer.add_head_count_kv(hparams["n_head_kv"]) +else: + gguf_writer.add_head_count_kv(1) +gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) +gguf_writer.add_file_type(ftype) + +# TOKENIZATION + +print("gguf: get tokenizer metadata") + +tokens: list[bytearray] = [] +scores: list[float] = [] +toktypes: list[int] = [] + +tokenizer_json_file = dir_model / 'tokenizer.json' +if not tokenizer_json_file.is_file(): + print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr) + sys.exit(1) + +# gpt2 tokenizer +gguf_writer.add_tokenizer_model("gpt2") + +with open(tokenizer_json_file, "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + +print("gguf: get gpt2 tokenizer vocab") + +# The number of tokens in tokenizer.json can differ from the expected vocab size. +# This causes downstream issues with mismatched tensor sizes when running the inference +vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"]) + +# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py +tokenizer = AutoTokenizer.from_pretrained(dir_model) + +reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} +byte_encoder = bytes_to_unicode() +byte_decoder = {v: k for k, v in byte_encoder.items()} + +for i in range(vocab_size): + if i in reverse_vocab: + try: + text = bytearray([byte_decoder[c] for c in reverse_vocab[i]]) + except KeyError: + text = bytearray() + for c in reverse_vocab[i]: + if ord(c) < 256: # single byte character + text.append(byte_decoder[ord(c)]) + else: # multibyte special token character + text.extend(c.encode('utf-8')) + else: + print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.") + pad_token = f"[PAD{i}]".encode("utf8") + text = bytearray(pad_token) + + tokens.append(text) + scores.append(0.0) # dymmy + toktypes.append(gguf.TokenType.NORMAL) # dummy + +gguf_writer.add_token_list(tokens) +gguf_writer.add_token_scores(scores) +gguf_writer.add_token_types(toktypes) + +special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) +special_vocab.add_to_gguf(gguf_writer) + +# TENSORS + +tensor_map = gguf.get_tensor_name_map(ARCH,block_count) + +# params for qkv transform +n_head = hparams["n_head"] +n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1 + +head_dim = hparams["n_embd"] // n_head + +# tensor info +print("gguf: get tensor metadata") + +if num_parts == 0: + part_names = iter(("pytorch_model.bin",)) +else: + part_names = ( + f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) + ) + +for part_name in part_names: + if args.vocab_only: + break + print("gguf: loading model part '" + part_name + "'") + model_part = torch.load(dir_model / part_name, map_location="cpu") + + for name in model_part.keys(): + data = model_part[name] + + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + data = data.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: + print("Can not map tensor '" + name + "'") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(name, "=>", new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + + gguf_writer.add_tensor(new_name, data) + + +print("gguf: write header") +gguf_writer.write_header_to_file() +print("gguf: write metadata") +gguf_writer.write_kv_data_to_file() +if not args.vocab_only: + print("gguf: write tensors") + gguf_writer.write_tensors_to_file() + +gguf_writer.close() + +print(f"gguf: model successfully exported to '{fname_out}'") +print("") diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 97792943c..ba9bcf04d 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -174,6 +174,16 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = { MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", }, MODEL_ARCH.STARCODER: { + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.POS_EMBD: "position_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_up", }, MODEL_ARCH.GPT2: { # TODO From 76d32cca59fcf205f48f61cf5c2b467bb866d0e2 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 11:42:16 +0800 Subject: [PATCH 03/20] convert MQA to MHA --- convert-starcoder-hf-to-gguf.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py index 4416b5d9e..00e4f0d92 100755 --- a/convert-starcoder-hf-to-gguf.py +++ b/convert-starcoder-hf-to-gguf.py @@ -212,6 +212,24 @@ for part_name in part_names: data = data.squeeze().numpy() + if name.endswith(".attn.c_attn.weight") or name.endswith(".attn.c_attn.bias"): + print("Duplicate K,V heads to use MHA instead of MQA for", name) + + embed_dim = hparams["n_embd"] + head_dim = embed_dim // hparams["n_head"] + + # ((n_heads + 2) * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim) + q, k ,v = np.split(data, (hparams["n_head"] * head_dim, (hparams["n_head"] + 1) * head_dim), axis=0) + # duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim) + if len(k.shape) == 2: + k = np.tile(k, (hparams["n_head"], 1)) + v = np.tile(v, (hparams["n_head"], 1)) + elif len(k.shape) == 1: + k = np.tile(k, (hparams["n_head"])) + v = np.tile(v, (hparams["n_head"])) + # concat q, k, v along the first axis (n_heads * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim) + data = np.concatenate((q, k, v), axis=0) + # map tensor names new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) if new_name is None: From 7e0a843b6a99185f7e62133ac991085b0a1b8cde Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 11:45:18 +0800 Subject: [PATCH 04/20] fix ffn_down name --- gguf-py/gguf/gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index ba9bcf04d..514f32610 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -183,7 +183,7 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = { MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_up", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", }, MODEL_ARCH.GPT2: { # TODO From 7298c37e7e3bd0b139e7c724e1bfec3b6377e742 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 11:45:26 +0800 Subject: [PATCH 05/20] add LLM_ARCH_STARCODER to llama.cpp --- llama.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/llama.cpp b/llama.cpp index 2b535a881..64bfbc4dc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -378,6 +378,21 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, }, }, + { + LLM_ARCH_STARCODER, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_POS_EMBD, "position_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -897,6 +912,7 @@ static llama_state g_state; // available llama models enum e_model { MODEL_UNKNOWN, + MODEL_1B, MODEL_3B, MODEL_7B, MODEL_13B, @@ -1715,6 +1731,14 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_STARCODER: + { + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + switch (hparams.n_layer) { + case 24: model.type = e_model::MODEL_1B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; }; From 166a259f679ec7d03e78b16f0649e17c5052ecf1 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 12:12:27 +0800 Subject: [PATCH 06/20] set head_count_kv = 1 --- convert-starcoder-hf-to-gguf.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py index 00e4f0d92..5ce736244 100755 --- a/convert-starcoder-hf-to-gguf.py +++ b/convert-starcoder-hf-to-gguf.py @@ -104,15 +104,11 @@ block_count = hparams["n_layer"] gguf_writer.add_name("StarCoder") gguf_writer.add_context_length(2048) # not in config.json -gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform gguf_writer.add_embedding_length(hparams["n_embd"]) gguf_writer.add_feed_forward_length(4 * hparams["n_embd"]) gguf_writer.add_block_count(block_count) gguf_writer.add_head_count(hparams["n_head"]) -if "n_head_kv" in hparams: - gguf_writer.add_head_count_kv(hparams["n_head_kv"]) -else: - gguf_writer.add_head_count_kv(1) +gguf_writer.add_head_count_kv(1) gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) gguf_writer.add_file_type(ftype) @@ -251,7 +247,7 @@ for part_name in part_names: if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(name, "=>", new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) gguf_writer.add_tensor(new_name, data) From 57f064d7c23c641ca361beab4a768510befed358 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 12:12:33 +0800 Subject: [PATCH 07/20] load starcoder weight --- llama.cpp | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/llama.cpp b/llama.cpp index 64bfbc4dc..20e200881 100644 --- a/llama.cpp +++ b/llama.cpp @@ -937,6 +937,7 @@ struct llama_hparams { uint32_t n_layer = 32; uint32_t n_rot = 64; uint32_t n_ff = 11008; + uint32_t n_positions = -1; // StarCoder float f_norm_eps = 1e-5; float f_norm_rms_eps = 1e-5; @@ -1068,6 +1069,7 @@ struct llama_model { llama_vocab vocab; struct ggml_tensor * tok_embeddings; + struct ggml_tensor * pos_embeddings; struct ggml_tensor * output_norm; struct ggml_tensor * output_norm_b; @@ -2184,6 +2186,73 @@ static void llm_load_tensors( layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + } + } + } break; + case LLM_ARCH_STARCODER: + { + model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_positions}, GGML_BACKEND_CPU); + + // output + { + ggml_backend backend_norm; + ggml_backend backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#else + backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#endif // _WIN32 + + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + vram_weights += ggml_nbytes(model.output_norm_b); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + + layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + if (backend == GGML_BACKEND_GPU) { vram_weights += ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + From a17ef397923997950b42b92ea03d779fd6f30ddd Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 12:35:17 +0800 Subject: [PATCH 08/20] add max_position_embeddings --- gguf-py/gguf/gguf.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 514f32610..0a9200bf4 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -36,12 +36,13 @@ KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository" KEY_GENERAL_FILE_TYPE = "general.file_type" # LLM -KEY_CONTEXT_LENGTH = "{arch}.context_length" -KEY_EMBEDDING_LENGTH = "{arch}.embedding_length" -KEY_BLOCK_COUNT = "{arch}.block_count" -KEY_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" -KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" -KEY_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" +KEY_CONTEXT_LENGTH = "{arch}.context_length" +KEY_EMBEDDING_LENGTH = "{arch}.embedding_length" +KEY_BLOCK_COUNT = "{arch}.block_count" +KEY_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" +KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" +KEY_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" +KEY_MAX_POSITION_EMBEDDINGS = "{arch}.max_position_embeddings" # attention KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count" @@ -182,8 +183,8 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = { MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", }, MODEL_ARCH.GPT2: { # TODO @@ -717,6 +718,10 @@ class GGUFWriter: self.add_uint32( KEY_EMBEDDING_LENGTH.format(arch=self.arch), length) + def add_max_position_embeddings(self, length: int): + self.add_uint32( + KEY_MAX_POSITION_EMBEDDINGS.format(arch=self.arch), length) + def add_block_count(self, length: int): self.add_uint32( KEY_BLOCK_COUNT.format(arch=self.arch), length) From 26836119449d2405a3cdfca9fff2a98f5fa6648a Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 12:35:29 +0800 Subject: [PATCH 09/20] set n_positions to max_positioin_embeddings --- convert-starcoder-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py index 5ce736244..6cc384058 100755 --- a/convert-starcoder-hf-to-gguf.py +++ b/convert-starcoder-hf-to-gguf.py @@ -105,10 +105,10 @@ block_count = hparams["n_layer"] gguf_writer.add_name("StarCoder") gguf_writer.add_context_length(2048) # not in config.json gguf_writer.add_embedding_length(hparams["n_embd"]) +gguf_writer.add_max_position_embeddings(hparams["n_positions"]) gguf_writer.add_feed_forward_length(4 * hparams["n_embd"]) gguf_writer.add_block_count(block_count) gguf_writer.add_head_count(hparams["n_head"]) -gguf_writer.add_head_count_kv(1) gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) gguf_writer.add_file_type(ftype) From 77c7ec179c2b2e45710c83e2a264305ebb694813 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 12:36:11 +0800 Subject: [PATCH 10/20] properly load all starcoder params --- llama.cpp | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 20e200881..db74b6db9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -193,6 +193,7 @@ enum llm_kv { LLM_KV_FEED_FORWARD_LENGTH, LLM_KV_USE_PARALLEL_RESIDUAL, LLM_KV_TENSOR_DATA_LAYOUT, + LLM_KV_MAX_POSITION_EMBEDDINGS, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -237,6 +238,7 @@ static std::map LLM_KV_NAMES = { { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" }, { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" }, { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" }, + { LLM_KV_MAX_POSITION_EMBEDDINGS, "%s.max_position_embeddings" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -937,7 +939,7 @@ struct llama_hparams { uint32_t n_layer = 32; uint32_t n_rot = 64; uint32_t n_ff = 11008; - uint32_t n_positions = -1; // StarCoder + uint32_t n_positions = 0; // StarCoder float f_norm_eps = 1e-5; float f_norm_rms_eps = 1e-5; @@ -985,13 +987,22 @@ struct llama_layer { struct ggml_tensor * wo; struct ggml_tensor * wqkv; + // attention bias + struct ggml_tensor * bo; + struct ggml_tensor * bqkv; + // normalization struct ggml_tensor * ffn_norm; + struct ggml_tensor * ffn_norm_b; // ff struct ggml_tensor * w1; // ffn_gate struct ggml_tensor * w2; // ffn_down struct ggml_tensor * w3; // ffn_up + + // ff bias + struct ggml_tensor * b2; // ffn_down + struct ggml_tensor * b3; // ffn_up }; struct llama_kv_cache { @@ -1654,6 +1665,7 @@ static void llm_load_hparams( GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); + GGUF_GET_KEY(ctx, hparams.n_positions, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_MAX_POSITION_EMBEDDINGS)); // n_head_kv is optional, default to n_head hparams.n_head_kv = hparams.n_head; @@ -2247,11 +2259,20 @@ static void llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {3*n_embd}, backend_split); + + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + + layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += From 0be15e162c70da52b926ecb1a995a4aa5334b778 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 12:56:20 +0800 Subject: [PATCH 11/20] fix head count kv --- convert-starcoder-hf-to-gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py index 6cc384058..9f6e38d40 100755 --- a/convert-starcoder-hf-to-gguf.py +++ b/convert-starcoder-hf-to-gguf.py @@ -109,6 +109,7 @@ gguf_writer.add_max_position_embeddings(hparams["n_positions"]) gguf_writer.add_feed_forward_length(4 * hparams["n_embd"]) gguf_writer.add_block_count(block_count) gguf_writer.add_head_count(hparams["n_head"]) +gguf_writer.add_head_count_kv(1) gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) gguf_writer.add_file_type(ftype) From dac31da489f4795624bee93fed505046532b87cd Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 12:57:38 +0800 Subject: [PATCH 12/20] fix comments --- convert-starcoder-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py index 9f6e38d40..1a48513ef 100755 --- a/convert-starcoder-hf-to-gguf.py +++ b/convert-starcoder-hf-to-gguf.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# HF falcon--> gguf conversion +# HF starcoder --> gguf conversion from __future__ import annotations From 4420cff6547d44791e03a04a0606991d006e6ebc Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 13:52:43 +0800 Subject: [PATCH 13/20] fix vram calculation for starcoder --- llama.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index db74b6db9..3aa247d27 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2277,8 +2277,11 @@ static void llm_load_tensors( if (backend == GGML_BACKEND_GPU) { vram_weights += ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + - ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + - ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + + ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) + + ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3); } } } break; From ab13d071e10caab04516e0101bfbc6cc0befa912 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 14:18:36 +0800 Subject: [PATCH 14/20] store mqa directly --- convert-starcoder-hf-to-gguf.py | 18 ------------------ llama.cpp | 4 ++-- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py index 1a48513ef..fcdf86b3f 100755 --- a/convert-starcoder-hf-to-gguf.py +++ b/convert-starcoder-hf-to-gguf.py @@ -209,24 +209,6 @@ for part_name in part_names: data = data.squeeze().numpy() - if name.endswith(".attn.c_attn.weight") or name.endswith(".attn.c_attn.bias"): - print("Duplicate K,V heads to use MHA instead of MQA for", name) - - embed_dim = hparams["n_embd"] - head_dim = embed_dim // hparams["n_head"] - - # ((n_heads + 2) * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim) - q, k ,v = np.split(data, (hparams["n_head"] * head_dim, (hparams["n_head"] + 1) * head_dim), axis=0) - # duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim) - if len(k.shape) == 2: - k = np.tile(k, (hparams["n_head"], 1)) - v = np.tile(v, (hparams["n_head"], 1)) - elif len(k.shape) == 1: - k = np.tile(k, (hparams["n_head"])) - v = np.tile(v, (hparams["n_head"])) - # concat q, k, v along the first axis (n_heads * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim) - data = np.concatenate((q, k, v), axis=0) - # map tensor names new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) if new_name is None: diff --git a/llama.cpp b/llama.cpp index 3aa247d27..acadc3dbd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2259,8 +2259,8 @@ static void llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {3*n_embd}, backend_split); + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); From 8bc76a225d5830bd5ab55e3355124f4402b03e07 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 14:47:04 +0800 Subject: [PATCH 15/20] add input embeddings handling --- llama.cpp | 329 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 329 insertions(+) diff --git a/llama.cpp b/llama.cpp index acadc3dbd..3ca973677 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3424,6 +3424,331 @@ static struct ggml_cgraph * llm_build_falcon( return gf; } +static struct ggml_cgraph * llm_build_starcoder( + llama_context & lctx, + const llama_token * tokens, + const float * embd, + int n_tokens, + int n_past) { + + GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT + + const int N = n_tokens; + + const auto & model = lctx.model; + const auto & hparams = model.hparams; + + const auto & kv_self = lctx.kv_self; + + GGML_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = hparams.n_ctx; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + GGML_ASSERT(n_embd_head == hparams.n_rot); + + const float freq_base = hparams.rope_freq_base; + const float freq_scale = hparams.rope_freq_scale; + const float norm_eps = hparams.f_norm_eps; + + const int n_gpu_layers = model.n_gpu_layers; + + auto & buf_compute = lctx.buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ false, + }; + + params.no_alloc = true; + + struct ggml_context * ctx0 = ggml_init(params); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * token; + struct ggml_tensor * position; + struct ggml_tensor * inpL; + + if (tokens) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + + ggml_allocr_alloc(lctx.alloc, inp_tokens); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + } + ggml_set_name(inp_tokens, "inp_tokens"); + + token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + + token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + + ggml_allocr_alloc(lctx.alloc, token); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(token->data, embd, N * n_embd * ggml_element_size(inpL)); + } + } + + { + // Compute position embeddings. + struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(lctx.alloc, inp_positions); + if (!ggml_allocr_is_measure(lctx.alloc)) { + for (int i = 0; i < N; ++i) { + ((int32_t *) inp_positions->data)[i] = n_past + i; + } + } + ggml_set_name(inp_positions, "inp_positions"); + + position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions); + } + + inpL = ggml_add(ctx0, token, position); + + const int i_gpu_start = n_layer - n_gpu_layers; + (void) i_gpu_start; + + // offload functions set the tensor output backend to GPU + // tensors are GPU-accelerated if any input or the output has been offloaded + // + // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal + // in that case ggml_cuda_assign_buffers has no effect + offload_func_t offload_func_nr = llama_nop; // nr = non-repeating + offload_func_t offload_func_kq = llama_nop; + offload_func_t offload_func_v = llama_nop; + +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > n_layer) { + offload_func_nr = ggml_cuda_assign_buffers_no_alloc; + } + if (n_gpu_layers > n_layer + 1) { + offload_func_v = ggml_cuda_assign_buffers_no_alloc; + } + if (n_gpu_layers > n_layer + 2) { + offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + } +#endif // GGML_USE_CUBLAS + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(lctx.alloc, KQ_scale); + if (!ggml_allocr_is_measure(lctx.alloc)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * attn_norm; + + offload_func_t offload_func = llama_nop; + +#ifdef GGML_USE_CUBLAS + if (il >= i_gpu_start) { + offload_func = ggml_cuda_assign_buffers_no_alloc; + } +#endif // GGML_USE_CUBLAS + + // self-attention + // TODO: refactor into common function (shared with LLaMA) + { + attn_norm = ggml_norm(ctx0, inpL, norm_eps); + offload_func(attn_norm); + + attn_norm = ggml_add(ctx0, + ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm), + model.layers[il].attn_norm_b); + offload_func(attn_norm->src[0]); + offload_func(attn_norm); + + if (model.layers[il].attn_norm_2) { // Falcon-40B + cur = ggml_norm(ctx0, inpL, norm_eps); + offload_func(cur); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, cur, model.layers[il].attn_norm_2), + model.layers[il].attn_norm_2_b); + offload_func(cur->src[0]); + offload_func(cur); + } else { // Falcon 7B + cur = attn_norm; + } + + // compute QKV + + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + offload_func_kq(cur); + + // Note that the strides for Kcur, Vcur are set up so that the + // resulting views are misaligned with the tensor's storage + // (by applying the K/V offset we shift the tensor's original + // view to stick out behind the viewed QKV tensor's allocated + // memory, so to say). This is ok because no actual accesses + // happen to that out-of-range memory, but it can require some + // trickery when trying to accurately dump these views for + // debugging. + + const size_t wsize = ggml_type_size(cur->type); + + // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for + // non-contiguous views is added for the rope operator + struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d( + ctx0, cur, n_embd_head, n_head, N, + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), + 0)); + offload_func_kq(tmpq); + + struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d( + ctx0, cur, n_embd_head, n_head_kv, N, + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), + wsize * n_embd_head * n_head)); + offload_func_kq(tmpk); + + struct ggml_tensor * tmpv = ggml_view_3d( + ctx0, cur, n_embd_head, n_head_kv, N, + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), + wsize * n_embd_head * (n_head + n_head_kv)); + offload_func_v(tmpv); + + // using mode = 2 for neox mode + struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale); + offload_func_kq(Qcur); + struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale); + offload_func_kq(Kcur); + + { + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N)); + offload_func_v(Vcur); + offload_func_v(Vcur->src[0]->src[0]); + ggml_set_name(Vcur, "Vcur"); + + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); + offload_func_kq(k); + ggml_set_name(k, "k"); + + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); + offload_func_v(v); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + offload_func_kq(Q); + ggml_set_name(Q, "Q"); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_past + N, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + offload_func_kq(K); + ggml_set_name(K, "K"); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + ggml_set_name(KQ, "KQ"); + + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + offload_func_kq(KQ_scaled); + ggml_set_name(KQ_scaled, "KQ_scaled"); + + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + offload_func_kq(KQ_masked); + ggml_set_name(KQ_masked, "KQ_masked"); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + ggml_set_name(KQ_soft_max, "KQ_soft_max"); + + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, + n_past + N, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + offload_func_v(V); + ggml_set_name(V, "V"); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + ggml_set_name(KQV, "KQV"); + + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); + ggml_set_name(KQV_merged, "KQV_merged"); + + cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + offload_func_v(cur); + ggml_set_name(cur, "KQV_merged_contiguous"); + + cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + offload_func(cur); + ggml_set_name(cur, "result_wo"); + } + + struct ggml_tensor * attn_out = cur; + + // feed forward + { + struct ggml_tensor * inpFF = attn_norm; + + cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); + offload_func(cur); + + cur = ggml_gelu(ctx0, cur); + offload_func(cur); + cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); + offload_func(cur); + } + + cur = ggml_add(ctx0, cur, attn_out); + offload_func(cur); + cur = ggml_add(ctx0, cur, inpL); + offload_func(cur); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + // norm + { + cur = ggml_norm(ctx0, cur, norm_eps); + offload_func_nr(cur); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, cur, model.output_norm), + model.output_norm_b); + ggml_set_name(cur, "result_norm"); + } + + cur = ggml_mul_mat(ctx0, model.output, cur); + ggml_set_name(cur, "result_output"); + + ggml_build_forward_expand(gf, cur); + + ggml_free(ctx0); + + return gf; +} + static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_token * tokens, @@ -3447,6 +3772,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past); } break; + case LLM_ARCH_STARCODER: + { + result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past); + } break; default: GGML_ASSERT(false); }; From 101c57871534f2bb6a425c9281a59bc46cb3d6fc Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 15:23:50 +0800 Subject: [PATCH 16/20] add TBD --- llama.cpp | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/llama.cpp b/llama.cpp index 3ca973677..3f2e2f854 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3570,24 +3570,16 @@ static struct ggml_cgraph * llm_build_starcoder( offload_func(attn_norm->src[0]); offload_func(attn_norm); - if (model.layers[il].attn_norm_2) { // Falcon-40B - cur = ggml_norm(ctx0, inpL, norm_eps); - offload_func(cur); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.layers[il].attn_norm_2), - model.layers[il].attn_norm_2_b); - offload_func(cur->src[0]); - offload_func(cur); - } else { // Falcon 7B - cur = attn_norm; - } + cur = attn_norm; // compute QKV - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); offload_func_kq(cur); + // ===== TBD (QKV Split + FF) ==== +#define PRINT_SHAPE(x) fprintf(stderr, "%d %s: (%s)\n", __LINE__, #x, llama_format_tensor_shape(x).c_str()) + GGML_ASSERT(false); + // Note that the strides for Kcur, Vcur are set up so that the // resulting views are misaligned with the tensor's storage // (by applying the K/V offset we shift the tensor's original From a1cf66ea94509a017abf08b4418f66502f94e958 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 16:56:50 +0800 Subject: [PATCH 17/20] working in cpu, metal buggy --- convert-starcoder-hf-to-gguf.py | 20 ++- llama.cpp | 294 ++++++++++++++++---------------- 2 files changed, 163 insertions(+), 151 deletions(-) diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py index fcdf86b3f..34de69c6b 100755 --- a/convert-starcoder-hf-to-gguf.py +++ b/convert-starcoder-hf-to-gguf.py @@ -109,7 +109,7 @@ gguf_writer.add_max_position_embeddings(hparams["n_positions"]) gguf_writer.add_feed_forward_length(4 * hparams["n_embd"]) gguf_writer.add_block_count(block_count) gguf_writer.add_head_count(hparams["n_head"]) -gguf_writer.add_head_count_kv(1) +gguf_writer.add_head_count_kv(hparams["n_head"]) gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) gguf_writer.add_file_type(ftype) @@ -209,6 +209,24 @@ for part_name in part_names: data = data.squeeze().numpy() + if name.endswith(".attn.c_attn.weight") or name.endswith(".attn.c_attn.bias"): + print("Duplicate K,V heads to use MHA instead of MQA for", name) + + embed_dim = hparams["n_embd"] + head_dim = embed_dim // hparams["n_head"] + + # ((n_heads + 2) * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim) + q, k ,v = np.split(data, (hparams["n_head"] * head_dim, (hparams["n_head"] + 1) * head_dim), axis=0) + # duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim) + if len(k.shape) == 2: + k = np.tile(k, (hparams["n_head"], 1)) + v = np.tile(v, (hparams["n_head"], 1)) + elif len(k.shape) == 1: + k = np.tile(k, (hparams["n_head"])) + v = np.tile(v, (hparams["n_head"])) + # concat q, k, v along the first axis (n_heads * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim) + data = np.concatenate((q, k, v), axis=0) + # map tensor names new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) if new_name is None: diff --git a/llama.cpp b/llama.cpp index 3f2e2f854..f8c2b3d74 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1221,6 +1221,7 @@ static bool llama_kv_cache_init( return false; } + fprintf(stderr, "n_embed: %d n_layer: %d n_ctx: %d n_elements: %d\n", n_embd, n_layer, n_ctx, n_elements); cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); ggml_set_name(cache.k, "cache_k"); @@ -2259,8 +2260,8 @@ static void llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {3*n_embd}, backend_split); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); @@ -3540,16 +3541,8 @@ static struct ggml_cgraph * llm_build_starcoder( } #endif // GGML_USE_CUBLAS - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - +#define PRINT_SHAPE(x) fprintf(stderr, "%d %s: (%s)\n", __LINE__, #x, llama_format_tensor_shape(x).c_str()) for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * attn_norm; - offload_func_t offload_func = llama_nop; #ifdef GGML_USE_CUBLAS @@ -3558,186 +3551,187 @@ static struct ggml_cgraph * llm_build_starcoder( } #endif // GGML_USE_CUBLAS - // self-attention - // TODO: refactor into common function (shared with LLaMA) { - attn_norm = ggml_norm(ctx0, inpL, norm_eps); - offload_func(attn_norm); + // Norm + cur = ggml_norm(ctx0, inpL, norm_eps); - attn_norm = ggml_add(ctx0, - ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm), - model.layers[il].attn_norm_b); - offload_func(attn_norm->src[0]); - offload_func(attn_norm); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); - cur = attn_norm; + } - // compute QKV + { + // Compute QKV cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + } - // ===== TBD (QKV Split + FF) ==== -#define PRINT_SHAPE(x) fprintf(stderr, "%d %s: (%s)\n", __LINE__, #x, llama_format_tensor_shape(x).c_str()) - GGML_ASSERT(false); + { + // Self Attention + struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - // Note that the strides for Kcur, Vcur are set up so that the - // resulting views are misaligned with the tensor's storage - // (by applying the K/V offset we shift the tensor's original - // view to stick out behind the viewed QKV tensor's allocated - // memory, so to say). This is ok because no actual accesses - // happen to that out-of-range memory, but it can require some - // trickery when trying to accurately dump these views for - // debugging. - - const size_t wsize = ggml_type_size(cur->type); - - // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for - // non-contiguous views is added for the rope operator - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head, N, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - 0)); - offload_func_kq(tmpq); - - struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, N, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * n_head)); - offload_func_kq(tmpk); - - struct ggml_tensor * tmpv = ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, N, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * (n_head + n_head_kv)); - offload_func_v(tmpv); - - // using mode = 2 for neox mode - struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(Kcur); - - { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N)); - offload_func_v(Vcur); - offload_func_v(Vcur->src[0]->src[0]); - ggml_set_name(Vcur, "Vcur"); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); - offload_func_kq(k); - ggml_set_name(k, "k"); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); - offload_func_v(v); + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + 0, 2, 1, 3); + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_past + N, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); //TODO: need to be tiled - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); //TODO: check if it broadcasts + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) + ); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_past + N, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + struct ggml_tensor * V_trans = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd), + n_embd/n_head, n_head, n_past + N), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); - cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); - - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); } - struct ggml_tensor * attn_out = cur; - - // feed forward + // Projection { - struct ggml_tensor * inpFF = attn_norm; - - cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); - offload_func(cur); - - cur = ggml_gelu(ctx0, cur); - offload_func(cur); - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); + cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + cur = ggml_add(ctx0, cur, model.layers[il].bo); } - cur = ggml_add(ctx0, cur, attn_out); - offload_func(cur); + // add the input cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); - // input for next layer - inpL = cur; + struct ggml_tensor * inpFF = cur; + + // FF + { + // norm + { + cur = ggml_norm(ctx0, inpFF, norm_eps); + + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].w3, + cur); + + cur = ggml_add(ctx0, cur, model.layers[il].b3); + + // GELU activation + // [3072, N] + cur = ggml_gelu(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].w2, + cur); + + cur = ggml_add(ctx0, cur, model.layers[il].b2); + } + + inpL = ggml_add(ctx0, cur, inpFF); } - cur = inpL; + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL, norm_eps); - // norm - { - cur = ggml_norm(ctx0, cur, norm_eps); - offload_func_nr(cur); + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.output_norm), model.output_norm_b); + } + ggml_set_name(inpL, "result_norm"); - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.output_norm), - model.output_norm_b); - ggml_set_name(cur, "result_norm"); - } - - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = ggml_mul_mat(ctx0, model.output, inpL); ggml_set_name(cur, "result_output"); ggml_build_forward_expand(gf, cur); - ggml_free(ctx0); + // norm return gf; } From 6c353dc7c2c1d30467f342d100c42df3b30b8404 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 15 Sep 2023 19:00:14 +0800 Subject: [PATCH 18/20] cleanup useless code --- llama.cpp | 114 +++++------------------------------------------------- 1 file changed, 10 insertions(+), 104 deletions(-) diff --git a/llama.cpp b/llama.cpp index f8c2b3d74..7776211bd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1221,7 +1221,6 @@ static bool llama_kv_cache_init( return false; } - fprintf(stderr, "n_embed: %d n_layer: %d n_ctx: %d n_elements: %d\n", n_embd, n_layer, n_ctx, n_elements); cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); ggml_set_name(cache.k, "cache_k"); @@ -3447,18 +3446,12 @@ static struct ggml_cgraph * llm_build_starcoder( const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = hparams.n_ctx; const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); - const float freq_base = hparams.rope_freq_base; - const float freq_scale = hparams.rope_freq_scale; const float norm_eps = hparams.f_norm_eps; - const int n_gpu_layers = model.n_gpu_layers; - auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { @@ -3517,56 +3510,18 @@ static struct ggml_cgraph * llm_build_starcoder( inpL = ggml_add(ctx0, token, position); - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - // - // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal - // in that case ggml_cuda_assign_buffers has no effect - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - -#define PRINT_SHAPE(x) fprintf(stderr, "%d %s: (%s)\n", __LINE__, #x, llama_format_tensor_shape(x).c_str()) for (int il = 0; il < n_layer; ++il) { - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - { // Norm cur = ggml_norm(ctx0, inpL, norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); } - { - // Compute QKV - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - } - { // Self Attention + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv); + struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); @@ -3580,8 +3535,6 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - // [64, N, 12] struct ggml_tensor * Q = ggml_permute(ctx0, ggml_cpy(ctx0, @@ -3589,8 +3542,6 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), 0, 2, 1, 3); - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - // [64, n_past + N, 12] struct ggml_tensor * K = ggml_permute(ctx0, ggml_reshape_3d(ctx0, @@ -3598,21 +3549,9 @@ static struct ggml_cgraph * llm_build_starcoder( n_embd/n_head, n_head, n_past + N), 0, 2, 1, 3); //TODO: need to be tiled - // GG: flash attention - //struct ggml_tensor * V = - // ggml_cpy(ctx0, - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 1, 2, 0, 3), - // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - - //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); - // K * Q // [n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); //TODO: check if it broadcasts + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); // KQ_scaled = KQ / sqrt(n_embd/n_head) // [n_past + N, N, 12] @@ -3649,18 +3588,13 @@ static struct ggml_cgraph * llm_build_starcoder( // [64, 12, N] struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - // cur = KQV_merged.contiguous().view(n_embd, N) - // [768, N] cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); } // Projection - { - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cur = ggml_add(ctx0, cur, model.layers[il].bo); - } + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); // add the input cur = ggml_add(ctx0, cur, inpL); @@ -3678,37 +3612,13 @@ static struct ggml_cgraph * llm_build_starcoder( cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); } - // fully connected - // [3072, 768] - model.layers[il].c_mlp_fc_w - // [3072, 1] - model.layers[il].c_mlp_fc_b - // [ 768, N] - cur (in) - // [3072, N] - cur (out) - // - // cur = fc_w*cur + fc_b - // [3072, N] - cur = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - - cur = ggml_add(ctx0, cur, model.layers[il].b3); + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); // GELU activation - // [3072, N] cur = ggml_gelu(ctx0, cur); // projection - // [ 768, 3072] - model.layers[il].c_mlp_proj_w - // [ 768, 1] - model.layers[il].c_mlp_proj_b - // [3072, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - - cur = ggml_add(ctx0, cur, model.layers[il].b2); + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); } inpL = ggml_add(ctx0, cur, inpFF); @@ -3716,16 +3626,12 @@ static struct ggml_cgraph * llm_build_starcoder( // norm { - // [ 768, N] - inpL = ggml_norm(ctx0, inpL, norm_eps); - - // inpL = ln_f_g*inpL + ln_f_b - // [ 768, N] - inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.output_norm), model.output_norm_b); + cur = ggml_norm(ctx0, inpL, norm_eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b); } - ggml_set_name(inpL, "result_norm"); + ggml_set_name(cur, "result_norm"); - cur = ggml_mul_mat(ctx0, model.output, inpL); + cur = ggml_mul_mat(ctx0, model.output, cur); ggml_set_name(cur, "result_output"); ggml_build_forward_expand(gf, cur); From f82328ab658b76dbbf174fbf6bb911f5d2e9cf16 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 15 Sep 2023 17:56:49 +0300 Subject: [PATCH 19/20] metal : fix out-of-bounds access in soft_max kernels --- ggml-metal.metal | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-metal.metal b/ggml-metal.metal index f45b1490f..7f1a14868 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -118,7 +118,7 @@ kernel void kernel_soft_max( device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; // parallel max - float lmax = psrc0[tpitg[0]]; + float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY; for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) { lmax = MAX(lmax, psrc0[i00]); } @@ -158,7 +158,7 @@ kernel void kernel_soft_max_4( device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); // parallel max - float4 lmax4 = psrc4[tpitg[0]]; + float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY; for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) { lmax4 = fmax(lmax4, psrc4[i00]); } From 92a4f8687933c9f0a6847097e1a5086f6cd44720 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 15 Sep 2023 17:57:10 +0300 Subject: [PATCH 20/20] llama : make starcoder graph build more consistent with others --- llama.cpp | 98 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 37 deletions(-) diff --git a/llama.cpp b/llama.cpp index 7776211bd..21eebfbd7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3446,7 +3446,9 @@ static struct ggml_cgraph * llm_build_starcoder( const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = hparams.n_ctx; const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3508,28 +3510,44 @@ static struct ggml_cgraph * llm_build_starcoder( position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions); } + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(lctx.alloc, KQ_scale); + if (!ggml_allocr_is_measure(lctx.alloc)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + inpL = ggml_add(ctx0, token, position); + ggml_set_name(inpL, "inpL"); for (int il = 0; il < n_layer; ++il) { { // Norm cur = ggml_norm(ctx0, inpL, norm_eps); cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); - } { // Self Attention cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv); - struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - // store key and value to memory - if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * Qcur = tmpq; + struct ggml_tensor * Kcur = tmpk; + + { + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N)); + ggml_set_name(Vcur, "Vcur"); + + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); + ggml_set_name(k, "k"); + + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); @@ -3541,56 +3559,62 @@ static struct ggml_cgraph * llm_build_starcoder( Qcur, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), 0, 2, 1, 3); + ggml_set_name(Q, "Q"); struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), - n_embd/n_head, n_head, n_past + N), - 0, 2, 1, 3); //TODO: need to be tiled + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_past + N, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + ggml_set_name(K, "K"); // K * Q - // [n_past + N, N, 12] struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + ggml_set_name(KQ, "KQ"); - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) - ); + // KQ_scaled = KQ / sqrt(n_embd_head) + // KQ_scaled shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + ggml_set_name(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) - // [n_past + N, N, 12] struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + ggml_set_name(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) - // [n_past + N, N, 12] struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + ggml_set_name(KQ_soft_max, "KQ_soft_max"); - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - // [n_past + N, 64, 12] - struct ggml_tensor * V_trans = - ggml_cpy(ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd), - n_embd/n_head, n_head, n_past + N), - 1, 2, 0, 3), - ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head)); + // split cached V into n_head heads + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, + n_past + N, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + ggml_set_name(V, "V"); - // KQV = transpose(V) * KQ_soft_max - // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); +#if 1 + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + ggml_set_name(KQV, "KQV"); +#else + // make V contiguous in memory to speed up the matmul, however we waste time on the copy + // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation + // is there a better way? + struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head)); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); +#endif // KQV_merged = KQV.permute(0, 2, 1, 3) - // [64, 12, N] struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + ggml_set_name(KQV_merged, "KQV_merged"); + // cur = KQV_merged.contiguous().view(n_embd, N) cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_set_name(cur, "KQV_merged_contiguous"); } // Projection