From a03ce38455544121c5c00cf845def1443acd6ac8 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Mon, 2 Oct 2023 15:15:45 +0200
Subject: [PATCH 1/6] finetune : fix #3404 (#3437)

the shapes for init model of gqa models was wrong
---
 examples/finetune/finetune.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 8ca1874da..9ae4bc198 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -332,8 +332,8 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
     assert_shape_1d(layer.attention_norm, hparams.n_embd);
     assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd);
-    assert_shape_2d(layer.wk, hparams.n_embd, hparams.n_embd);
-    assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd);
+    assert_shape_2d(layer.wk, hparams.n_embd, hparams.n_embd_gqa());
+    assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa());
     assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd);
     assert_shape_1d(layer.ffn_norm, hparams.n_embd);
     assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff);

From 9476b012260a2fb6c67976582d64484ce7406ed9 Mon Sep 17 00:00:00 2001
From: cebtenzzre
Date: Mon, 2 Oct 2023 09:16:50 -0400
Subject: [PATCH 2/6] cmake : make CUDA flags more similar to the Makefile (#3420)

* cmake : fix misuse of cxx_flags

* cmake : make CUDA flags more similar to the Makefile

* cmake : fix MSVC build
---
 CMakeLists.txt | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 17d705422..221fc07a2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -343,8 +343,9 @@ if (LLAMA_MPI)
     set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
     add_compile_definitions(GGML_USE_MPI)
     add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
-    set(cxx_flags ${cxx_flags} -Wno-cast-qual)
-    set(c_flags ${c_flags} -Wno-cast-qual)
+    if (NOT MSVC)
+        add_compile_options(-Wno-cast-qual)
+    endif()
     set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
     set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
     # Even if you're only using the C header, C++ programs may bring in MPI
@@ -418,10 +419,11 @@ if (LLAMA_ALL_WARNINGS)
         set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
         set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
+        set(host_cxx_flags "")

         if (CMAKE_C_COMPILER_ID MATCHES "Clang")
             set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
-            set(cxx_flags ${cxx_flags} -Wmissing-prototypes -Wextra-semi)
+            set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)

             if (
                 (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
@@ -431,27 +433,38 @@ if (LLAMA_ALL_WARNINGS)
             endif()
         elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
             set(c_flags ${c_flags} -Wdouble-promotion)
-            set(cxx_flags ${cxx_flags} -Wno-array-bounds)
+            set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)

             if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
-                set(cxx_flags ${cxx_flags} -Wno-format-truncation)
+                set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
             endif()
             if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
-                set(cxx_flags ${cxx_flags} -Wextra-semi)
+                set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
             endif()
         endif()
     else()
         # todo : msvc
     endif()

-    add_compile_options(
-        ${warning_flags}
-        "$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
-
"$<$:${cxx_flags}>" - ) + set(c_flags ${c_flags} ${warning_flags}) + set(cxx_flags ${cxx_flags} ${warning_flags}) + add_compile_options("$<$:${c_flags}>" + "$<$:${cxx_flags} ${host_cxx_flags}>") endif() +if (NOT MSVC) + set(cuda_flags -Wno-pedantic) +endif() +set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags}) + +list(JOIN host_cxx_flags " " cuda_host_flags) # pass host compiler flags as a single argument +if (NOT cuda_host_flags STREQUAL "") + set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags}) +endif() + +add_compile_options("$<$:${cuda_flags}>") + if (WIN32) add_compile_definitions(_CRT_SECURE_NO_WARNINGS) From 0fe321031a5c670ab5fb5f49d69c4c91d783c93f Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Mon, 2 Oct 2023 14:58:46 -0400 Subject: [PATCH 3/6] gguf : general usability improvements (#3409) --- convert.py | 5 +- .../convert-finetune-checkpoint-to-gguf.py | 2 +- .../convert-train-checkpoint-to-gguf.py | 2 +- gguf-py/gguf/gguf.py | 210 ++++++++++-------- gguf-py/pyproject.toml | 2 +- 5 files changed, 120 insertions(+), 101 deletions(-) diff --git a/convert.py b/convert.py index 8bb6c7e41..e14b9ef80 100755 --- a/convert.py +++ b/convert.py @@ -41,8 +41,7 @@ if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'): NDArray: TypeAlias = 'np.ndarray[Any, Any]' -ARCH=gguf.MODEL_ARCH.LLAMA -NAMES=gguf.MODEL_TENSOR_NAMES[ARCH] +ARCH = gguf.MODEL_ARCH.LLAMA DEFAULT_CONCURRENCY = 8 # @@ -953,7 +952,7 @@ class OutputFile: of.close() def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: - wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type + wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32): return GGMLFileType.AllF32 diff --git a/examples/finetune/convert-finetune-checkpoint-to-gguf.py b/examples/finetune/convert-finetune-checkpoint-to-gguf.py index 96d6633ed..c8e14da87 100644 --- a/examples/finetune/convert-finetune-checkpoint-to-gguf.py +++ b/examples/finetune/convert-finetune-checkpoint-to-gguf.py @@ -313,7 +313,7 @@ class ModelParams: gguf_writer.add_feed_forward_length(self.get_n_ff()) def tensor_name(key, bid=None, suffix=".weight"): - return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + suffix + return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix class Layer: def __init__(self, params, lora_params, bid): diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py index 351e7bc2d..887ed2e21 100644 --- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py @@ -364,7 +364,7 @@ class ModelParams: gguf_writer.add_feed_forward_length(self.get_n_ff()) def tensor_name(key, bid=None): - return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + ".weight" + return gguf.TENSOR_NAMES[key].format(bid=bid) + ".weight" class Layer: def __init__(self, params, bid): diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 598cf8e59..e83187d30 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -118,76 +118,97 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.STARCODER: "starcoder", } -MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = { - MODEL_ARCH.LLAMA: { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", - MODEL_TENSOR.OUTPUT_NORM: 
"output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.ROPE_FREQS: "rope_freqs", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", - MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", - MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", - MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - }, - MODEL_ARCH.GPTNEOX: { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", - MODEL_TENSOR.OUTPUT_NORM: "output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - }, - MODEL_ARCH.FALCON: { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", - MODEL_TENSOR.OUTPUT_NORM: "output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", - MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - }, - MODEL_ARCH.BAICHUAN: { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", - MODEL_TENSOR.OUTPUT_NORM: "output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.ROPE_FREQS: "rope_freqs", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", - MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", - MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", - MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - }, - MODEL_ARCH.STARCODER: { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", - MODEL_TENSOR.POS_EMBD: "position_embd", - MODEL_TENSOR.OUTPUT_NORM: "output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - }, - MODEL_ARCH.GPT2: { +TENSOR_NAMES: dict[MODEL_TENSOR, str] = { + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.POS_EMBD: "position_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ROPE_FREQS: "rope_freqs", + + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", + MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", + MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", +} + +MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { + MODEL_ARCH.LLAMA: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + 
MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.GPTNEOX: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.FALCON: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_NORM_2, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.BAICHUAN: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.STARCODER: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.POS_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.GPT2: [ # TODO - }, + ], # TODO } @@ -338,28 +359,24 @@ class TensorNameMap: mapping: dict[str, tuple[MODEL_TENSOR, str]] - tensor_names: dict[MODEL_TENSOR, str] - def __init__(self, arch: MODEL_ARCH, n_blocks: int): - mapping = self.mapping = {} - tensor_names = self.tensor_names = MODEL_TENSOR_NAMES[arch] + self.mapping = {} for tensor, keys in self.mappings_cfg.items(): - tensor_name = tensor_names.get(tensor) - if tensor_name is None: + if tensor not in MODEL_TENSORS[arch]: continue - mapping[tensor_name] = (tensor, tensor_name) + tensor_name = TENSOR_NAMES[tensor] + self.mapping[tensor_name] = (tensor, tensor_name) for key in keys: - mapping[key] = (tensor, tensor_name) + self.mapping[key] = (tensor, tensor_name) for bid in range(n_blocks): for tensor, keys in self.block_mappings_cfg.items(): - tensor_name = tensor_names.get(tensor) - if tensor_name is None: + if tensor not in MODEL_TENSORS[arch]: continue - tensor_name = tensor_name.format(bid = bid) - mapping[tensor_name] = (tensor, tensor_name) + tensor_name = TENSOR_NAMES[tensor].format(bid = bid) + self.mapping[tensor_name] = (tensor, tensor_name) for key in keys: key = key.format(bid = bid) - mapping[key] = (tensor, tensor_name) + self.mapping[key] = (tensor, tensor_name) def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None: result = self.mapping.get(key) @@ -800,22 +817,25 @@ class SpecialVocab: special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad') special_token_ids: dict[str, int] = {} - def __init__(self, path: Path, load_merges: bool = False, special_token_types: tuple[str, ...] | None = None): + def __init__( + self, path: str | os.PathLike[str], load_merges: bool = False, + special_token_types: tuple[str, ...] 
| None = None, + ): self.special_token_ids = {} self.load_merges = load_merges if special_token_types is not None: self.special_token_types = special_token_types - self.load(path) + self._load(Path(path)) - def load(self, path: Path): - if not self.try_load_from_tokenizer_json(path): - self.try_load_from_config_json(path) + def _load(self, path: Path) -> None: + if not self._try_load_from_tokenizer_json(path): + self._try_load_from_config_json(path) - def try_load_from_tokenizer_json(self, path: Path) -> bool: + def _try_load_from_tokenizer_json(self, path: Path) -> bool: tokenizer_file = path / 'tokenizer.json' if not tokenizer_file.is_file(): return False - with open(tokenizer_file, 'r', encoding = 'utf-8') as f: + with open(tokenizer_file, encoding = 'utf-8') as f: tokenizer = json.load(f) if self.load_merges: merges = tokenizer.get('model', {}).get('merges') @@ -825,7 +845,7 @@ class SpecialVocab: added_tokens = tokenizer.get('added_tokens') if added_tokens is None or not tokenizer_config_file.is_file(): return True - with open(tokenizer_config_file, 'r', encoding = 'utf-8') as f: + with open(tokenizer_config_file, encoding = 'utf-8') as f: tokenizer_config = json.load(f) for typ in self.special_token_types: entry = tokenizer_config.get(f'{typ}_token') @@ -844,11 +864,11 @@ class SpecialVocab: break return True - def try_load_from_config_json(self, path: Path) -> bool: + def _try_load_from_config_json(self, path: Path) -> bool: config_file = path / 'config.json' if not config_file.is_file(): return False - with open(config_file, 'r', encoding = 'utf-8') as f: + with open(config_file, encoding = 'utf-8') as f: config = json.load(f) for typ in self.special_token_types: maybe_token_id = config.get(f'{typ}_token_id') @@ -856,7 +876,7 @@ class SpecialVocab: self.special_token_ids[typ] = maybe_token_id return True - def add_to_gguf(self, gw: GGUFWriter): + def add_to_gguf(self, gw: GGUFWriter) -> None: if len(self.merges) > 0: print(f'gguf: Adding {len(self.merges)} merge(s).') gw.add_token_merges(self.merges) @@ -868,8 +888,8 @@ class SpecialVocab: print(f'gguf: Setting special token type {typ} to {tokid}') handler(tokid) - def __repr__(self): - return f'' + def __repr__(self) -> str: + return f'' # Example usage: diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index 9489ccd6f..400607ce1 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.3.3" +version = "0.4.0" description = "Write ML models in GGUF for GGML" authors = ["GGML "] packages = [ From 29a404a951fb0b3f9c3b6ab8c4c9c76ac50d2bb3 Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Mon, 2 Oct 2023 15:20:28 -0400 Subject: [PATCH 4/6] gguf : add BERT, MPT, and GPT-J arch info (#3408) --- gguf-py/gguf/gguf.py | 170 +++++++++++++++++++++++++++++-------------- 1 file changed, 117 insertions(+), 53 deletions(-) diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index e83187d30..c975da0cb 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -85,10 +85,12 @@ class MODEL_ARCH(IntEnum): GPTNEOX : int = auto() MPT : int = auto() STARCODER : int = auto() + BERT : int = auto() class MODEL_TENSOR(IntEnum): TOKEN_EMBD : int = auto() + TOKEN_TYPES : int = auto() POS_EMBD : int = auto() OUTPUT : int = auto() OUTPUT_NORM : int = auto() @@ -116,10 +118,12 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.GPTNEOX: "gptneox", MODEL_ARCH.MPT: "mpt", MODEL_ARCH.STARCODER: "starcoder", + MODEL_ARCH.BERT: "bert", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = 
{ MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.TOKEN_TYPES: "token_types", MODEL_TENSOR.POS_EMBD: "position_embd", MODEL_TENSOR.OUTPUT_NORM: "output_norm", MODEL_TENSOR.OUTPUT: "output", @@ -206,6 +210,43 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.BERT: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_TYPES, + MODEL_TENSOR.POS_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.MPT: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.GPTJ: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.GPT2: [ # TODO ], @@ -229,31 +270,40 @@ class TensorNameMap: mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { # Token embeddings MODEL_TENSOR.TOKEN_EMBD: ( - "gpt_neox.embed_in", # gptneox - "transformer.wte", # gpt2 mpt - "transformer.word_embeddings", # falcon - "model.embed_tokens", # llama-hf - "tok_embeddings", # llama-pth + "gpt_neox.embed_in", # gptneox + "transformer.wte", # gpt2 gpt-j mpt + "transformer.word_embeddings", # falcon + "model.embed_tokens", # llama-hf + "tok_embeddings", # llama-pth + "embeddings.word_embeddings", # bert + ), + + # Token type embeddings + MODEL_TENSOR.TOKEN_TYPES: ( + "embeddings.token_type_embeddings", # bert ), # Position embeddings MODEL_TENSOR.POS_EMBD: ( - "transformer.wpe", # gpt2 + "transformer.wpe", # gpt2 + "embeddings.position_embeddings", # bert ), # Output MODEL_TENSOR.OUTPUT: ( - "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan - "output", # llama-pth + "embed_out", # gptneox + "lm_head", # gpt2 gpt-j mpt falcon llama-hf baichuan + "output", # llama-pth ), # Output norm MODEL_TENSOR.OUTPUT_NORM: ( - "gpt_neox.final_layer_norm", # gptneox - "transformer.ln_f", # gpt2 falcon - "model.norm", # llama-hf baichuan - "norm", # llama-pth + "gpt_neox.final_layer_norm", # gptneox + "transformer.ln_f", # gpt2 gpt-j falcon + "model.norm", # llama-hf baichuan + "norm", # llama-pth + "embeddings.LayerNorm", # bert + "transformer.norm_f", # mpt ), # Rope frequencies @@ -265,13 +315,14 @@ class TensorNameMap: block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { # Attention norm MODEL_TENSOR.ATTN_NORM: ( - "gpt_neox.layers.{bid}.input_layernorm", # gptneox - "transformer.h.{bid}.ln_1", # gpt2 - "transformer.blocks.{bid}.norm_1", # mpt - "transformer.h.{bid}.input_layernorm", # falcon7b - "transformer.h.{bid}.ln_mlp", # falcon40b - "model.layers.{bid}.input_layernorm", # llama-hf - "layers.{bid}.attention_norm", # llama-pth + "gpt_neox.layers.{bid}.input_layernorm", # gptneox + "transformer.h.{bid}.ln_1", # gpt2 gpt-j + "transformer.blocks.{bid}.norm_1", # mpt + "transformer.h.{bid}.input_layernorm", # falcon7b + "transformer.h.{bid}.ln_mlp", # falcon40b + "model.layers.{bid}.input_layernorm", # llama-hf + "layers.{bid}.attention_norm", # llama-pth + "encoder.layer.{bid}.attention.output.LayerNorm", # bert ), # Attention norm 2 @@ -281,38 +332,46 @@ class 
TensorNameMap: # Attention query-key-value MODEL_TENSOR.ATTN_QKV: ( - "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox - "transformer.h.{bid}.attn.c_attn", # gpt2 - "transformer.blocks.{bid}.attn.Wqkv", # mpt - "transformer.h.{bid}.self_attention.query_key_value", # falcon + "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox + "transformer.h.{bid}.attn.c_attn", # gpt2 + "transformer.blocks.{bid}.attn.Wqkv", # mpt + "transformer.h.{bid}.self_attention.query_key_value", # falcon ), # Attention query MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", # llama-hf - "layers.{bid}.attention.wq", # llama-pth + "model.layers.{bid}.self_attn.q_proj", # llama-hf + "layers.{bid}.attention.wq", # llama-pth + "encoder.layer.{bid}.attention.self.query", # bert + "transformer.h.{bid}.attn.q_proj", # gpt-j ), # Attention key MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", # llama-hf - "layers.{bid}.attention.wk", # llama-pth + "model.layers.{bid}.self_attn.k_proj", # llama-hf + "layers.{bid}.attention.wk", # llama-pth + "encoder.layer.{bid}.attention.self.key", # bert + "transformer.h.{bid}.attn.k_proj", # gpt-j ), # Attention value MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", # llama-hf - "layers.{bid}.attention.wv", # llama-pth + "model.layers.{bid}.self_attn.v_proj", # llama-hf + "layers.{bid}.attention.wv", # llama-pth + "encoder.layer.{bid}.attention.self.value", # bert + "transformer.h.{bid}.attn.v_proj", # gpt-j ), # Attention output MODEL_TENSOR.ATTN_OUT: ( - "gpt_neox.layers.{bid}.attention.dense", # gptneox - "transformer.h.{bid}.attn.c_proj", # gpt2 - "transformer.blocks.{bid}.attn.out_proj", # mpt - "transformer.h.{bid}.self_attention.dense", # falcon - "model.layers.{bid}.self_attn.o_proj", # llama-hf - "layers.{bid}.attention.wo", # llama-pth + "gpt_neox.layers.{bid}.attention.dense", # gptneox + "transformer.h.{bid}.attn.c_proj", # gpt2 + "transformer.blocks.{bid}.attn.out_proj", # mpt + "transformer.h.{bid}.self_attention.dense", # falcon + "model.layers.{bid}.self_attn.o_proj", # llama-hf + "layers.{bid}.attention.wo", # llama-pth + "encoder.layer.{bid}.attention.output.dense", # bert + "transformer.h.{bid}.attn.out_proj", # gpt-j ), # Rotary embeddings @@ -323,21 +382,24 @@ class TensorNameMap: # Feed-forward norm MODEL_TENSOR.FFN_NORM: ( - "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox - "transformer.h.{bid}.ln_2", # gpt2 - "transformer.blocks.{bid}.norm_2", # mpt - "model.layers.{bid}.post_attention_layernorm", # llama-hf - "layers.{bid}.ffn_norm", # llama-pth + "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox + "transformer.h.{bid}.ln_2", # gpt2 + "transformer.blocks.{bid}.norm_2", # mpt + "model.layers.{bid}.post_attention_layernorm", # llama-hf + "layers.{bid}.ffn_norm", # llama-pth + "encoder.layer.{bid}.output.LayerNorm", # bert ), # Feed-forward up MODEL_TENSOR.FFN_UP: ( - "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox - "transformer.h.{bid}.mlp.c_fc", # gpt2 - "transformer.blocks.{bid}.ffn.up_proj", # mpt - "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon - "model.layers.{bid}.mlp.up_proj", # llama-hf - "layers.{bid}.feed_forward.w3", # llama-pth + "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox + "transformer.h.{bid}.mlp.c_fc", # gpt2 + "transformer.blocks.{bid}.ffn.up_proj", # mpt + "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon + "model.layers.{bid}.mlp.up_proj", # llama-hf + "layers.{bid}.feed_forward.w3", # llama-pth + "encoder.layer.{bid}.intermediate.dense", # bert 
+ "transformer.h.{bid}.mlp.fc_in", # gpt-j ), # Feed-forward gate @@ -348,12 +410,14 @@ class TensorNameMap: # Feed-forward down MODEL_TENSOR.FFN_DOWN: ( - "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox - "transformer.h.{bid}.mlp.c_proj", # gpt2 - "transformer.blocks.{bid}.ffn.down_proj", # mpt - "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon - "model.layers.{bid}.mlp.down_proj", # llama-hf - "layers.{bid}.feed_forward.w2", # llama-pth + "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox + "transformer.h.{bid}.mlp.c_proj", # gpt2 + "transformer.blocks.{bid}.ffn.down_proj", # mpt + "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon + "model.layers.{bid}.mlp.down_proj", # llama-hf + "layers.{bid}.feed_forward.w2", # llama-pth + "encoder.layer.{bid}.output.dense", # bert + "transformer.h.{bid}.mlp.fc_out", # gpt-j ), } From 665018c749101e81c816675198e731e47d6b1dbe Mon Sep 17 00:00:00 2001 From: shibe2 Date: Mon, 2 Oct 2023 23:26:15 +0400 Subject: [PATCH 5/6] CLBlast: Add broadcast support for matrix multiplication (#3402) Broadcast src0 into src1 across dimensions 2 and 3 when needed. This is required for models that use GQA. --- ggml-opencl.cpp | 90 ++++++++++++++++++++++++++++++++++++------------- ggml.c | 5 --- 2 files changed, 67 insertions(+), 28 deletions(-) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index c7d9150fe..7e4069d76 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1476,10 +1476,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; + const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + const float alpha = 1.0f; const float beta = 0.0f; const int x_ne = ne01 * ne00; @@ -1498,13 +1503,22 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { + int64_t pi02 = -1; + int64_t pi03 = -1; + + for (int64_t i13 = 0; i13 < ne13; i13++) { + int64_t i03 = i13 / r3; + + for (int64_t i12 = 0; i12 < ne12; i12++) { + int64_t i02 = i12 / r2; + // copy data to device - if (src0->backend != GGML_BACKEND_GPU) { + if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) { CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); + pi02 = i02; + pi03 = i03; } - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL)); + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); CL_CHECK(clFinish(queue)); @@ -1525,7 +1539,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } // copy dst to host - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); } } @@ -1547,6 +1561,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; const int nb10 = src1->nb[0]; const int nb11 = src1->nb[1]; @@ -1556,6 +1572,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor 
* sr const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; + const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f); const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f); const int x_ne = ne01 * ne00; @@ -1577,32 +1596,41 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr bool src1_cont_rows = nb10 == sizeof(float); bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { + int64_t pi02 = -1; + int64_t pi03 = -1; + + for (int64_t i13 = 0; i13 < ne13; i13++) { + int64_t i03 = i13 / r3; + + for (int64_t i12 = 0; i12 < ne12; i12++) { + int64_t i02 = i12 / r2; + // copy src0 to device - if (src0->backend != GGML_BACKEND_GPU) { + if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) { CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); + pi02 = i02; + pi03 = i03; } // convert src1 to fp16 // TODO: use multiple threads - ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02); - char * src1i = (char *) src1->data + i03*nb13 + i02*nb12; + ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12); + char * src1i = (char *) src1->data + i13*nb13 + i12*nb12; if (src1_cont_rows) { if (src1_cont_cols) { ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); } else { - for (int64_t i01 = 0; i01 < ne11; i01++) { - ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10); + for (int64_t i11 = 0; i11 < ne11; i11++) { + ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10); } } } else { - for (int64_t i01 = 0; i01 < ne11; i01++) { - for (int64_t i00 = 0; i00 < ne10; i00++) { + for (int64_t i11 = 0; i11 < ne11; i11++) { + for (int64_t i10 = 0; i10 < ne10; i10++) { // very slow due to no inlining - tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10)); + tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10)); } } } @@ -1631,7 +1659,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr // copy dst to host, then convert to float CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); ggml_fp16_to_fp32_row(tmp, d, d_ne); } @@ -1652,12 +1680,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; const ggml_type type = src0->type; const bool mul_mat_vec = ne11 == 1; + const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + const float alpha = 1.0f; const float beta = 0.0f; const int x_ne = ne01 * ne00; @@ -1690,12 +1723,23 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * size_t ev_idx = 0; std::vector events; - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { + int64_t pi02 = -1; + int64_t pi03 = -1; + + for (int64_t i13 = 0; i13 < ne13; i13++) { + int64_t i03 = i13 / r3; + + for (int64_t i12 = 0; i12 < ne12; i12++) { + int64_t i02 = i12 / r2; + // copy src0 to device if necessary if (src0->backend == GGML_BACKEND_CPU) { - 
                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                if (i02 != pi02 || i03 != pi03) {
+                    events.emplace_back();
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                    pi02 = i02;
+                    pi03 = i03;
+                }
             } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = (cl_mem) src0->extra;
             } else {
@@ -1704,7 +1748,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 // copy src1 to device
                 events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));

                 // compute
                 const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
@@ -1725,7 +1769,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                 CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));

                 // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

                 events.emplace_back();
@@ -1749,7 +1793,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             }

             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));

             for (auto *event : events) {
                 clReleaseEvent(event);
diff --git a/ggml.c b/ggml.c
index 820fe2e74..bf1426d25 100644
--- a/ggml.c
+++ b/ggml.c
@@ -11621,11 +11621,6 @@ static void ggml_compute_forward_mul_mat(
 #if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
-        // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
-        // ref: https://github.com/ggerganov/ggml/pull/224
-        GGML_ASSERT(ne02 == ne12);
-        GGML_ASSERT(ne03 == ne13);
-
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
         }

From e78f0b0d0572168f328dd0e2ed3175a53fe52acc Mon Sep 17 00:00:00 2001
From: cebtenzzre
Date: Mon, 2 Oct 2023 15:38:43 -0400
Subject: [PATCH 6/6] cmake : increase minimum version for add_link_options (#3444)

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 221fc07a2..49b37f7d5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
+cmake_minimum_required(VERSION 3.13) # for add_link_options
 project("llama.cpp" C CXX)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
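
Usage note for downstream consumers of gguf-py after PATCH 3/6: the nested
gguf.MODEL_TENSOR_NAMES[arch][tensor] lookup is replaced by a flat TENSOR_NAMES
template map plus a per-architecture MODEL_TENSORS list (see the convert.py and
checkpoint-converter hunks above). The following is a minimal migration sketch,
not part of the patches; it assumes the gguf package re-exports TENSOR_NAMES,
MODEL_TENSORS, MODEL_ARCH and MODEL_TENSOR at the top level, as the diffs suggest.

    import gguf

    # Old API (gguf <= 0.3.3): name templates were nested per architecture.
    #   name = gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][gguf.MODEL_TENSOR.ATTN_Q]

    # New API (gguf 0.4.0): one flat template map keyed by tensor kind.
    name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"
    print(name)  # -> "blk.0.attn_q.weight"

    # Which tensors an architecture actually uses is now a separate list.
    if gguf.MODEL_TENSOR.ATTN_Q in gguf.MODEL_TENSORS[gguf.MODEL_ARCH.LLAMA]:
        print("llama defines a per-block attn_q tensor")

PATCH 5/6 applies the same broadcast idea on the C++ side: with r2 = ne12/ne02 and
r3 = ne13/ne03, each src1 slice (i12, i13) is paired with the src0 slice
(i12/r2, i13/r3), so the smaller K/V tensors of a GQA model are reused across the
repeated query groups instead of requiring ne02 == ne12 and ne03 == ne13.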