diff --git a/.gitignore b/.gitignore index 708e8582e..009297b42 100644 --- a/.gitignore +++ b/.gitignore @@ -98,3 +98,5 @@ tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe + +build-info.h diff --git a/common/common.cpp b/common/common.cpp index 6a7114200..9f8cb33f0 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -471,6 +471,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.lora_base = argv[i]; + } else if (arg == "--mlp-adapter") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.mlp_adapter = argv[i]; } else if (arg == "--mmproj") { if (++i >= argc) { invalid_param = true; @@ -950,8 +956,26 @@ std::tuple llama_init_from_gpt_par return std::make_tuple(nullptr, nullptr); } - auto cparams = llama_context_params_from_gpt_params(params); + if (llama_use_sparse_inference(model)) { + fprintf(stderr, "%s: postprocessing PowerInfer model '%s'\n", __func__, params.model.c_str()); + if (!params.mlp_adapter.empty()) { + fprintf(stderr, "%s: warning: --mlp-adapter is deprecated and has no effect\n", __func__); + int err = llama_model_apply_mlp_from_file(model, params.mlp_adapter.c_str(), true); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + } + if (llama_model_apply_augmentation(model) != 0) { + fprintf(stderr, "%s: error: failed to apply augmentation\n", __func__); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + } + + auto cparams = llama_context_params_from_gpt_params(params); llama_context * lctx = llama_new_context_with_model(model, cparams); if (lctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); @@ -981,6 +1005,8 @@ std::tuple llama_init_from_gpt_par params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; } + + { LOG("warming up the model with an empty run\n"); @@ -1320,6 +1346,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); } fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); + fprintf(stream, "mlp_adapter: %s\n", params.mlp_adapter.c_str()); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false"); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); diff --git a/common/common.h b/common/common.h index dd6b002eb..7714102ec 100644 --- a/common/common.h +++ b/common/common.h @@ -90,6 +90,8 @@ struct gpt_params { std::vector> lora_adapter; // lora adapter path with user defined scale std::string lora_base = ""; // base model path for the lora adapter + std::string mlp_adapter = ""; // sparse activation mlp adapter path + int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. 
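The common.cpp and common.h hunks above add the `--mlp-adapter` option and route sparse PowerInfer models through `llama_model_apply_mlp_from_file` and `llama_model_apply_augmentation` before the context is created. A minimal, self-contained sketch of that call sequence follows; the sparse-specific functions are the ones named in this patch, while `llama_model_default_params` and `llama_load_model_from_file` are assumed from the existing llama.h API, and the error handling mirrors `llama_init_from_gpt_params`.

#include "llama.h"
#include <cstdio>

// Sketch: load a PowerInfer GGUF model and apply the sparse-inference post-processing.
// Returns NULL on failure, freeing the model the same way llama_init_from_gpt_params does.
static llama_model * load_sparse_model(const char * gguf_path, const char * mlp_adapter_path) {
    llama_model_params mparams = llama_model_default_params();      // assumed existing llama.h API
    llama_model * model = llama_load_model_from_file(gguf_path, mparams);
    if (model == NULL) {
        return NULL;
    }
    if (llama_use_sparse_inference(model)) {
        // optional and deprecated: MLP predictors shipped as a separate adapter file
        if (mlp_adapter_path != NULL && *mlp_adapter_path != '\0') {
            if (llama_model_apply_mlp_from_file(model, mlp_adapter_path, true) != 0) {
                fprintf(stderr, "failed to apply mlp adapter\n");
                llama_free_model(model);
                return NULL;
            }
        }
        // builds the auxiliary structures needed by the sparse FFN path
        if (llama_model_apply_augmentation(model) != 0) {
            fprintf(stderr, "failed to apply augmentation\n");
            llama_free_model(model);
            return NULL;
        }
    }
    return model;
}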
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line // (which is more convenient to use for plotting) diff --git a/convert-hf-to-powerinfer-gguf.py b/convert-hf-to-powerinfer-gguf.py new file mode 100644 index 000000000..87df93be7 --- /dev/null +++ b/convert-hf-to-powerinfer-gguf.py @@ -0,0 +1,601 @@ +#!/usr/bin/env python3 + +from __future__ import annotations +from abc import ABC, abstractmethod + +import argparse +import contextlib +import json +import os +import re +import struct +import sys +from enum import IntEnum +from pathlib import Path +from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Optional, cast + +import numpy as np +import torch +import torch.nn as tnn + +if TYPE_CHECKING: + from torch import Tensor + +if "NO_LOCAL_GGUF" not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / "gguf-py")) +import gguf + + +###### MODEL DEFINITIONS ###### + + +class SentencePieceTokenTypes(IntEnum): + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 + USER_DEFINED = 4 + UNUSED = 5 + BYTE = 6 + + +class ReluMLP(tnn.Module): + def __init__(self, input_dim: int, hidden_dim: int, output_dim: int): + super(ReluMLP, self).__init__() + self.fc1 = tnn.Linear(input_dim, hidden_dim, bias=False) + self.relu = tnn.ReLU() + self.fc2 = tnn.Linear(hidden_dim, output_dim, bias=False) + + def forward(self, x): + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + return x + + @staticmethod + def from_file(model_file: Path): + model = torch.load(model_file, map_location="cpu") + hidden_size, input_size = model.get("fc1.weight").shape + output_size, _ = model.get("fc2.weight").shape + mlp = ReluMLP(input_size, hidden_size, output_size) + mlp.load_state_dict(model) + return mlp + + +class Model(ABC): + """Base class for model conversion""" + + def __init__( + self, + dir_model: Path, + dir_mlp_pred: Path, + ftype: int, + fname_out: Path, + is_big_endian: bool, + ): + self.dir_model = dir_model + self.dir_mlp_pred = dir_mlp_pred + self.ftype = ftype + self.fname_out = fname_out + self.is_big_endian = is_big_endian + self.endianess = ( + gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + ) + self.is_safetensors = self._is_model_safetensors() + self.num_parts = Model.count_model_parts( + self.dir_model, ".safetensors" if self.is_safetensors else ".bin" + ) + self.part_names = self._get_part_names() + self.hparams = Model.load_hparams(self.dir_model) + self.model_arch = self._get_model_architecture() + self.gguf_writer = gguf.GGUFWriter( + fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file = False + ) + + def set_vocab(self): + self._set_vocab_gpt2() + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for model_layer, part_name in self._get_mlp_part_layer_names(): + print(f"gguf: loading mlp part '{part_name}'") + mlp_model = ReluMLP.from_file(self.dir_mlp_pred / part_name) + for name, data in mlp_model.state_dict().items(): + yield f"blk.{model_layer}.{name}", data + + for part_name in self.part_names: + print(f"gguf: loading model part '{part_name}'") + ctx: ContextManager[Any] + if self.is_safetensors: + from safetensors import safe_open + + ctx = cast( + ContextManager[Any], + safe_open(self.dir_model / part_name, framework="pt", device="cpu"), + ) + else: + ctx = contextlib.nullcontext( + torch.load(self.dir_model / part_name, map_location="cpu") + ) + + with ctx as model_part: + for name in model_part.keys(): + data = ( + model_part.get_tensor(name) + if 
self.is_safetensors + else model_part[name] + ) + yield name, data + + @abstractmethod + def set_gguf_parameters(self): + pass + # self.gguf_writer.add_name(self.dir_model.name) + # self.gguf_writer.add_block_count( + # self.hparams.get( + # "n_layers", + # self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), + # ) + # ) + # if (n_ctx := self.hparams.get("max_position_embeddings")) is not None: + # self.gguf_writer.add_context_length(n_ctx) + # if (n_embd := self.hparams.get("hidden_size")) is not None: + # self.gguf_writer.add_embedding_length(n_embd) + # if (n_ff := self.hparams.get("intermediate_size")) is not None: + # self.gguf_writer.add_feed_forward_length(n_ff) + # if (n_head := self.hparams.get("num_attention_head")) is not None: + # self.gguf_writer.add_head_count(n_head) + # self.gguf_writer.add_parallel_residual( + # self.hparams.get("use_parallel_residual", True) + # ) + + @abstractmethod + def write_tensors(self): + pass + + def write(self): + self.write_tensors() + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file() + self.gguf_writer.close() + + def write_vocab(self): + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.close() + + @staticmethod + def count_model_parts(dir_model: Path, prefix: str) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.endswith(prefix): + num_parts += 1 + + return num_parts + + @staticmethod + def load_hparams(dir_model): + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + return json.load(f) + + @staticmethod + def from_model_architecture(model_architecture): + if model_architecture in ("FalconForCausalLM", "RWForCausalLM"): + return FalconModel + if model_architecture == "LlamaForCausalLM": + return LlamaModel + + raise NotImplementedError(f'Architecture "{model_architecture}" not supported!') + + def _is_model_safetensors(self) -> bool: + return Model.count_model_parts(self.dir_model, ".safetensors") > 0 + + def _get_mlp_part_layer_names(self): + """Returns a generator of (index, name) for MLP predictors of each model layer""" + n_mlp_parts = Model.count_model_parts(self.dir_mlp_pred, ".pt") + return ((n, f"model_{n}.pt") for n in range(n_mlp_parts)) + + def _get_part_names(self): + if self.is_safetensors: + if self.num_parts == 1: # there's only one .safetensors file + return ("model.safetensors",) + return ( + f"model-{n:05}-of-{self.num_parts:05}.safetensors" + for n in range(1, self.num_parts + 1) + ) + + if self.num_parts == 1: # there's only one .bin file + return ("pytorch_model.bin",) + return ( + f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" + for n in range(1, self.num_parts + 1) + ) + + def _get_model_architecture(self) -> gguf.MODEL_ARCH: + arch = self.hparams["architectures"][0] + if arch == "FalconForCausalLM": + return gguf.MODEL_ARCH.FALCON + if arch == "RWForCausalLM" or arch == "LlamaForCausalLM": + return gguf.MODEL_ARCH.LLAMA + + raise NotImplementedError(f'Architecture "{arch}" not supported!') + + def _translate_tensor_key( + self, key: str, try_suffixes=(".weight", ".bias") + ) -> Optional[str]: + block_count = self.hparams.get( + "n_layers", + self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), + ) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + arch_tensor_key = tensor_map.get_name(key, try_suffixes=try_suffixes) + if arch_tensor_key is not None: + return arch_tensor_key + # check and 
handle ReluMLP layers + mlp_match = re.match(r"^blk\.\d+\.fc\d\.weight$", key) + if mlp_match: + return mlp_match.group(0) + return None + + def _set_vocab_gpt2(self): + dir_model = self.dir_model + hparams = self.hparams + tokens: list[bytearray] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer # type: ignore[attr-defined] + + tokenizer = AutoTokenizer.from_pretrained(dir_model) + vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) + assert max(tokenizer.vocab.values()) < vocab_size + + reverse_vocab = { + id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items() + } + added_vocab = tokenizer.get_added_vocab() + + for i in range(vocab_size): + if i not in reverse_vocab: + pad_token = f"[PAD{i}]".encode("utf-8") + tokens.append(bytearray(pad_token)) + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_sentencepiece(self): + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / "tokenizer.model" + + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + if not tokenizer_path.is_file(): + print(f"Error: Missing {tokenizer_path}", file=sys.stderr) + sys.exit(1) + + tokenizer = SentencePieceProcessor(str(tokenizer_path)) + vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size()) + + for token_id in range(vocab_size): + piece = tokenizer.id_to_piece(token_id) + text = piece.encode("utf-8") + score = tokenizer.get_score(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.is_unknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.is_control(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.is_unused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.is_byte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + added_tokens_file = self.dir_model / "added_tokens.json" + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + tokens.append(key.encode("utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + +class LlamaModel(Model): + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + self.gguf_writer.add_name("Llama") + self.gguf_writer.add_context_length(2048) # not in config.json + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + 
self.gguf_writer.add_rope_dimension_count( + self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + ) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith( + ( + ".attention.masked_bias", + ".attention.bias", + ".attention.rotary_emb.inv_freq", + ) + ): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = self._translate_tensor_key(name) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + # We need to transpose the weight matrices for the FFN Down layers to support the + # Axpy operation in PowerInfer. So we don't need to transpose them at runtime. + if "ffn_down" in new_name: + new_name = new_name.replace("ffn_down", "ffn_down_t") + data = data.T + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +class FalconModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams.get("num_hidden_layers") + if block_count is None: + block_count = self.hparams["n_layer"] # old name + + n_head = self.hparams.get("num_attention_heads") + if n_head is None: + n_head = self.hparams["n_head"] # old name + + n_head_kv = self.hparams.get("num_kv_heads") + if n_head_kv is None: + n_head_kv = self.hparams.get("n_head_kv", 1) # old name + + self.gguf_writer.add_name("Falcon") + self.gguf_writer.add_context_length(2048) # not in config.json + self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + n_head = self.hparams.get("num_attention_heads") + if n_head is None: + n_head = self.hparams["n_head"] # old name + + n_head_kv = self.hparams.get("num_kv_heads") + if n_head_kv is None: + n_head_kv = self.hparams.get("n_head_kv", 1) # old name + + head_dim = self.hparams["hidden_size"] // n_head + + for name, data_torch in self.get_tensors(): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if 
data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # QKV tensor transform + # The original query_key_value tensor contains n_head_kv "kv groups", + # each consisting of n_head/n_head_kv query weights followed by one key + # and one value weight (shared by all query heads in the kv group). + # This layout makes it a big pain to work with in GGML. + # So we rearrange them here,, so that we have n_head query weights + # followed by n_head_kv key weights followed by n_head_kv value weights, + # in contiguous fashion. + # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py + + if "query_key_value" in name: + qkv = data_torch.view( + n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head + ) + q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) + k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) + v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) + data_torch = torch.cat((q, k, v)).reshape_as(data_torch) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = self._translate_tensor_key(name) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + # We need to transpose the weight matrices for the FFN Down layers to support the + # Axpy operation in PowerInfer. So we don't need to transpose them at runtime. + if "ffn_down" in new_name: + new_name = new_name.replace("ffn_down", "ffn_down_t") + data = data.T + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if ( + self.ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +###### CONVERSION LOGIC ###### + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Convert a huggingface model to a GGML compatible file" + ) + parser.add_argument( + "--vocab-only", + action="store_true", + help="extract only the vocab", + ) + parser.add_argument( + "--outfile", + type=Path, + help="path to write to; default: based on input", + ) + parser.add_argument( + "--outtype", + type=str, + choices=["f32", "f16"], + default="f16", + help="output format - use f32 for float32, f16 for float16", + ) + parser.add_argument( + "--bigendian", + action="store_true", + help="model is executed on big endian machine", + ) + parser.add_argument( + "model", + type=Path, + help="directory containing model file", + ) + parser.add_argument( + "mlp_predictors", + type=Path, + help="directory containing MLP predictors for model", + ) + + return parser.parse_args() + + +args = parse_args() + +dir_model = args.model +dir_mlp_pred = args.mlp_predictors +if not dir_model.is_dir(): + print(f"Error: {args.model} is not a directory", file=sys.stderr) + sys.exit(1) +if not dir_mlp_pred.is_dir(): + print(f"Error: {args.mlp_predictors} is not a directory", file=sys.stderr) + sys.exit(1) + +ftype_map = { + "f32": gguf.GGMLQuantizationType.F32, + "f16": 
gguf.GGMLQuantizationType.F16, +} + +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f"ggml-model-{args.outtype}.gguf" + +print(f"Loading model: {dir_model.name}") + +hparams = Model.load_hparams(dir_model) + +model_class = Model.from_model_architecture(hparams["architectures"][0]) +model_instance = model_class( + dir_model, dir_mlp_pred, ftype_map[args.outtype], fname_out, args.bigendian +) + +print("Set model parameters") +model_instance.set_gguf_parameters() + +print("Set model tokenizer") +model_instance.set_vocab() + +if args.vocab_only: + print(f"Exporting model vocab to '{fname_out}'") + model_instance.write_vocab() +else: + print(f"Exporting model to '{fname_out}'") + model_instance.write() + +# post-process: write another unique file header to distinguish from the origianl GGUF file +with open(fname_out, "r+b") as fout: + POWERINFER_MAGIC = int.from_bytes(b"PWRI", "little") + fout.write(struct.pack(" Tensor: return self.load().astype(data_type) return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}') + + def transposed(self) -> LazyTensor: + def load() -> Tensor: + loaded = self.load() + assert isinstance(loaded, UnquantizedTensor), f'Cannot transpose {loaded}' + loaded.ndarray = loaded.ndarray.T + return loaded + return LazyTensor(load, self.shape[::-1], self.data_type, f'transpose {self.description}') def validate_conversion_to(self, data_type: DataType) -> None: if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions: @@ -571,7 +579,8 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus: except StopIteration: vocab = None - if any("model.embed_tokens.weight" in mp.model for mp in models_plus): + if any("model.embed_tokens.weight" in mp.model for mp in models_plus) or \ + any("model.layers.0.fc1.weight" in mp.model for mp in models_plus): # Transformers models put different tensors in different files, but # don't split indivdual tensors between files. model: LazyModel = {} @@ -992,6 +1001,18 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: return out +def postprocess_transpose(model: LazyModel) -> LazyModel: + """Transpose ffn_down matrices for Axpy ops.""" + out: LazyModel = {} + + for name, lazy_tensor in model.items(): + if name.endswith(".ffn_down.weight"): + out[name.replace("ffn_down", "ffn_down_t")] = lazy_tensor.transposed() + else: + out[name] = lazy_tensor + + return out + def nth_multifile_path(path: Path, n: int) -> Path | None: '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return the nth path in the model. @@ -1003,7 +1024,9 @@ def nth_multifile_path(path: Path, n: int) -> Path | None: # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc. (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'), # x.bin, x.bin.1, etc. - (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}') + (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}'), + # x_0.pt, x_1.pt, etc. 
+ (r'(_[0-9]+)?\.pt$', fr'_{n}.pt'), ] for regex, replacement in patterns: if re.search(regex, path.name): @@ -1057,6 +1080,25 @@ def load_some_model(path: Path) -> ModelPlus: model_plus = merge_multifile_models(models_plus) return model_plus +def load_mlp_model(path: Path) -> ModelPlus: + '''Load MLP models for sparse attention from directory.''' + assert path.is_dir(), f"MLP model path {path} is not a directory" + + first_model_path = path / "model_0.pt" + assert first_model_path.resolve(), f"MLP model path {path} does not contain model_0.pt" + + model_paths = find_multifile_paths(first_model_path) + models_plus: list[ModelPlus] = [] + for model_path in model_paths: + # find number in model_path + model_layer = int(re.search(r'model_(\d+).pt', str(model_path)).group(1)) + print(f"Loading MLP model file {model_path}") + mlp_model = lazy_load_file(model_path) + mlp_model.model = {f"model.layers.{model_layer}.{name}": tensor for name, tensor in mlp_model.model.items()} + models_plus.append(mlp_model) + + return merge_multifile_models(models_plus) + def load_vocab(path: Path, vocabtype: str | None) -> Vocab: # Be extra-friendly and accept either a file or a directory. Also, if it's @@ -1125,6 +1167,7 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin, *.safetensors)") + parser.add_argument("mlp_model", type=Path, help="MLP model for sparse attention") parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm") parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY) @@ -1138,6 +1181,8 @@ def main(args_in: list[str] | None = None) -> None: if not args.vocab_only: model_plus = load_some_model(args.model) + mlp_predictor_plus = load_mlp_model(args.mlp_model) + model_plus = merge_multifile_models([model_plus, mlp_predictor_plus]) else: model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) @@ -1192,6 +1237,7 @@ def main(args_in: list[str] | None = None) -> None: model = model_plus.model model = convert_model_names(model, params) + model = postprocess_transpose(model) ftype = pick_output_type(model, args.outtype) model = convert_to_output_type(model, ftype) outfile = args.outfile or default_outfile(model_plus.paths, ftype) @@ -1202,6 +1248,11 @@ def main(args_in: list[str] | None = None) -> None: OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess) print(f"Wrote {outfile}") + # post-process: write another unique file header to distinguish from the origianl GGUF file + with open(outfile, "r+b") as fout: + POWERINFER_MAGIC = int.from_bytes(b"PWRI", "little") + fout.write(struct.pack("= 7) { + params.n_threads = std::atoi(argv[6]); + } + + if (argc >= 8) { + params.mlp_adapter = argv[7]; + } + + printf("params: model = %s, prompt = %s, n_parallel = %d, n_len = %d, n_gpu_layers = %d, n_threads = %d, mlp_adapter = %s\n", + params.model.c_str(), params.prompt.c_str(), n_parallel, n_len, n_gpu_layers, params.n_threads, 
params.mlp_adapter.c_str()); + if (params.prompt.empty()) { params.prompt = "Hello my name is"; } @@ -65,6 +76,21 @@ int main(int argc, char ** argv) { return 1; } + if (!params.mlp_adapter.empty()) { + int err = llama_model_apply_mlp_from_file(model, params.mlp_adapter.c_str(), true); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__); + llama_free_model(model); + return 1; + } + } + + if (llama_model_apply_augmentation(model) != 0) { + fprintf(stderr, "%s: error: failed to apply model augmentation\n", __func__); + llama_free_model(model); + return 1; + } + // tokenize the prompt std::vector tokens_list; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 7be63925f..78a52a84d 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -88,6 +88,8 @@ #define CC_OFFSET_AMD 1000000 #define CC_RDNA2 (CC_OFFSET_AMD + 1030) +#define GGML_CUDA_MAX_NODES 8192 + // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant // for large computational tasks. the drawback is that this requires some extra amount of VRAM: @@ -507,6 +509,19 @@ static __global__ void add_f32(const float * x, const float * y, float * dst, co dst[i] = x[i] + y[i%ky]; } +static __global__ void add_f32_idx(const float * x, const float * y, float * dst, float* idx, const int kx, const int ky) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= kx) { + return; + } + if (idx[i] <= -0.0f) { + dst[i] = 0; + return; + } + dst[i] = x[i] + y[i%ky]; +} + static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -556,6 +571,15 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) { dst[i] = x[i] / (1.0f + expf(-x[i])); } +static __global__ void sigmoid_f32(const float * x, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = 1 / (1.0f + expf(-x[i])); +} + static __global__ void relu_f32(const float * x, float * dst, const int k) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -4379,6 +4403,425 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons } } + +template +static __global__ void dequantize_mul_mat_axpy(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) { + // qk = quantized weights per x block + // qr = number of quantized weights per data value in x block + const int row = blockIdx.y*blockDim.y + threadIdx.y; + + if (row >= nrows) { + return; + } + const int bid = blockIdx.y; + // if (bid == 0) global_lock = 0; + + extern __shared__ float shared_dst[]; // TODO:dynamic + + const int tid = threadIdx.x; + + const int iter_stride = 2*GGML_CUDA_DMMV_X; + const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter + const int y_offset = qr == 1 ? 
1 : qk/2; + +// partial sum for each thread + float tmp = 0.0f; + for (int i = 0; i < ncols; i += GGML_CUDA_DMMV_X) { + shared_dst[i+tid] = 0; + } + __syncthreads(); + + for (int i = 0; i < ncols; i += iter_stride) { + const int col = i + vals_per_iter*tid; + const int ib = (row*ncols + col)/qk; // x block index + const int iqs = (col%qk)/qr; // x quant index + const int iybs = col - col%qk; // y block start index + +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) { + // process 2 vals per j iter + + // dequantize + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel(vx, ib, iqs + j/qr, v); + + // matrix multiplication + // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 + tmp = v.x * y[row]; + /* atomicAdd((float *)&dst[iybs + iqs + j/qr + 0], tmp); */ + shared_dst[iybs + iqs + j/qr + 0] = tmp; + tmp = v.y * y[row]; + /* atomicAdd((float *)&dst[iybs + iqs + j/qr + y_offset], tmp); */ + shared_dst[iybs + iqs + j/qr + y_offset] = tmp; + } + } + /* __syncthreads(); */ + + for (int i = 0; i < ncols; i += GGML_CUDA_DMMV_X) { + // dst[i+tid] += shared_dst[i+tid]; + atomicAdd(&dst[i+tid], shared_dst[i+tid]); + } +} + +template +static __global__ void dequantize_mul_mat_axpy_sparse(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, int *lst, float *idx) { + // qk = quantized weights per x block + // qr = number of quantized weights per data value in x block + const int row = blockIdx.y*blockDim.y + threadIdx.y; + + if (row >= nrows) { + return; + } + int id = lst[row]; + const int tid = threadIdx.x; + short *d = (short *)((char *)vx + ncols * row * 2); + // if (tid == 0) { + // for(int i = 0; i < 4096; i++) + // printf("%d ", *(d+i)); + // printf("row in gpu %d cols %d, value %d %d %d\n", id, ncols, *d, *(d+1), *(d+4095)); + // } + // int id = row; + if (idx[id] < 0.0f) { + return; + } + + const int bid = blockIdx.y; + // if (bid == 0) global_lock = 0; + + extern __shared__ float shared_dst[]; // TODO:dynamic + + // if (tid == 0) + // printf("row %d y[row] %f\n", row, y[row]); + + const int iter_stride = 2*GGML_CUDA_DMMV_X; + const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter + const int y_offset = qr == 1 ? 
1 : qk/2; + +// partial sum for each thread + float tmp = 0.0f; + for (int i = 0; i < ncols; i += GGML_CUDA_DMMV_X) { + shared_dst[i+tid] = 0; + } + __syncthreads(); + + for (int i = 0; i < ncols; i += iter_stride) { + const int col = i + vals_per_iter*tid; + const int ib = (row*ncols + col)/qk; // x block index + const int iqs = (col%qk)/qr; // x quant index + const int iybs = col - col%qk; // y block start index + +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) { + // process 2 vals per j iter + + // dequantize + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel(vx, ib, iqs + j/qr, v); + + // matrix multiplication + // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 + tmp = v.x * y[id]; + /* atomicAdd((float *)&dst[iybs + iqs + j/qr + 0], tmp); */ + shared_dst[iybs + iqs + j/qr + 0] = tmp; + tmp = v.y * y[id]; + /* atomicAdd((float *)&dst[iybs + iqs + j/qr + y_offset], tmp); */ + shared_dst[iybs + iqs + j/qr + y_offset] = tmp; + + } + } + __syncthreads(); + + for (int i = 0; i < ncols; i += GGML_CUDA_DMMV_X) { + // dst[i+tid] += shared_dst[i+tid]; + atomicAdd(&dst[i+tid], shared_dst[i+tid]); + // printf("%f", dst[i+tid]); + } +} + +template +static __global__ void dequantize_mul_mat_axpy_sparse_batch(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, int src1_ne0, int src1_ncols, int *lst, float *idx) { + // qk = quantized weights per x block + // qr = number of quantized weights per data value in x block + const int row = blockIdx.y*blockDim.y + threadIdx.y; + + if (row >= nrows) { + return; + } + int id = lst[row]; + // int id = row; + // if (idx[id] < 0.0f) { + // return; + // } + const int bid = blockIdx.y; + // if (bid == 0) global_lock = 0; + + extern __shared__ float shared_dst[]; // TODO:dynamic + + const int tid = threadIdx.x; + + const int iter_stride = 2*GGML_CUDA_DMMV_X; + const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter + const int y_offset = qr == 1 ? 
1 : qk/2; + float * loop_idx = idx;; + dfloat * loop_y = (dfloat *)y; + float * loop_dst = dst; + +// partial sum for each thread + float tmp = 0.0f; + for (int i = 0; i < ncols; i += GGML_CUDA_DMMV_X) { + shared_dst[i+tid] = 0; + } + // __syncthreads(); + for (int col_id = 0; col_id < src1_ncols; col_id++) { + __syncthreads(); + if (loop_idx[id] < 0.0f) { + loop_dst += ncols; + loop_idx += src1_ne0; + loop_y += src1_ne0; + continue; + } + + + for (int i = 0; i < ncols; i += iter_stride) + { + const int col = i + vals_per_iter * tid; + const int ib = (row * ncols + col) / qk; // x block index + const int iqs = (col % qk) / qr; // x quant index + const int iybs = col - col % qk; // y block start index + +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) + { + // process 2 vals per j iter + + // dequantize + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel(vx, ib, iqs + j / qr, v); + + // matrix multiplication + // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 + tmp = v.x * loop_y[id]; + /* atomicAdd((float *)&dst[iybs + iqs + j/qr + 0], tmp); */ + shared_dst[iybs + iqs + j / qr + 0] = tmp; + tmp = v.y * loop_y[id]; + /* atomicAdd((float *)&dst[iybs + iqs + j/qr + y_offset], tmp); */ + shared_dst[iybs + iqs + j / qr + y_offset] = tmp; + } + } + /* __syncthreads(); */ + + for (int i = 0; i < ncols; i += GGML_CUDA_DMMV_X) + { + // dst[i+tid] += shared_dst[i+tid]; + atomicAdd(&loop_dst[i + tid], shared_dst[i + tid]); + shared_dst[i+tid] = 0; + } + loop_dst += ncols; + loop_idx += src1_ne0; + loop_y += src1_ne0; + // printf("cols %d rows %d\n", ncols, nrows); + + } +} + +template +static __global__ void dequantize_axpy_sparse(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, int * lst, float * idx) { + // qk = quantized weights per x block + // qr = number of quantized weights per data value in x block + const int row = blockIdx.y*blockDim.y + threadIdx.y; + + if (row >= nrows) { + return; + } + int id = lst[row]; + if (idx[id] < 0.0f) { + return; + } + + const int tid = threadIdx.x; + + const int iter_stride = 2*GGML_CUDA_DMMV_X; + const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter + const int y_offset = qr == 1 ? 
1 : qk/2; + +// partial sum for each thread + float tmp = 0.0f; + + for (int i = 0; i < ncols; i += iter_stride) { + const int col = i + vals_per_iter*tid; + const int ib = (row*ncols + col)/qk; // x block index + const int iqs = (col%qk)/qr; // x quant index + const int iybs = col - col%qk; // y block start index + +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) { + // process 2 vals per j iter + + // dequantize + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel(vx, ib, iqs + j/qr, v); + + // matrix multiplication + // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 + tmp = v.x * y[iybs + iqs + j/qr + 0]; + atomicAdd((float *)&dst[iybs + iqs + j/qr + 0], tmp); + tmp = v.y * y[iybs + iqs + j/qr + y_offset]; + atomicAdd((float *)&dst[iybs + iqs + j/qr + y_offset], tmp); + } + } + +} +template +static __global__ void dequantize_mul_mat_vec_sparse(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, int * lst, float * idx) { + // qk = quantized weights per x block + // qr = number of quantized weights per data value in x block + const int row = blockIdx.y*blockDim.y + threadIdx.y; + + if (row >= nrows) { + return; + } + int id = lst[row]; + // int id = row; + if (idx[id] < 0.0f) { + return; + } + + const int tid = threadIdx.x; + + const int iter_stride = 2*GGML_CUDA_DMMV_X; + const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter + const int y_offset = qr == 1 ? 1 : qk/2; + +// partial sum for each thread +#ifdef GGML_CUDA_F16 + half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics +#else + float tmp = 0.0f; +#endif // GGML_CUDA_F16 + + for (int i = 0; i < ncols; i += iter_stride) { + const int col = i + vals_per_iter*tid; + const int ib = (row*ncols + col)/qk; // x block index + const int iqs = (col%qk)/qr; // x quant index + const int iybs = col - col%qk; // y block start index + +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) { + // process 2 vals per j iter + + // dequantize + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel(vx, ib, iqs + j/qr, v); + + // matrix multiplication + // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 +// #ifdef GGML_CUDA_F16 +// tmp += __hmul2(v, { +// y[iybs + iqs + j/qr + 0], +// y[iybs + iqs + j/qr + y_offset] +// }); +// #else + tmp += v.x * y[iybs + iqs + j/qr + 0]; + tmp += v.y * y[iybs + iqs + j/qr + y_offset]; +// #endif + } + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { +#ifdef GGML_CUDA_F16 + dst[row] = tmp.x + tmp.y; +#else + // dst[row] = tmp; + dst[id] = tmp; +#endif // GGML_CUDA_F16 + } +} + +template +static __global__ void dequantize_mul_mat_batch_sparse(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, int src1_cols, int dst_ne0,int * lst, float * idx) { + // qk = quantized weights per x block + // qr = number of quantized weights per data value in x block + const int row = blockIdx.y*blockDim.y + threadIdx.y; + + if (row >= nrows) { + return; 
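// dequantize_mul_mat_batch_sparse: lst maps the rows stored on this GPU split back to their
// row ids in the full weight matrix, and idx holds one predictor activation per output neuron
// for each of the src1_cols input columns. Columns whose predictor value for this neuron is
// negative are skipped; otherwise each thread accumulates its share of the dequantized dot
// product and atomically adds it into dst at the original row id, advancing dst/idx by dst_ne0
// and y by ncols per column.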
+ } + int id = lst[row]; + + + const int tid = threadIdx.x; + + const int iter_stride = 2*GGML_CUDA_DMMV_X; + const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter + const int y_offset = qr == 1 ? 1 : qk/2; + float * loop_idx = idx;; + dfloat * loop_y = (dfloat *)y; + float * loop_dst = dst; + + + + float tmp = 0.0f; + + for (int col_id = 0; col_id < src1_cols; col_id++) + { + __syncthreads(); + tmp = 0.0f; + if (loop_idx[id] < 0.0f) + { + loop_dst += dst_ne0; + loop_idx += dst_ne0; + loop_y += ncols; + continue; + } + + for (int i = 0; i < ncols; i += iter_stride) + { + const int col = i + vals_per_iter * tid; + const int ib = (row * ncols + col) / qk; // x block index + const int iqs = (col % qk) / qr; // x quant index + const int iybs = col - col % qk; // y block start index + +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) + { + // process 2 vals per j iter + + // dequantize + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel(vx, ib, iqs + j / qr, v); + + // matrix multiplication + + tmp += v.x * loop_y[iybs + iqs + j / qr + 0]; + tmp += v.y * loop_y[iybs + iqs + j / qr + y_offset]; + // #endif + } + } + atomicAdd(&loop_dst[id], tmp); + loop_dst += dst_ne0; + loop_idx += dst_ne0; + loop_y += ncols; + } +} + static __global__ void mul_mat_p021_f16_f32( const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) { @@ -4776,6 +5219,11 @@ static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const k_get_rows<<>>(x, y, dst, ncols); } +static void add_idx_f32_cuda(const float * x, const float * y, float * dst, float * idx, const int kx, const int ky, cudaStream_t stream) { + const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; + add_f32_idx<<>>(x, y, dst, idx, kx, ky); +} + static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) { const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; add_f32<<>>(x, y, dst, kx, ky); @@ -4930,6 +5378,25 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, dequantize_mul_mat_vec <<>>(vx, y, dst, ncols, nrows); } +static void dequantize_mul_mat_vec_q4_0_cuda_sparse(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec_sparse + <<>>(vx, y, dst, ncols, nrows, lst, idx); +} +static void dequantize_mul_mat_batch_q4_0_cuda_sparse(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_ncols, int dst_ne0, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + // printf("ncols %d, nrows %d, src1_ncols %d, dst_ne0 %d\n", ncols, nrows, src1_ncols, dst_ne0); + + dequantize_mul_mat_batch_sparse + <<>>(vx, y, dst, ncols, nrows, src1_ncols, dst_ne0, lst, idx); + +} static 
void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); @@ -5117,6 +5584,79 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa dequantize_mul_mat_vec<1, 1, convert_f16> <<>>(vx, y, dst, ncols, nrows); } +static void convert_mul_mat_vec_f16_cuda_sparse(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + + dequantize_mul_mat_vec_sparse<1, 1, convert_f16> + <<>>(vx, y, dst, ncols, nrows, lst, idx); + +} +static void convert_mul_mat_batch_f16_cuda_sparse(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_ncols, int dst_ne0, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + + dequantize_mul_mat_batch_sparse<1, 1, convert_f16> + <<>>(vx, y, dst, ncols, nrows, src1_ncols, dst_ne0, lst, idx); + +} + +static void dequantize_axpy_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_axpy + <<>>(vx, y, dst, ncols, nrows); +} +static void dequantize_axpy_sparse_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + // dequantize_mul_mat_axpy + // <<>>(vx, y, dst, ncols, nrows); + dequantize_mul_mat_axpy_sparse + <<>>(vx, y, dst, ncols, nrows, lst, idx); +} + +static void dequantize_axpy_sparse_batch_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_rows, int src1_ncols, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_axpy_sparse_batch + <<>>(vx, y, dst, ncols, nrows, src1_rows, src1_ncols, lst, idx); +} +static void convert_axpy_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_axpy<1, 1, convert_f16> + <<>>(vx, y, dst, ncols, nrows); +} + +static void convert_axpy_sparse_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream, int *lst, float 
*idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_axpy_sparse<1, 1, convert_f16> + <<>>(vx, y, dst, ncols, nrows, lst, idx); +} +static void convert_axpy_sparse_batch_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, int src1_rows, int src1_ncols, cudaStream_t stream, int *lst, float *idx) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_axpy_sparse_batch<1, 1, convert_f16> + <<>>(vx, y, dst, ncols, nrows, src1_rows, src1_ncols, lst, idx); +} static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { switch (type) { @@ -6136,6 +6676,57 @@ static void ggml_cuda_op_get_rows( } } +static cudaError_t ggml_cuda_cpy_tensor_1d( + void * dst, const struct ggml_tensor * src, int64_t i1_low, int64_t i1_high, cudaStream_t stream) { + cudaMemcpyKind kind; + char * src_ptr; + if (src->backend == GGML_BACKEND_CPU) { + kind = cudaMemcpyHostToDevice; + src_ptr = (char *) src->data; + } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) { + GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); + kind = cudaMemcpyDeviceToDevice; + struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + src_ptr = (char *) extra->data_device[id]; + } else { + GGML_ASSERT(false); + } + + char * dst_ptr = (char *) dst; + + const int64_t ne0 = src->ne[0]; + const int64_t nb0 = src->nb[0]; + const int64_t blck = ggml_blck_size(src->type); + + const enum ggml_type type = src->type; + const int64_t ts = ggml_type_size(type); + const int64_t bs = ggml_blck_size(type); + int64_t i1_diff = i1_high - i1_low; + + const char * x = src_ptr + i1_low*nb0/blck; + return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb0/blck, kind, stream); +} + +void ggml_cuda_cpy_1d(struct ggml_tensor * dst, const struct ggml_tensor * src) { + ggml_cuda_set_device(g_main_device); + const cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + // TODO: only supports CPU -> GPU as of now + GGML_ASSERT(src->backend == GGML_BACKEND_CPU && dst->backend == GGML_BACKEND_GPU); + struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + + CUDA_CHECK(ggml_cuda_cpy_tensor_1d(dst_extra->data_device[0], src, 0, src->ne[0], main_stream)); +} + +void ** ggml_cuda_get_data_pp(struct ggml_tensor * tensor) { + // only supports one device for now + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + return &extra->data_device[0]; +} + inline void ggml_cuda_op_add( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { @@ -6146,7 +6737,13 @@ inline void ggml_cuda_op_add( const int64_t ne11 = src1->ne[1]; if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream); + ggml_tensor * src2 = dst->src[2]; + if (src2 == NULL) + add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, 
main_stream); + else { + float *idx = (src2->backend == GGML_BACKEND_GPU) ? (float *)((ggml_tensor_extra_gpu *)(src2->extra))->data_device[0] : (float *)src2->data; + add_idx_f32_cuda(src0_dd, src1_dd, dst_dd, idx, ggml_nelements(src0), ne10*ne11, main_stream); + } } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream); } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { @@ -6289,10 +6886,27 @@ inline void ggml_cuda_op_mul_mat_q( // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; + struct ggml_tensor_extra_gpu *idx_extra = NULL; + struct ggml_tensor_extra_gpu *dst_extra = NULL; + if (dst->src[2] != NULL) { + idx_extra = (ggml_tensor_extra_gpu *)dst->src[2]->extra; + // dst_extra = (ggml_tensor_extra_gpu *) dst->src[3]->extra; + } + switch (src0->type) { case GGML_TYPE_Q4_0: - ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + if (dst->src[2] == NULL) + ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + else { + if (dst->src[3] != NULL) { + dst_extra = (ggml_tensor_extra_gpu *) dst->src[3]->extra; + cudaMemsetAsync((void *)dst_dd_i, 0, ggml_nbytes(dst), stream); + // cudaMemset((void *)dst_dd_i, 0, ggml_nbytes(dst)); + // convert_mul_mat_vec_f16_cuda_sparse(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream, (int *)dst_extra->data_device[0], (float *)idx_extra->data_device[0]); + dequantize_mul_mat_batch_q4_0_cuda_sparse(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, src1_ncols, dst->ne[0], stream, (int *)dst_extra->data_device[0], (float *)dst->src[2]->data); + } + } break; case GGML_TYPE_Q4_1: ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); @@ -6321,6 +6935,13 @@ inline void ggml_cuda_op_mul_mat_q( case GGML_TYPE_Q6_K: ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; + case GGML_TYPE_F16: + // printf("src0_nbytes %ld, src1_nbytes %ld, dst_nbytes %ld, idx_nbytes %ld\n", src0->nb[1], src1->nb[1], dst->nb[1], dst->src[2]->nb[1]); + dst_extra = (ggml_tensor_extra_gpu *) dst->src[3]->extra; + cudaMemsetAsync(dst_dd_i, 0, ggml_nbytes(dst), stream); + // printf("ne0 : src1 %d, dst %d, idx %d ne1: src1 %d\n", src1->ne[0], dst->ne[0], dst->src[2]->ne[0], src1->ne[1]); + convert_mul_mat_batch_f16_cuda_sparse(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, src1_ncols, dst->ne[0], stream, (int *)dst_extra->data_device[0], (float *)dst->src[2]->data); + break; default: GGML_ASSERT(false); break; @@ -6390,17 +7011,62 @@ static int64_t get_row_rounding(ggml_type type) { #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) } +__global__ void copyKernel(float* dst, float* src, int len, int* flag) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < len ) { + dst[id] = src[flag[id]]; + } +} + inline void ggml_cuda_op_mul_mat_vec_q( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, 
const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, const cudaStream_t & stream) { const int64_t ne00 = src0->ne[0]; + const int64_t ne10 = src1->ne[1]; const int64_t row_diff = row_high - row_low; + struct ggml_tensor_extra_gpu *idx_extra = NULL; + struct ggml_tensor_extra_gpu *dst_extra = NULL; switch (src0->type) { case GGML_TYPE_Q4_0: - mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + if (dst->src[2] == NULL) + mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + else { + if (dst->src[3] != NULL) { + dst_extra = (ggml_tensor_extra_gpu *) dst->src[3]->extra; + cudaMemsetAsync((void *)dst_dd_i, 0, ggml_nbytes(dst), stream); + // cudaMemset((void *)dst_dd_i, 0, ggml_nbytes(dst)); + // convert_mul_mat_vec_f16_cuda_sparse(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream, (int *)dst_extra->data_device[0], (float *)idx_extra->data_device[0]); + dequantize_mul_mat_vec_q4_0_cuda_sparse(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream, (int *)dst_extra->data_device[0], (float *)dst->src[2]->data); + } + else { + int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1; + padded_row_size -= padded_row_size % MATRIX_ROW_PADDING; + // cudaStream_t cudaStream_main = g_cudaStreams[g_main_device][0]; + dst_extra = (ggml_tensor_extra_gpu *) dst->src[2]->extra; + int *data = (int *)dst_extra->data_device[0]; + int len = src0->ne[0]; + int blockSize = 32; + int numBlocks = (len + blockSize - 1) / blockSize; + float *devDst = NULL; + // cudaMalloc(&devDst, sizeof(float) * len); + size_t sz = 0; + size_t as = 0; + devDst = (float *)ggml_cuda_pool_malloc(sizeof(float) * len, &sz); + // cudaMemsetAsync((void *)devDst, 0, sz, stream); + copyKernel<<>>(devDst, (float *)src1_ddf_i, len, data); + void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as); + quantize_row_q8_1_cuda(devDst, src1_q8_1, ne00, ne10, padded_row_size, stream); + mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_q8_1, dst_dd_i, ne00, row_diff, stream); + // printf("here\n"); + // dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, devDst, dst_dd_i, ne00, row_diff, stream); + ggml_cuda_pool_free(devDst, sz); + + ggml_cuda_pool_free(src1_q8_1, as); + } + } break; case GGML_TYPE_Q4_1: mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); @@ -6467,7 +7133,12 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( #else const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion #endif // GGML_CUDA_F16 - + struct ggml_tensor_extra_gpu *idx_extra = NULL; + struct ggml_tensor_extra_gpu *dst_extra = NULL; + if (dst->src[2] != NULL) { + idx_extra = (ggml_tensor_extra_gpu *)dst->src[2]->extra; + // dst_extra = (ggml_tensor_extra_gpu *) dst->src[3]->extra; + } switch (src0->type) { case GGML_TYPE_Q4_0: dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); @@ -6500,7 +7171,476 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); break; case GGML_TYPE_F16: - convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + if (dst->src[2] == NULL) + convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + else + { + if (dst->src[3] != NULL) { + dst_extra = (ggml_tensor_extra_gpu *) dst->src[3]->extra; + cudaMemsetAsync((void *)dst_dd_i, 0, ggml_nbytes(dst), 
stream); + convert_mul_mat_vec_f16_cuda_sparse(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream, (int *)dst_extra->data_device[0], (float *)dst->src[2]->data); + } else { + dst_extra = (ggml_tensor_extra_gpu *) dst->src[2]->extra; + int *data = (int *)dst_extra->data_device[0]; + int len = src0->ne[0]; + int blockSize = 32; + int numBlocks = (len + blockSize - 1) / blockSize; + float *devDst = NULL; + size_t sz = 0; + devDst = (float *)ggml_cuda_pool_malloc(sizeof(float) * len, &sz); + copyKernel<<>>(devDst, (float *)src1_ddf_i, len, data); + convert_mul_mat_vec_f16_cuda(src0_dd_i, devDst, dst_dd_i, ne00, row_diff, stream); + ggml_cuda_pool_free(devDst, sz); + } + } + break; + default: + GGML_ASSERT(false); + break; + } + +#ifdef GGML_CUDA_F16 + if (src1_convert_f16) { + ggml_cuda_pool_free(src1_dfloat, ash); + } +#endif // GGML_CUDA_F16 + + (void) src1; + (void) dst; + (void) src1_ddq_i; + (void) src1_ncols; + (void) src1_padded_row_size; +} + + +__global__ void compute_positions(float *idx, int *positions, int size, int *positions_out) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= size) return; + if (i < size) { + positions[i] = idx[i] > 0.5 ? 1 : 0; + } + __syncthreads(); + + for (int stride = 1; stride < size; stride *= 2) { + int temp = 0; + if (i >= stride) temp = positions[i - stride]; + __syncthreads(); //Do we really need this sync? + positions[i] += temp; + __syncthreads(); + } + if(i == 0) + positions_out[0] = positions[size - 1]; +} + +#define BLOCK_SIZE 32 + +// still cublas spec N refers to rows, M refers to cols origin +// N 4096 M 16384 +__global__ inline void transpose_cont(float *A, float *B, int N, int M, int stride_0, int strideA_1, int strideB_1, float *idx) { + int row = blockIdx.x; + if (row >= N) return; + int copy_iter = (M + BLOCK_SIZE - 1) / BLOCK_SIZE; + copy_iter = M; + int tid = threadIdx.x; + // Loop over the A and B matrices in blocks of BLOCK_SIZE + int offset = row * strideB_1; + for (int i = tid; i < copy_iter; i+=blockDim.x) { + // int load_idx = i * BLOCK_SIZE + tid; + int load_idx = i; + // Load elements into shared memory + if (load_idx < M) { + B[offset + load_idx] = A[row + load_idx * strideA_1]; // 考虑到了A是转置矩阵 + } else { + B[offset + load_idx] = 0.0f; + } + } +} +__global__ void markRows(float *X, int *marks, int rows) { + //TODO :idx need to bucket + int idx = threadIdx.x + blockDim.x * blockIdx.x; + int predict_idx = idx; + if (idx < rows) { + marks[idx] = (X[predict_idx] >= 0) ? 
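The compute_positions kernel above is a Hillis–Steele style inclusive scan over the thresholded activation mask, so it only yields a correct result when the whole array is handled by a single block (the __syncthreads() calls do not synchronize across blocks). A serial sketch of what it produces, under that single-block assumption:

    // Serial reference for compute_positions: inclusive prefix sum of the 0/1 mask
    // (idx[i] > 0.5f); the total number of active entries lands in positions_out[0].
    static void compute_positions_ref(const float *idx, int *positions, int size, int *positions_out) {
        int running = 0;
        for (int i = 0; i < size; ++i) {
            running += (idx[i] > 0.5f) ? 1 : 0;   // mark ...
            positions[i] = running;               // ... then scan, as the kernel does in place
        }
        positions_out[0] = (size > 0) ? positions[size - 1] : 0;
    }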
1 : 0; + } + else { + marks[idx] = 0; + return; + } + + +} + +__global__ void markRowsPosition(int *input, int *output, int rows, int *cols) { + //TODO :idx need to bucket + int idx = threadIdx.x + blockDim.x * blockIdx.x; + int predict_idx = idx; + if (idx >= rows) return; + + int32_t sum = 0; + for (size_t i = 0; i <= idx; ++i) + { + sum += input[i]; + } + output[idx] = sum; + if (idx != rows -1) return; + else { + *cols = sum; + } + +} + +// rows for A +__global__ void copyRows(float *A, float *B, int *prefixSum, int rows, int cols) { + int row = blockIdx.x; + if (row >= rows) return; + int copy_iter = cols; + int tid = threadIdx.x; + // Loop over the A and B matrices in blocks of BLOCK_SIZE + if (prefixSum[row] == 0 || prefixSum[row] == prefixSum[row - 1]) return; + int offset = (prefixSum[row]-1) * cols; + int origin_offset = row * cols; + for (int i = tid; i < copy_iter; i+= blockDim.x) { + int load_idx = i; + if (load_idx < cols) { + B[offset + load_idx] = A[origin_offset + load_idx]; + } else { + B[offset + load_idx] = 0.0f; + } + } +} +__global__ void copyColumns(float *A, float *B, int *prefixSum, int rows, int cols, int new_cols) { + int row = blockIdx.x; + if (row >= rows) return; + int copy_iter = cols; + int tid = threadIdx.x; + // Loop over the A and B matrices in blocks of BLOCK_SIZE + int offset = row * new_cols; + int origin_offset = row * cols; + for (int i = tid; i < copy_iter; i+= blockDim.x) { + int load_idx = i; + if (prefixSum[i] == 0 || prefixSum[i] == prefixSum[i - 1]) continue; + int new_position = prefixSum[i] - 1; + // Load elements into shared memory + if (load_idx < cols) { + B[offset + new_position] = A[origin_offset + load_idx]; + } + } +} + +inline void ggml_cuda_op_mul_mat_transpose_select_gemm( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream) { + + GGML_ASSERT(src0_dd_i != nullptr); + GGML_ASSERT(src1_ddf_i != nullptr); + GGML_ASSERT(dst_dd_i != nullptr); + + const float alpha = 1.0f; + const float beta = 0.0f; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t row_diff = row_high - row_low; + + float * src0_ddq_as_f32; + size_t src0_as = 0; + + if (src0->type != GGML_TYPE_F32) { + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); + src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT + to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream); + } + const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32; + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + + // the main device has a larger memory buffer to hold the results from all GPUs + // ldc == nrows of the matrix that cuBLAS writes into + int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? 
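markRowsPosition computes the same inclusive prefix sum, but serially per thread (thread idx walks input[0..idx], so the total work is quadratic in rows), and copyRows/copyColumns then use prefixSum[i] - 1 as the packed destination slot for each selected row or column. A CPU sketch of the row compaction; the explicit r > 0 guard is spelled out here because the selection test otherwise compares against prefixSum[r - 1]:

    #include <string.h>

    // CPU reference for the compaction done by markRowsPosition + copyRows:
    // row r of A is kept iff it raised the prefix sum, and kept rows are packed
    // densely into B in their original order.
    static void compact_rows_ref(const float *A, float *B, const int *prefixSum, int rows, int cols) {
        for (int r = 0; r < rows; ++r) {
            const int kept = (r == 0) ? (prefixSum[0] != 0)
                                      : (prefixSum[r] != prefixSum[r - 1]);
            if (!kept) continue;
            memcpy(&B[(size_t)(prefixSum[r] - 1) * cols],
                   &A[(size_t)r * cols],
                   (size_t)cols * sizeof(float));
        }
    }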
ne0 : row_diff; + ldc = ne0; + size_t src0_as_t = 0; + float *transpose = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as_t); // NOLINT + int blockSize = 32; + int numBlocks = ne00; + transpose_cont<<< numBlocks, blockSize, 0, stream>>>((float *)src0_ddf_i, transpose, ne00, ne01, 1, ne00, ne01,NULL); + + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream)); + CUBLAS_CHECK( + cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + ne00, src1_ncols, ne10, + &alpha, transpose, ne01, + src1_ddf_i, ne10, + &beta, dst_dd_i, ldc)); + + + if (src0_as > 0) { + ggml_cuda_pool_free(src0_ddq_as_f32, src0_as); + ggml_cuda_pool_free(transpose, src0_as_t); + } + + (void) dst; + (void) src1_ddq_i; + (void) src1_padded_row_size; +} +inline void ggml_cuda_op_mul_mat_transpose_gemm( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream) { + + GGML_ASSERT(src0_dd_i != nullptr); + GGML_ASSERT(src1_ddf_i != nullptr); + GGML_ASSERT(dst_dd_i != nullptr); + + const float alpha = 1.0f; + const float beta = 0.0f; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t row_diff = row_high - row_low; + + float * src0_ddq_as_f32; + size_t src0_as = 0; + + if (src0->type != GGML_TYPE_F32) { + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); + src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT + to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream); + } + const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32; + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + + // the main device has a larger memory buffer to hold the results from all GPUs + // ldc == nrows of the matrix that cuBLAS writes into + int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? 
ne0 : row_diff; + ldc = ne0; + // size_t src0_as_t = 0; + // float *transpose = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as_t); // NOLINT + // int blockSize = 32; + // int numBlocks = ne00; + // transpose_cont<<< numBlocks, blockSize, 0, stream>>>((float *)src0_ddf_i, transpose, ne00, ne01, 1, ne00, ne01,NULL); + + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream)); + // CUBLAS_CHECK( + // cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + // ne00, src1_ncols, ne10, + // &alpha, transpose, ne01, + // src1_ddf_i, ne10, + // &beta, dst_dd_i, ldc)); + + CUBLAS_CHECK( + cublasSgemm(g_cublas_handles[id], CUBLAS_OP_N, CUBLAS_OP_N, + ne00, src1_ncols, ne10, + &alpha, src0_ddf_i, ne00, + src1_ddf_i, ne10, + &beta, dst_dd_i, ldc)); + + + if (src0_as > 0) { + ggml_cuda_pool_free(src0_ddq_as_f32, src0_as); + // ggml_cuda_pool_free(transpose, src0_as_t); + } + + (void) dst; + (void) src1_ddq_i; + (void) src1_padded_row_size; +} +inline void ggml_cuda_op_dequantize_axpy_vec( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream) { + static int *cols = (int *)ggml_cuda_host_malloc(sizeof(int)); + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics +#ifdef GGML_CUDA_F16 + size_t ash; + dfloat * src1_dfloat = nullptr; // dfloat == half + + bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || + src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || + src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; + + if (src1_convert_f16) { + src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash); + ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00, + ne00, 1, sizeof(float), 0, 0, + ne00, 1, sizeof(half), 0, 0, stream); + } +#else + const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion +#endif // GGML_CUDA_F16 + struct ggml_tensor_extra_gpu *idx_extra = NULL; + struct ggml_tensor_extra_gpu *dst_extra = NULL; + if (dst->src[2] != NULL) { + idx_extra = (ggml_tensor_extra_gpu *)dst->src[2]->extra; + // dst_extra = (ggml_tensor_extra_gpu *) dst->src[3]->extra; + } + switch (src0->type) { + case GGML_TYPE_Q4_0: + if (dst->src[2] == NULL) { + cudaMemsetAsync((void *)dst_dd_i, 0, ggml_nbytes(dst), stream); + dequantize_axpy_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + } + else { + dst_extra = (ggml_tensor_extra_gpu *) dst->src[3]->extra; + cudaMemsetAsync((void *)dst_dd_i, 0, ggml_nbytes(dst), stream); + dequantize_axpy_sparse_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream, (int *)dst_extra->data_device[0], (float *)dst->src[2]->data); + } + // dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_1: + dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case 
GGML_TYPE_Q8_0: + dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_F16: + if (dst->src[2] == NULL) { + cudaMemsetAsync((void *)dst_dd_i, 0, ggml_nbytes(dst), stream); + convert_axpy_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + } else { + dst_extra = (ggml_tensor_extra_gpu *) dst->src[3]->extra; + cudaMemsetAsync((void *)dst_dd_i, 0, ggml_nbytes(dst), stream); + convert_axpy_sparse_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream, (int *)dst_extra->data_device[0], (float *)dst->src[2]->data); + } + break; + default: + GGML_ASSERT(false); + break; + } + +#ifdef GGML_CUDA_F16 + if (src1_convert_f16) { + ggml_cuda_pool_free(src1_dfloat, ash); + } +#endif // GGML_CUDA_F16 + + (void) src1; + (void) dst; + (void) src1_ddq_i; + (void) src1_ncols; + (void) src1_padded_row_size; +} +inline void ggml_cuda_op_dequantize_axpy_batch( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream) { + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics +#ifdef GGML_CUDA_F16 + size_t ash; + dfloat * src1_dfloat = nullptr; // dfloat == half + + bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || + src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || + src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; + + if (src1_convert_f16) { + src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash); + ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00, + ne00, 1, sizeof(float), 0, 0, + ne00, 1, sizeof(half), 0, 0, stream); + } +#else + const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion +#endif // GGML_CUDA_F16 + struct ggml_tensor_extra_gpu *idx_extra = NULL; + struct ggml_tensor_extra_gpu *dst_extra = NULL; + if (dst->src[2] != NULL) { + idx_extra = (ggml_tensor_extra_gpu *)dst->src[2]->extra; + } + switch (src0->type) { + case GGML_TYPE_Q4_0: + if (dst->src[2] == NULL) { + cudaMemsetAsync((void *)dst_dd_i, 0, ggml_nbytes(dst), stream); + convert_axpy_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + } + else + { + dst_extra = (ggml_tensor_extra_gpu *) dst->src[3]->extra; + cudaMemsetAsync((void *)dst_dd_i, 0, ggml_nbytes(dst), stream); + int idx_ne = src1->ne[0]; + dequantize_axpy_sparse_batch_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, idx_ne, src1_ncols, stream, (int *)dst_extra->data_device[0], (float *)dst->src[2]->data); + } + 
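For orientation, the dequantize_axpy paths above compute the same mat-vec product as the mul_mat_vec kernels but formulated as accumulation: dst is cleared first, then every active row of src0 is scaled by its src1 element and added into dst, with dst->src[2] carrying the predictor activations and dst->src[3] the GPU neuron index (the CPU implementations later in this patch make the same formulation explicit). A dense scalar sketch, with quantization and the GPU index ignored:

    // Scalar picture of the axpy-style mat-vec: each stored row r of the weight
    // tensor contributes x[r] * row_r to dst, so a row whose predictor score is
    // negative is skipped wholesale instead of being partially computed.
    static void axpy_matvec_ref(const float *W /* nrows x ne00, row-major */,
                                const float *x /* nrows */,
                                const float *predictor /* nrows, may be NULL */,
                                float *dst /* ne00 */, int nrows, int ne00) {
        for (int j = 0; j < ne00; ++j) dst[j] = 0.0f;
        for (int r = 0; r < nrows; ++r) {
            if (predictor && predictor[r] < 0.0f) continue;  // predicted inactive
            const float a = x[r];
            if (a == 0.0f) continue;
            for (int j = 0; j < ne00; ++j) dst[j] += a * W[(size_t)r * ne00 + j];
        }
    }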
dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_1: + dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_F16: + if (dst->src[2] == NULL) { + cudaMemsetAsync((void *)dst_dd_i, 0, ggml_nbytes(dst), stream); + convert_axpy_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + } + else + { + dst_extra = (ggml_tensor_extra_gpu *) dst->src[3]->extra; + cudaMemsetAsync((void *)dst_dd_i, 0, ggml_nbytes(dst), stream); + int idx_ne = src1->ne[0]; + convert_axpy_sparse_batch_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, idx_ne, src1_ncols, stream, (int *)dst_extra->data_device[0], (float *)dst->src[2]->data); + } break; default: GGML_ASSERT(false); @@ -7322,9 +8462,16 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te if (!g_cublas_loaded) return false; const int64_t ne10 = src1->ne[0]; + const int64_t ne01 = src0->ne[1]; const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; + if (dst->src[2] != NULL){ + return false; + } + else { + dst->src[2] = NULL; + } // TODO: find the optimal values for these return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && @@ -7626,9 +8773,10 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } } else { bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); - + // when tensor cores are available, use them for large batch size // ref: https://github.com/ggerganov/llama.cpp/pull/3776 + // TODO: disble tensor core for now if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) { use_mul_mat_q = false; } @@ -7636,7 +8784,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 if (use_mul_mat_q) { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); } else { - ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); + if ( (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_Q4_0) && dst->src[2] != NULL) + // Since ggml_cuda_op_mul_mat_q supports FP16 + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); + else + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); } } } else { @@ -7644,6 +8796,20 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } } +void ggml_cuda_axpy(const 
ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && + src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU; + if (src1->ne[1] == 1) + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_axpy_vec, false); + // else if (src1->ne[1] < 80){ + else if (1){ + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_axpy_batch, false); + } + else { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_transpose_gemm, false); + } +} + static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale); } @@ -7656,8 +8822,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); - GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + // GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); + // GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); @@ -7681,6 +8847,17 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg CUDA_CHECK(ggml_cuda_set_device(g_main_device)); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + if (src0->backend == GGML_BACKEND_GPU && src1->backend == GGML_BACKEND_CPU) { + int size = ggml_nbytes(src0); + const struct ggml_tensor_extra_gpu *src0_extra = (ggml_tensor_extra_gpu *)src0->extra; + cudaMemcpyAsync(src1->data, src0_extra->data_device[g_main_device], size, cudaMemcpyDeviceToHost, main_stream); + cudaStreamSynchronize(main_stream); + return ; + } + else if (src0->backend == GGML_BACKEND_CPU){ + GGML_ASSERT(-1); + } + const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; @@ -7727,7 +8904,7 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi); } -void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col); } @@ -7737,7 +8914,7 @@ static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, gg (void) dst; } -void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { +static void ggml_cuda_transform_tensor_impl(void * data, struct ggml_tensor * tensor, bool alloc_only) { const int64_t nrows = ggml_nrows(tensor); const int64_t ne0 = tensor->ne[0]; @@ -7792,14 +8969,16 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { char * buf; CUDA_CHECK(cudaMalloc(&buf, size)); - char * buf_host = (char*)data + offset_split; // set padding to 0 to avoid possible NaN values if (size > original_size) { CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); } - CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice)); + if (!alloc_only) { + char * buf_host = (char*)data + offset_split; + CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice)); + } extra->data_device[id] = buf; @@ -7813,6 +8992,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { 
tensor->extra = extra; } +void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { + return ggml_cuda_transform_tensor_impl(data, tensor, false); +} + +void ggml_cuda_alloc_tensor(struct ggml_tensor * tensor) { + return ggml_cuda_transform_tensor_impl(nullptr, tensor, true); +} + void ggml_cuda_free_data(struct ggml_tensor * tensor) { if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { return; @@ -7842,11 +9029,11 @@ static size_t g_temp_tensor_extra_index = 0; static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { if (g_temp_tensor_extras == nullptr) { - g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE]; + g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES]; } size_t alloc_index = g_temp_tensor_extra_index; - g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE; + g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES; ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; memset(extra, 0, sizeof(*extra)); @@ -7965,6 +9152,8 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) { } void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) { + if (tensor == NULL) + return; ggml_cuda_assign_buffers_impl(tensor, true, false, false); } @@ -8019,11 +9208,19 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ const bool any_on_device = tensor->backend == GGML_BACKEND_GPU || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); - - if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) { - return false; + + if (!any_on_device) { + if (tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_AXPY) + { + return false; + } } + + // if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) { + // return false; + // } + if (tensor->op == GGML_OP_MUL_MAT) { if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { #ifndef NDEBUG @@ -8075,6 +9272,13 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ } func = ggml_cuda_mul_mat; break; + case GGML_OP_AXPY: + // if (!any_on_device && tensor->ne[1] < 80) { + if (!any_on_device) { + return false; + } + func = ggml_cuda_axpy; + break; case GGML_OP_SCALE: func = ggml_cuda_scale; break; @@ -8173,11 +9377,11 @@ struct ggml_backend_buffer_context_cuda { ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { if (temp_tensor_extras == nullptr) { - temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE]; + temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES]; } size_t alloc_index = temp_tensor_extra_index; - temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE; + temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES; ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index]; memset(extra, 0, sizeof(*extra)); diff --git a/ggml-cuda.h b/ggml-cuda.h index 528e66c33..d253bab0e 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -29,7 +29,11 @@ GGML_API void ggml_cuda_host_free(void * ptr); GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split); GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor); 
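ggml_cuda_alloc_tensor reuses ggml_cuda_transform_tensor's allocation and padding logic but skips the host-to-device copy, for tensors whose contents are produced or uploaded only after the device buffer exists. A hedged usage sketch; the explicit upload through ggml_tensor_extra_gpu mirrors how the rest of this file reaches device memory, and device index 0 stands in for the main device:

    // Sketch: reserve device storage for a GPU-resident tensor now, fill it later.
    // Assumes t->backend is GGML_BACKEND_GPU and host_buf holds ggml_nbytes(t) bytes.
    static void alloc_then_upload(struct ggml_tensor * t, const void * host_buf, cudaStream_t stream) {
        ggml_cuda_alloc_tensor(t);  // allocate (and zero-pad) device memory, no copy

        struct ggml_tensor_extra_gpu * extra = (struct ggml_tensor_extra_gpu *) t->extra;
        CUDA_CHECK(cudaMemcpyAsync(extra->data_device[0], host_buf, ggml_nbytes(t),
                                   cudaMemcpyHostToDevice, stream));
    }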
+GGML_API void ggml_cuda_alloc_tensor(struct ggml_tensor * tensor); GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor); +GGML_API void ggml_cuda_cpy_1d(struct ggml_tensor * dst, const struct ggml_tensor * src); +GGML_API bool debug_equal(short *a, short *b); +GGML_API void **ggml_cuda_get_data_pp(struct ggml_tensor * tensor); GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor); GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor); diff --git a/ggml-quants.c b/ggml-quants.c index cf2860b8c..608e8e986 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -2423,6 +2423,78 @@ static inline __m128i get_scale_shuffle(int i) { } #endif +void ggml_axpy_q4_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale) { + const int qk = QK8_0; + const int nb = n / qk; + assert(n % qk == 0); + assert(nb % 2 == 0); + + const block_q4_0 * restrict x = vx; + + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + __m256i alpha_v = _mm256_set1_epi16((short)alpha); + // Main loop + for (int i = 0; i < nb; ++i) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(scale) ); + __m256i bx = bytes_from_nibbles_32(x[i].qs); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. + const __m256i off = _mm256_set1_epi8( 8 ); + bx = _mm256_sub_epi8( bx, off ); + //16个数计算 + __m128i m_a = _mm256_extracti128_si256(bx, 0); + __m256i m_x = _mm256_cvtepi8_epi16(m_a); //16 elements + m_x = _mm256_mullo_epi16(m_x, alpha_v); + __m128i x_0 = _mm256_extracti128_si256(m_x, 0); + __m256i x0_32 = _mm256_cvtepi16_epi32(x_0); + __m256 fx0 = _mm256_cvtepi32_ps(x0_32); + fx0 = _mm256_mul_ps(fx0, d); + + + __m256 by = _mm256_loadu_ps((const __m256 *)(vy+i*128)); + + by = _mm256_add_ps(by, fx0); + _mm256_storeu_ps((__m256*)(vz + i*128), by); + //second phase + + x_0 = _mm256_extracti128_si256(m_x, 1); + x0_32 = _mm256_cvtepi16_epi32(x_0); + fx0 = _mm256_cvtepi32_ps(x0_32); + fx0 = _mm256_mul_ps(fx0, d); + by = _mm256_loadu_ps((const __m256 *)(vy+i*128+32)); + by = _mm256_add_ps(by, fx0); + _mm256_storeu_ps((__m256*)(vz + i*128+32), by); + + //third phase + m_a = _mm256_extracti128_si256(bx, 1); + m_x = _mm256_cvtepi8_epi16(m_a); + m_x = _mm256_mullo_epi16(m_x, alpha_v); + x_0 = _mm256_extracti128_si256(m_x, 0); + x0_32 = _mm256_cvtepi16_epi32(x_0); + fx0 = _mm256_cvtepi32_ps(x0_32); + fx0 = _mm256_mul_ps(fx0, d); + by = _mm256_loadu_ps((const __m256 *)(vy+i*128+64)); + + by = _mm256_add_ps(by, fx0); + _mm256_storeu_ps((__m256*)(vz + i*128+64), by); + + //fourth phase + x_0 = _mm256_extracti128_si256(m_x, 1); + x0_32 = _mm256_cvtepi16_epi32(x_0); + fx0 = _mm256_cvtepi32_ps(x0_32); + fx0 = _mm256_mul_ps(fx0, d); + by = _mm256_loadu_ps((const __m256 *)(vy+i*128+96)); + by = _mm256_add_ps(by, fx0); + _mm256_storeu_ps((__m256*)(vz + i*128+96), by); + + } + +} + + void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int qk = QK8_0; const int nb = n / qk; diff --git a/ggml-quants.h b/ggml-quants.h index 70c12c274..6308f483e 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -210,6 +210,8 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); void dequantize_row_q8_K(const block_q8_K * restrict x, float * 
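The AVX2 routine ggml_axpy_q4_0_q8_0 above unpacks one q4_0 block of 32 nibbles, applies the signed 8-bit activation alpha and the combined scale d * scale, and accumulates into the float destination. A scalar reference of the same computation, assuming the usual q4_0 layout (low nibbles hold the first 16 values of a block, high nibbles the last 16):

    // Scalar reference for ggml_axpy_q4_0_q8_0 (not the SIMD path):
    // z[k] = y[k] + d * scale * (q_k - 8) * alpha for each of the 32 values per block.
    static void axpy_q4_0_q8_0_ref(int n, const block_q4_0 *x, const float *y, float *z,
                                   int8_t alpha, ggml_fp16_t scale) {
        const int nb = n / QK4_0;  // QK4_0 == 32
        for (int i = 0; i < nb; ++i) {
            const float d = GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(scale);
            for (int j = 0; j < QK4_0; ++j) {
                const int q = (j < 16) ? (x[i].qs[j] & 0x0F) : (x[i].qs[j - 16] >> 4);
                z[i*QK4_0 + j] = y[i*QK4_0 + j] + d * (float)((q - 8) * (int)alpha);
            }
        }
    }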
restrict y, int k); +void ggml_axpy_q4_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale); + // Dot product void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); diff --git a/ggml.c b/ggml.c index 3202a517b..5f1da9d16 100644 --- a/ggml.c +++ b/ggml.c @@ -24,6 +24,9 @@ #include #include +// #define _GNU_SOURCE +// #include + #ifdef GGML_USE_METAL #include #endif @@ -143,7 +146,7 @@ void ggml_print_backtrace(void) { } #endif -/*#define GGML_PERF*/ +#define GGML_PERF #define GGML_DEBUG 0 #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 @@ -384,8 +387,8 @@ int64_t ggml_cycles_per_ms(void) { #ifdef GGML_PERF #define ggml_perf_time_ms() ggml_time_ms() #define ggml_perf_time_us() ggml_time_us() -#define ggml_perf_cycles() ggml_cycles() -#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms() +#define ggml_perf_cycles() 0 +#define ggml_perf_cycles_per_ms() 0 #else #define ggml_perf_time_ms() 0 #define ggml_perf_time_us() 0 @@ -1555,6 +1558,15 @@ inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_ *s = sum; } +inline static void ggml_vec_sum_i32_ggf(const int n, int64_t * s, const int32_t * x) { + int64_t sum = 0; + for (int i = 0; i < n; ++i) { + sum += (int64_t)x[i]; + } + *s = sum; +} + + inline static void ggml_vec_max_f32(const int n, float * s, const float * x) { #ifndef GGML_USE_ACCELERATE float max = -INFINITY; @@ -1613,6 +1625,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GROUP_NORM", "MUL_MAT", + "AXPY", "OUT_PROD", "SCALE", @@ -1666,7 +1679,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); +// Since we added AXPY +// static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1748,7 +1762,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); +// Since we added AXPY +// static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -1771,6 +1786,7 @@ static void ggml_setup_op_has_task_pass(void) { p[GGML_OP_ACC ] = true; p[GGML_OP_MUL_MAT ] = true; + p[GGML_OP_AXPY ] = true; p[GGML_OP_OUT_PROD ] = true; p[GGML_OP_SET ] = true; p[GGML_OP_GET_ROWS_BACK ] = true; @@ -1794,22 +1810,6 @@ static void ggml_setup_op_has_task_pass(void) { // ggml context // -struct ggml_context { - size_t mem_size; - void * mem_buffer; - bool mem_buffer_owned; - bool no_alloc; - bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers - - int n_objects; - - struct ggml_object * objects_begin; - struct ggml_object * objects_end; - - struct ggml_scratch scratch; - struct ggml_scratch scratch_save; -}; - struct ggml_context_container { bool used; @@ -2502,6 +2502,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.is_param =*/ false, /*.grad =*/ NULL, /*.src =*/ { NULL }, + /*.is_finish =*/ ATOMIC_VAR_INIT(0), /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, @@ -3024,16 +3025,23 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) { return (float *)(tensor->data); } +int32_t * ggml_get_data_i32(const struct ggml_tensor * tensor) { + 
assert(tensor->type == GGML_TYPE_I32); + return (int32_t *)(tensor->data); +} + enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { GGML_ASSERT(tensor->op == GGML_OP_UNARY); return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0); } const char * ggml_get_name(const struct ggml_tensor * tensor) { + if (tensor == NULL) return NULL; return tensor->name; } struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) { + if (tensor == NULL) return NULL; strncpy(tensor->name, name, sizeof(tensor->name)); tensor->name[sizeof(tensor->name) - 1] = '\0'; return tensor; @@ -3056,6 +3064,7 @@ struct ggml_tensor * ggml_view_tensor( for (int i = 0; i < GGML_MAX_DIMS; i++) { result->nb[i] = src->nb[i]; } + result->op = GGML_OP_VIEW; return result; } @@ -3156,6 +3165,10 @@ static struct ggml_tensor * ggml_add_impl( bool inplace) { // TODO: support less-strict constraint // GGML_ASSERT(ggml_can_repeat(b, a)); + if (a == NULL) + return b; + if (b == NULL) + return a; GGML_ASSERT(ggml_can_repeat_rows(b, a)); bool is_node = false; @@ -3172,6 +3185,7 @@ static struct ggml_tensor * ggml_add_impl( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; + result->src[2] = NULL; return result; } @@ -3220,6 +3234,48 @@ static struct ggml_tensor * ggml_add_cast_impl( return result; } +static struct ggml_tensor * ggml_add_idx_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * idx, + bool inplace) { + // TODO: support less-strict constraint + // GGML_ASSERT(ggml_can_repeat(b, a)); + // GGML_ASSERT(ggml_can_repeat_rows(b, a)); + // printf("in add_idx\n"); + if (a == NULL) + return b; + if (b == NULL) + return a; + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + // TODO: support backward pass for broadcasting + GGML_ASSERT(ggml_are_same_shape(a, b)); + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_ADD; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = idx; + + return result; +} +// add for all gather +struct ggml_tensor * ggml_add_idx( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * idx) { + return ggml_add_idx_impl(ctx, a, b, idx, false); +} + struct ggml_tensor * ggml_add_cast( struct ggml_context * ctx, struct ggml_tensor * a, @@ -4052,6 +4108,94 @@ struct ggml_tensor * ggml_mul_mat( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; + result->src[2] = NULL; + result->src[3] = NULL; + + return result; +} +// ggml_mul_mat_idx for GPU +struct ggml_tensor * ggml_mul_mat_special( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + struct ggml_tensor * d, + struct ggml_tensor * ref) { + if (a == NULL || b == NULL) + return NULL; + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + const int64_t ne[4] = { ref->ne[1], b->ne[1], b->ne[2], b->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); + + result->op = GGML_OP_MUL_MAT; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + result->src[3] = d; + + return result; +} +// ggml_mul_mat_idx for CPU and axpy in GPU +struct ggml_tensor * ggml_mul_mat_idx( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + struct ggml_tensor * d) { + if (a == NULL || b == NULL) + return NULL; + GGML_ASSERT(!ggml_is_transposed(a)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); + + result->op = GGML_OP_MUL_MAT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + result->src[3] = d; + + return result; +} + +struct ggml_tensor * ggml_axpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + struct ggml_tensor * d) { + if (a == NULL || b == NULL) + return NULL; + GGML_ASSERT(!ggml_is_transposed(a)); + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + const int64_t ne[4] = { a->ne[0], b->ne[1], b->ne[2], b->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); + + result->op = GGML_OP_AXPY; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + result->src[3] = d; return result; } @@ -6827,6 +6971,10 @@ static void ggml_compute_forward_add_f32( // row range for this thread const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); + struct ggml_tensor *src2 = dst->src[2]; + float *ft; + if (src2 != NULL) + ft = src2->data; if (nb10 == sizeof(float)) { for (int ir = ir0; ir < ir1; ++ir) { @@ -6846,7 +6994,43 @@ static void ggml_compute_forward_add_f32( #ifdef GGML_USE_ACCELERATE vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00); #else - ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); + // ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); + if (src2 == NULL) + ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); + else + { + // printf("head %d\n", src2->ne[0]); + // int k; + // scanf("%d", &k); + // ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); + int num = src2->ne[0]; + if (num > 1000) { + for (int i = 0; i < ne00; i++) + { + dst_ptr[i] = ft[i] >= 0.0f ? src0_ptr[i] + src1_ptr[i] : 0; + } + } + else { + // ggml_set_zero(dst); + for (int i = 0; i < num; i++) + { + int id = i << 7; + /* dst_ptr[i] = ft[id] > 0.4? 
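Across these builders the graph-level contract is uniform: src[0] and src[1] are the usual operands, src[2] carries the sparsity predictor's output, and src[3] the GPU neuron index, which the CPU and CUDA implementations later read back through dst->src[2] and dst->src[3]. A hedged sketch of how a sparse FFN could be wired with them; the tensor names are placeholders for this note, not identifiers defined by the patch:

    // Illustrative wiring only: 'pred' is the MLP predictor output for this layer,
    // 'gpu_idx' the per-neuron GPU/CPU assignment loaded with the model.
    struct ggml_tensor * cur;
    cur = ggml_mul_mat_idx(ctx, w_up, x, pred, gpu_idx);   // rows gated by pred / gpu_idx
    cur = ggml_relu(ctx, cur);                             // activation (placeholder choice)
    cur = ggml_axpy(ctx, w_down, cur, pred, gpu_idx);      // axpy-style down projection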
src0_ptr[i] + src1_ptr[i] : 0; */ + if (ft[i] < -7.0f){ + for (int j = 0; j < 128; j++) + dst_ptr[id + j] = 0; + // dst_ptr[i] = 0; + continue; + } + else + { + for (int j = 0; j < 128; j++) + dst_ptr[id+j] = src0_ptr[id+j] + src1_ptr[id+j]; + } + // dst_ptr[i] = src0_ptr[i] + src1_ptr[i]; + } + } + } #endif } } else { @@ -7953,6 +8137,39 @@ static void ggml_compute_forward_sum_f16( ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum); } +static void ggml_compute_forward_sum_i32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_is_scalar(dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + assert(src0->nb[0] == sizeof(int32_t)); + + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) + + int64_t sum = 0; + int64_t row_sum = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_i32_ggf(ne00, + &row_sum, + (int32_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); + sum += row_sum; + } + } + } + ((int32_t *) dst->data)[0] = sum; +} + + static void ggml_compute_forward_sum( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -7966,6 +8183,10 @@ static void ggml_compute_forward_sum( { ggml_compute_forward_sum_f16(params, src0, dst); } break; + case GGML_TYPE_I32: + { + ggml_compute_forward_sum_i32(params, src0, dst); + } break; default: { GGML_ASSERT(false); @@ -13641,6 +13862,1026 @@ static void ggml_compute_forward_cross_entropy_loss_back( } } + +static void ggml_compute_forward_mul_mat_sparse_head( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + + if (params->type == GGML_TASK_INIT) { + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } + } + ggml_set_zero(dst); + atomic_store(params->aic, 0); + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + 
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = ne11*ne12*ne13; // src1 rows + + //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); + + // distribute the thread work across the inner or outer loop based on which one is larger + + const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows + + const int64_t ith0 = ith % nth0; + const int64_t ith1 = ith / nth0; + + const int64_t dr0 = (nr0 + 8*nth0 - 1)/(8*nth0); + const int64_t dr1 = (nr1 + nth1 - 1)/nth1; + + int64_t ir010 = dr0*ith0; + // const int64_t ir011 = MIN(ir010 + dr0, nr0); + const int64_t ir011 = ir010 + dr0; + + const int64_t ir110 = dr1*ith1; + const int64_t ir111 = MIN(ir110 + dr1, nr1); + + //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); + + // threads with no work simply yield (not sure if it helps) + // if (ir010 >= ir011 || ir110 >= ir111) { + // sched_yield(); + // return; + // } + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + // attempt to reduce false-sharing (does not seem to make a difference) + float tmp[16]; + float *ffdata = (float *)dst->src[2]->data; + int *gid = (int *)dst->src[3]->data; + while(true) { + ir010 = atomic_fetch_add(params->aic, dr0); + for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { + // for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { + // for (int64_t iir0 = ir010; iir0 < ir011;) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { + const int64_t i13 = (ir1/(ne12*ne11)); + const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11; + const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11); + + // broadcast src0 into src1 + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? 
(i11 + i12*ne11 + i13*ne12*ne11)*row_size + : (i11*nb11 + i12*nb12 + i13*nb13)); + + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); + + //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + // for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + for (int64_t ir0 = ir010; ir0 < ir010+dr0; ++ir0) { + if (ir0 > nr0) + break; + int id = ir0 >> 7; + if (ffdata[id] < -7.0f) + { + dst_col[ir0] = 0; + ir0 += 127; + continue; + } + // vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + } + // memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); + } + // } + } + if (ir010 + dr0 >= nr0) { + break; + } + + } + + +} + +static void ggml_compute_forward_mul_mat_sparse( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + +#if defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(src0, src1, dst)) { + // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension + // ref: https://github.com/ggerganov/ggml/pull/224 + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } +#endif + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { + if (params->ith != 0) { + return; + } + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + for (int64_t i13 = 0; i13 < ne13; i13++) { + for (int64_t i12 = 0; i12 < ne12; i12++) { + // broadcast src0 into src1 across 2nd,3rd dimension + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); + + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + + if (type != GGML_TYPE_F32) { + float * const wdata = params->wdata; + ggml_to_float_t const to_float = type_traits[type].to_float; + + size_t id = 0; + for (int64_t i01 = 0; i01 < ne01; ++i01) { + to_float((const char *) x + i01*nb01, wdata + id, ne00); + id += ne00; + } + + assert(id*sizeof(float) <= params->wsize); + x = wdata; 
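In the "head" variant above, one predictor score gates a block of 128 consecutive output rows (id = ir0 >> 7): if the score falls below the -7.0f threshold the block's first entry is zeroed (the rest were zeroed at init) and all of its dot products are skipped. The gate, restated without the atomic work-stealing bounds:

    // Per-block gate in ggml_compute_forward_mul_mat_sparse_head (restated):
    // ffdata holds one predictor score per 128 output rows.
    for (int64_t ir0 = 0; ir0 < nr0; ++ir0) {
        const int blk = (int)(ir0 >> 7);            // 128 rows share one score
        if (ffdata[blk] < -7.0f) {                  // block predicted inactive
            dst_col[ir0] = 0;
            ir0 += 127;                             // skip the rest of the block
            continue;
        }
        vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
    }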
+ } + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } + + //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); + + return; + } +#endif + + if (params->type == GGML_TASK_INIT) { + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } + } + atomic_store(params->aic, 0); + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = ne11*ne12*ne13; // src1 rows + + + // distribute the thread work across the inner or outer loop based on which one is larger + + const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows + + const int64_t ith0 = ith % nth0; + const int64_t ith1 = ith / nth0; + + const int64_t dr0 = (nr0 + 8*nth0 - 1)/(8*nth0); + const int64_t dr1 = (nr1 + nth1 - 1)/nth1; + // const int64_t dr0 = (nr0 + nth0 - 1)/(nth0); + // const int64_t dr1 = (nr1 + nth1 - 1)/nth1; + + int64_t ir010 = dr0*ith0; + int64_t ir011 = MIN(ir010 + dr0, nr0); + // const int64_t ir011 = ir010 + dr0; + + const int64_t ir110 = dr1*ith1; + const int64_t ir111 = MIN(ir110 + dr1, nr1); + + //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); + + // threads with no work simply yield (not sure if it helps) + // if (ir010 >= ir011 || ir110 >= ir111) { + // sched_yield(); + // return; + // } + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + int total = 0; + + // attempt to reduce false-sharing (does not seem to make a difference) + float tmp[16]; + float *ffdata = (float *)dst->src[2]->data; + int *gid = (int *)dst->src[3]->data; + float *predictor_data = (float *)dst->src[2]->data; + const size_t predictor_row_size = dst->src[2]->ne[0]*ggml_type_size(GGML_TYPE_F32)/ggml_blck_size(GGML_TYPE_F32); + + while(true) { + ir010 = atomic_fetch_add(params->aic, dr0); + ir011 = MIN(ir010 + dr0, nr0); + for (int64_t ir0 = ir010; ir0 < ir011; ++ir0) + { + for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) + { + if (ir0 > nr0) + break; + // for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { + // for (int64_t iir0 = ir010; iir0 < ir011;) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) + { + const int64_t i13 = (ir1 / (ne12 * ne11)); + const int64_t i12 = (ir1 - i13 * ne12 * ne11) / ne11; + const int64_t i11 = (ir1 - i13 * ne12 * ne11 - i12 * ne11); + + // broadcast src0 into src1 + const int64_t i03 = i13 / r3; + const int64_t i02 = i12 / r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char *src0_row = (const char *)src0->data + (0 + i02 * nb02 + i03 * nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + 
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char *src1_col = (const char *)wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size + : (i11 * nb11 + i12 * nb12 + i13 * nb13)); + ffdata = (float *)((char *)predictor_data + (i11 + i12*ne11 + i13*ne12*ne11)*predictor_row_size); + // printf("ith %d row %d ir1 %d %d %d %d %d\n", ith, ir0, ir1, src1_col-(char *)wdata, ffdata-predictor_data, predictor_row_size, dst->src[2]->ne[1]); + + float *dst_col = (float *)((char *)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); + + // if (ffdata[ir0] <= 0.0f) { + if (gid[ir0] == 1 || ffdata[ir0] < -0.0f) { + dst_col[ir0] = 0; + continue; + } + vec_dot(ne00, &dst_col[ir0], src0_row + ir0 * nb01, src1_col); + } + // } + } + } + if (ir010 + dr0 >= nr0) { + break; + } + + } + // printf("total %d\n", total); + + // int predictor_cpu = 0; + // int predictor = 0; + // for (int i = 0; i < 9216 *4 ; i++) { + // if (ffdata[i] > 0.5f && gid[i] == 0) + // predictor_cpu += 1; + // if (ffdata[i] > 0.5f) + // predictor += 1; + // } + // if (ith == 0) + // printf("predictor %d predictor_cpu %d\n", predictor, predictor_cpu); +} + +// vz = alpha * vx + vy +static void ggml_axpy_normal_f16(const int n, const ggml_fp16_t * vx, const ggml_fp16_t * restrict vy, const void* restrict vz, ggml_fp16_t alpha) { + float *res = (float *)vz; + for (int i = 0; i < n; i++) { + res[i] = res[i] + (GGML_FP16_TO_FP32(vx[i])*GGML_FP16_TO_FP32(alpha)); + } +} +static void ggml_axpy_avx_f16(const int n, const ggml_fp16_t * restrict vx, const ggml_fp16_t * restrict vy, void* restrict vz, ggml_fp16_t alpha) { + + float *result = (float *)vz; + float alpha_f32 = GGML_FP16_TO_FP32(alpha); + __m256 scale = _mm256_set1_ps(alpha_f32); // 创建scale向量 + for (int i = 0; i < n; i += 8) { + __m128i vx_low = _mm_loadu_si128((__m128i const*)(&vx[i])); + __m256 vx_f32 = _mm256_cvtph_ps(vx_low); // 转换vx为fp32 + __m256 vy_f32 = _mm256_loadu_ps((float const*)(result+ i)); // 加载vy + __m256 res = _mm256_fmadd_ps(vx_f32, scale, vy_f32); // 执行向量加法和乘法操作 + _mm256_storeu_ps((float*)(&result[i]), res); // 存储结果 + } + +} +atomic_flag g_axpy_dense_lock = ATOMIC_FLAG_INIT; +static void ggml_compute_forward_mul_mat_axpy_dense( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + // GGML_ASSERT(ne0 == ne01); + // GGML_ASSERT(ne1 == ne11); + // GGML_ASSERT(ne2 == ne12); + // GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + 
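ggml_axpy_avx_f16 above is the vector form of ggml_axpy_normal_f16: it converts eight fp16 weights at a time with _mm256_cvtph_ps and folds them into the float accumulator via FMA, scaled by the broadcast alpha (the vy parameter is unused in both versions; they read and write through vz). The loop has no scalar tail, so it relies on n (ne00 here) being a multiple of 8; if that assumption ever changed, a tail along these lines (hypothetical, not part of the patch) would be needed:

    // Hypothetical scalar tail for ggml_axpy_avx_f16 if n were not a multiple of 8.
    for (int i = n - (n % 8); i < n; ++i) {
        result[i] += GGML_FP16_TO_FP32(vx[i]) * alpha_f32;
    }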
const int64_t r3 = ne13/ne03; + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + if (params->type == GGML_TASK_INIT) { + ggml_set_zero(dst); + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } + } + atomic_store(params->aic, 0); + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + ggml_fp16_t* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + struct ggml_tensor *src2 = dst->src[2]; + + // parallelize by src0 rows + const int64_t dr = (src2->ne[0] + 8*nth - 1)/(8*nth); + const int nr = ggml_nrows(src0); + + // const int64_t ir10 = dr*ith; + // const int64_t ir11 = MIN(ir10 + dr, src2->ne[0]); + + // src1 rows + const int64_t nr1 = ne11*ne12*ne13; + // float *idx = src2->data; + // int *gid = (int *)(dst->src[3]->data); + // printf("down %d up %d ne00 %d\n", ir10, ir11, ne00); + + float vec[ne00*4]; + void *vy = vec; + memset(vy, 0, ne00*4); + char* src0_row = (const char *) src0->data; + while(true) { + const int ir0 = atomic_fetch_add(params->aic, dr); + for (int64_t ir1 = ir0; ir1 < ir0+dr; ir1++) { + if (ir1 >= nr) break; + // if (gid[ir1] == 1) + // continue; + // if (idx[ir1] < 0.0f) + // continue; + // ggml_axpy_normal_f16(ne00, src0_row+nb01*ir1, vy, vy, wdata[ir1]); + ggml_axpy_avx_f16(ne00, src0_row+nb01*ir1, vy, vy, wdata[ir1]); + } + if (ir0 + dr >= nr) + break; + } + + // 获取锁 + while (atomic_flag_test_and_set(&g_axpy_dense_lock)) { + // 如果锁已经被占用,则等待 + } + + float *res = (float *)(dst->data); + float *tmp = (float *)vy; + int i; + + + // 计算剩余的元素个数 + int remainder = ne00 % 8; + + // 使用AVX指令进行向量化计算 + for (i = 0; i < ne00 - remainder; i += 8) { + __m256 res_vec = _mm256_loadu_ps(res + i); // 加载res中的8个浮点数 + __m256 tmp_vec = _mm256_loadu_ps(tmp + i); // 加载tmp中的8个浮点数 + __m256 result = _mm256_add_ps(res_vec, tmp_vec); // 执行加法运算 + _mm256_storeu_ps(res + i, result); // 存储结果到res中 + } + + // 处理剩余的元素 + for (i = ne00 - remainder; i < ne00; i++) { + res[i] += tmp[i]; + } + // for (i = 0; i < dst->ne[0]; i++) { + // res[i] += tmp[i]; + // } + + atomic_flag_clear(&g_axpy_dense_lock); + +} + +atomic_flag g_axpy_lock = ATOMIC_FLAG_INIT; +atomic_int g_axpy_control = 0; +static void ggml_compute_forward_mul_mat_axpy( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + // GGML_ASSERT(ne0 == ne01); + // GGML_ASSERT(ne1 == ne11); + // GGML_ASSERT(ne2 == ne12); + // GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == 
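The reduction at the end of these axpy kernels follows one pattern: each thread accumulates its rows into a private buffer (vy), spins on a test-and-set flag, adds the partial sum into the shared output eight floats at a time with AVX, handles the remainder scalar-wise, and releases the flag. A compact C11 sketch of that fold step:

    #include <stdatomic.h>

    // Per-thread partial sums are folded into the shared dst under a spinlock so
    // concurrent threads do not race on the output rows.
    static void fold_partial(float *dst, const float *partial, int n, atomic_flag *lock) {
        while (atomic_flag_test_and_set(lock)) { /* spin until the holder clears it */ }
        for (int i = 0; i < n; ++i) {
            dst[i] += partial[i];
        }
        atomic_flag_clear(lock);
    }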
sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + if (params->type == GGML_TASK_INIT) { + ggml_set_zero(dst); + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } + } + atomic_store(params->aic, 0); + atomic_store(&g_axpy_control, 0); + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + ggml_fp16_t* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + struct ggml_tensor *src2 = dst->src[2]; + + // parallelize by src0 rows + // const int64_t dr = (src2->ne[0] + 8*nth - 1)/(8*nth); + const int64_t dr = (ne01 + nth - 1)/(nth); + const int nr = ggml_nrows(src0); + + const int64_t ir10 = dr*ith; + // const int64_t ir10 = dr*ith; + // const int64_t ir11 = MIN(ir10 + dr, src2->ne[0]); + + // src1 rows + const int64_t nr1 = ne11*ne12*ne13; + float *idx = src2->data; + int idx_row_size = src2->nb[1]; + int *gid = (int *)(dst->src[3]->data); + + float vec[ne00*4]; + void *vy = vec; + char* src0_row = (const char *) src0->data; + ggml_fp16_t * src1_ptr = NULL; + for (int col_idx = 0; col_idx < nr1; col_idx++) { + src1_ptr = (ggml_fp16_t *)((char *)wdata + col_idx * row_size); + idx = (float *)((char *)src2->data + col_idx * idx_row_size); + memset(vy, 0, ne00*4); + // maybe write a special axpy for batch 1 + // while(true) { + // const int ir0 = atomic_fetch_add(params->aic, dr); + for (int64_t ir1 = ir10; ir1 < ir10+dr; ir1++) { + if (ir1 >= nr) { + break; + } + if (src1_ptr[ir1]==0) + continue; + if (gid[ir1] == 1) { + continue; + } + if (idx[ir1] < -0.0f) + continue; + // ggml_axpy_normal_f16(ne00, src0_row+nb01*ir1, vy, vy, wdata[ir1]); + ggml_axpy_avx_f16(ne00, src0_row+nb01*ir1, vy, vy, src1_ptr[ir1]); + } + + // 获取锁 + while (atomic_flag_test_and_set(&g_axpy_lock)) + { + // 如果锁已经被占用,则等待 + } + + float *res = (float *)((char *)(dst->data) + col_idx * nb1); + float *tmp = (float *)vy; + int i; + + + // 计算剩余的元素个数 + int remainder = ne00 % 8; + + // 使用AVX指令进行向量化计算 + for (i = 0; i < ne00 - remainder; i += 8) { + __m256 res_vec = _mm256_loadu_ps(res + i); // 加载res中的8个浮点数 + __m256 tmp_vec = _mm256_loadu_ps(tmp + i); // 加载tmp中的8个浮点数 + __m256 result = _mm256_add_ps(res_vec, tmp_vec); // 执行加法运算 + _mm256_storeu_ps(res + i, result); // 存储结果到res中 + } + + // 处理剩余的元素 + for (i = ne00 - remainder; i < ne00; i++) { + res[i] += tmp[i]; + } + + + atomic_flag_clear(&g_axpy_lock); + } + +} +static void ggml_compute_forward_mul_mat_axpy_q4_0( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + + const bool src1_cont = 
ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + // GGML_ASSERT(ne0 == ne01); + // GGML_ASSERT(ne1 == ne11); + // GGML_ASSERT(ne2 == ne12); + // GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + if (params->type == GGML_TASK_INIT) { + ggml_set_zero(dst); + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } + } + atomic_store(params->aic, 0); + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + ggml_fp16_t* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + struct ggml_tensor *src2 = dst->src[2]; + + // parallelize by src0 rows + // const int64_t dr = (src2->ne[0] + 8*nth - 1)/(8*nth); + const int64_t dr = (src2->ne[0] + nth - 1)/(nth); + const int nr = ggml_nrows(src0); + + const int64_t ir10 = dr*ith; + // const int64_t ir11 = MIN(ir10 + dr, src2->ne[0]); + + // src1 rows + const int64_t nr1 = ne11*ne12*ne13; + float *idx = src2->data; + int idx_row_size = src2->nb[1]; + int *gid = (int *)(dst->src[3]->data); + // printf("down %d up %d ne00 %d\n", ir10, ir11, ne00); + + float vec[ne00*4]; + void *vy = vec; + char* src0_row = (const char *) src0->data; + for (int col_idx = 0; col_idx < nr1; col_idx++) { + // const block_q8_0 * restrict nerual = wdata; + const block_q8_0 *restrict nerual = ((char *)wdata + col_idx * row_size); + idx = (float *)((char *)src2->data + col_idx * idx_row_size); + memset(vy, 0, ne00 * 4); + // while(true) { + // const int ir0 = atomic_fetch_add(params->aic, dr); + for (int64_t ir1 = ir10; ir1 < ir10 + dr; ir1++) + { + if (ir1 >= nr) + break; + if (gid[ir1] == 1) + continue; + if (idx[ir1] < 0.0f) + continue; + int bid = ir1 / QK8_0; + int qsid = ir1 % QK8_0; + int b = (int)nerual[bid].qs[qsid]; + if (b == 0) + continue; + ggml_fp16_t d = nerual[bid].d; + ggml_axpy_q4_0_q8_0(ne00, src0_row + nb01 * ir1, vy, vy, b, d); + } + // if (ir0 + dr >= nr) + // break; + // } + + // 获取锁 + while (atomic_flag_test_and_set(&g_axpy_lock)) + { + // 如果锁已经被占用,则等待 + } + + // float *res = (float *)(dst->data); + float *res = (float *)((char *)(dst->data) + col_idx * nb1); + float *tmp = (float *)vy; + int i; + + // 计算剩余的元素个数 + int remainder = ne00 % 8; + + // 使用AVX指令进行向量化计算 + for (i = 0; i < ne00 - remainder; i += 8) + { + __m256 res_vec = _mm256_loadu_ps(res + i); // 加载res中的8个浮点数 + __m256 tmp_vec = _mm256_loadu_ps(tmp + i); // 加载tmp中的8个浮点数 + __m256 result = _mm256_add_ps(res_vec, tmp_vec); // 执行加法运算 + _mm256_storeu_ps(res + i, result); // 存储结果到res中 + 
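+ // each pass folds 8 of this thread's partial sums (tmp, i.e. the local vy buffer) into the
+ // shared output row res while g_axpy_lock is held; a scalar sketch of the step above:
+ //   for (int k = 0; k < 8; ++k) res[i + k] += tmp[i + k];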
} + + // 处理剩余的元素 + for (i = ne00 - remainder; i < ne00; i++) + { + res[i] += tmp[i]; + } + atomic_flag_clear(&g_axpy_lock); + } + +} +atomic_flag g_axpy_head_lock = ATOMIC_FLAG_INIT; +static void ggml_compute_forward_mul_mat_axpy_head( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + // GGML_ASSERT(ne0 == ne01); + // GGML_ASSERT(ne1 == ne11); + // GGML_ASSERT(ne2 == ne12); + // GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + if (params->type == GGML_TASK_INIT) { + ggml_set_zero(dst); + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } + } + atomic_store(params->aic, 0); + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + const ggml_fp16_t* wdata = (src1->type == vec_dot_type) ? 
src1->data : params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + + struct ggml_tensor *src2 = dst->src[2]; + int chunk = ne00 / 32; + + // parallelize by src0 rows + const int64_t dr = (src2->ne[0] + chunk - 1)/chunk; + const int nr = ggml_nrows(src0); + + // const int64_t ir10 = dr*ith; + // const int64_t ir11 = MIN(ir10 + dr, src2->ne[0]); + + // src1 rows + const int64_t nr1 = ne11*ne12*ne13; + float *idx = src2->data; + int *gid = (int *)(dst->src[3]->data); + // printf("down %d up %d ne00 %d\n", ir10, ir11, ne00); + + float vec[ne00*4]; + void *vy = vec; + memset(vy, 0, ne00*4); + char* src0_row = (const char *) src0->data; + while (true) { + const int ir0 = atomic_fetch_add(params->aic, dr); + // int id = ir0 >> 7; + // if (idx[id] < -15.0f) + // continue; + for (int64_t ir1 = ir0; ir1 < ir0+dr; ir1++) { + if (ir1 >= nr) break; + // ggml_axpy_normal_f16(ne00, src0_row+nb01*ir1, vy, vy, wdata[ir1]); + ggml_axpy_avx_f16(ne00, src0_row+nb01*ir1, vy, vy, wdata[ir1]); + } + if (ir0 + dr >= nr) + break; + } + + // 获取锁 + while (atomic_flag_test_and_set(&g_axpy_head_lock)) { + // 如果锁已经被占用,则等待 + } + float *res = (float *)(dst->data); + float *tmp = (float *)vy; + int i; + + + // 计算剩余的元素个数 + int remainder = ne00 % 8; + + // 使用AVX指令进行向量化计算 + for (i = 0; i < ne00 - remainder; i += 8) { + __m256 res_vec = _mm256_loadu_ps(res + i); // 加载res中的8个浮点数 + __m256 tmp_vec = _mm256_loadu_ps(tmp + i); // 加载tmp中的8个浮点数 + __m256 result = _mm256_add_ps(res_vec, tmp_vec); // 执行加法运算 + _mm256_storeu_ps(res + i, result); // 存储结果到res中 + } + + // 处理剩余的元素 + for (i = ne00 - remainder; i < ne00; i++) { + res[i] += tmp[i]; + } + // for (i = 0; i < ne00; i++) { + // res[i] = tmp[i]; + // } + + atomic_flag_clear(&g_axpy_head_lock); + +} + ///////////////////////////////// static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { @@ -13750,7 +14991,43 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_MUL_MAT: { - ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); + if (tensor->src[2] != NULL) { + int num = tensor->src[2]->ne[0]; + if (num > 1000) { + ggml_compute_forward_mul_mat_sparse(params, tensor->src[0], tensor->src[1], tensor); + break; + } + else { + // if (params->ith == 0) + // printf("name %s num %d\n", ggml_get_name(tensor), num); + ggml_compute_forward_mul_mat_sparse_head(params, tensor->src[0], tensor->src[1], tensor); + // ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); + break; + } + } + else + ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_AXPY: + { + // printf("here? 
%d\n", tensor->src[0]->type); + struct ggml_tensor *src3 = tensor->src[3]; + if (tensor->src[2] == NULL) { + ggml_compute_forward_mul_mat_axpy_dense(params, tensor->src[0], tensor->src[1], tensor); + } + else if (src3 != NULL){ + if (tensor->src[0]->type != GGML_TYPE_Q4_0) { + ggml_compute_forward_mul_mat_axpy(params, tensor->src[0], tensor->src[1], tensor); + } + else { + ggml_compute_forward_mul_mat_axpy_q4_0(params, tensor->src[0], tensor->src[1], tensor); + + } + } + else { + ggml_compute_forward_mul_mat_axpy_head(params, tensor->src[0], tensor->src[1], tensor); + } + // ggml_compute_forward_mul_mat_axpy(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_OUT_PROD: { @@ -15062,6 +16339,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * cgraph->leafs[cgraph->n_leafs] = node; cgraph->n_leafs++; + atomic_store(&(node->is_finish), 1); } else { GGML_ASSERT(cgraph->n_nodes < cgraph->size); @@ -15407,6 +16685,7 @@ struct ggml_compute_state_shared { int64_t perf_node_start_time_us; const int n_threads; + atomic_int aic; // synchronization primitives atomic_int n_active; // num active threads @@ -15430,6 +16709,28 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const node->perf_cycles += cycles_cur; node->perf_time_us += time_us_cur; } +static void ggml_graph_compute_perf_stats_node_gpu(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) { + int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles; + int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us; + + node->perf_runs+=2; + node->perf_cycles += cycles_cur; + node->perf_time_us += time_us_cur; +} +void busy_wait_cycles(int cycles) { + struct timespec ts_start, ts_end; + + clock_gettime(CLOCK_MONOTONIC, &ts_start); + + while (1) { + clock_gettime(CLOCK_MONOTONIC, &ts_end); + long diff_ns = (ts_end.tv_sec - ts_start.tv_sec) * 1000000000 + + (ts_end.tv_nsec - ts_start.tv_nsec); + if (diff_ns >= cycles) { + break; + } + } +} static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { int n_tasks = 0; @@ -15519,6 +16820,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { #endif } break; case GGML_OP_OUT_PROD: + case GGML_OP_AXPY: { n_tasks = n_threads; } break; @@ -15662,6 +16964,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { set_numa_thread_affinity(state->ith, n_threads); + // cpu_set_t mask; + // CPU_ZERO(&mask); + // CPU_SET(state->ith * 2, &mask); + // if (sched_setaffinity(0, sizeof(mask), &mask) == -1) { + // perror("sched_setaffinity"); + // } + int node_n = -1; while (true) { @@ -15669,6 +16978,66 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { state->shared->node_n += 1; return (thread_ret_t) GGML_EXIT_ABORTED; } + if (state->ith == 0) + { + // atomic_fetch_sub(&state->shared->n_active, 1); + node_n = -1; + // return 0; + + while (1) + { + state->shared->perf_node_start_cycles = ggml_perf_cycles(); + state->shared->perf_node_start_time_us = ggml_perf_time_us(); + node_n = node_n + 1; + if (node_n >= cgraph->n_nodes) + return 0; + struct ggml_tensor *node = cgraph->nodes[node_n]; + if (node->backend == GGML_BACKEND_CPU) + continue; + // uint64_t dbug = 0; + while (1) + { + // dbug++; + int status0 = atomic_load(&node->src[0]->is_finish); + int status1 = 1; + int status2 = 1; + if (node->src[1] != NULL) + status1 = atomic_load(&node->src[1]->is_finish); + if (node->src[2] != NULL) + status2 = atomic_load(&node->src[2]->is_finish); + 
// if (dbug > 10000000) { + // printf("stuck %s thread %d\n", ggml_get_name(node), n_threads); + // int k; + // scanf("%d", &k); + // } + if (status0 == 1 && status1 == 1 && status2 == 1) + { + break; + } + // else + // busy_wait_cycles(10); + } + struct ggml_compute_params params = { + /*.type =*/GGML_TASK_COMPUTE, + /*.ith =*/0, + /*.nth =*/1, + /*.wsize =*/NULL, + /*.wdata =*/NULL, + /*.aic =*/0, + }; + + + // printf("GPU %s\n", ggml_get_name(node)); + // cudaDeviceSynchronize(); + ggml_compute_forward(¶ms, node); + // cudaDeviceSynchronize(); + // ggml_graph_compute_perf_stats_node_gpu(node, state->shared); + ggml_graph_compute_perf_stats_node_gpu(node, state->shared); + // if (strcmp(ggml_get_name(node), "before") == 0) + // printf("%ld\n", ggml_time_us()); + atomic_store(&node->is_finish, 1); + } + } if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { // all other threads are finished and spinning // do finalize and init here so we don't have synchronize again @@ -15678,6 +17047,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.nth =*/ 0, /*.wsize =*/ cplan->work_size, /*.wdata =*/ cplan->work_data, + /*.aic =*/ &state->shared->aic, }; if (node_n != -1) { @@ -15688,6 +17058,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); } ggml_graph_compute_perf_stats_node(node, state->shared); + atomic_store(&node->is_finish, 1); } // distribute new work or execute it direct if 1T @@ -15701,6 +17072,21 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { state->shared->perf_node_start_time_us = ggml_perf_time_us(); params.nth = n_tasks; + if (node->backend == GGML_BACKEND_GPU) + continue; + while(1) + { + int status0 = atomic_load(&node->src[0]->is_finish); + int status1 = 1; + int status2 = 1; + if(node->src[1] != NULL) + status1 = atomic_load(&node->src[1]->is_finish); + if(node->src[2] != NULL) + status2 = atomic_load(&node->src[2]->is_finish); + if(status0 == 1 && status1 == 1 && status2 == 1) + break; + // else busy_wait_cycles(10); + } /* INIT */ if (GGML_OP_HAS_INIT[node->op]) { @@ -15713,6 +17099,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // they do something more efficient than spinning (?) 
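+ // after COMPUTE returns below, the worker publishes atomic_store(&node->is_finish, 1)
+ // so the GPU coordinator thread (ith == 0) and any CPU worker spinning on this tensor
+ // as a src can proceed, e.g.: while (atomic_load(&node->is_finish) != 1) { /* wait */ }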
params.type = GGML_TASK_COMPUTE; ggml_compute_forward(¶ms, node); + atomic_store(&node->is_finish, 1); if (GGML_OP_HAS_FINALIZE[node->op]) { params.type = GGML_TASK_FINALIZE; @@ -15757,10 +17144,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_params params = { /*.type =*/ GGML_TASK_COMPUTE, - /*.ith =*/ state->ith, - /*.nth =*/ n_tasks, + /*.ith =*/ state->ith-1, + /*.nth =*/ n_tasks-1, /*.wsize =*/ cplan->work_size, /*.wdata =*/ cplan->work_data, + /*.aic =*/ &state->shared->aic, }; if (state->ith < n_tasks) { @@ -15837,6 +17225,13 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type); } } break; + case GGML_OP_AXPY: + { + const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; + if (node->src[1]->type != vec_dot_type) { + cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type); + } + } break; case GGML_OP_OUT_PROD: { n_tasks = n_threads; @@ -15975,8 +17370,9 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { /*.cgraph_plan =*/ cplan, /*.perf_node_start_cycles =*/ 0, /*.perf_node_start_time_us =*/ 0, - /*.n_threads =*/ n_threads, - /*.n_active =*/ n_threads, + /*.n_threads =*/ n_threads-1, + /*.aic =*/ 0, + /*.n_active =*/ n_threads-1, /*.node_n =*/ -1, /*.abort_callback =*/ NULL, /*.abort_callback_data =*/ NULL, @@ -17963,6 +19359,7 @@ struct gguf_tensor_info { struct gguf_context { struct gguf_header header; + enum ggml_sparse_deriv sparse_deriv; struct gguf_kv * kv; struct gguf_tensor_info * infos; @@ -18013,6 +19410,12 @@ struct gguf_context * gguf_init_empty(void) { return ctx; } +struct gguf_context * gguf_init_empty_sparse(void) { + struct gguf_context * ctx = gguf_init_empty(); + memcpy(ctx->header.magic, GGUF_POWERINFER_MAGIC, sizeof(ctx->header.magic)); + return ctx; +} + struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { FILE * file = fopen(fname, "rb"); if (!file) { @@ -18023,23 +19426,28 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p size_t offset = 0; char magic[4]; + enum ggml_sparse_deriv sparse_deriv; // check the magic before making allocations { gguf_fread_el(file, &magic, sizeof(magic), &offset); - for (uint32_t i = 0; i < sizeof(magic); i++) { - if (magic[i] != GGUF_MAGIC[i]) { - fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic); - fclose(file); - return NULL; - } + if (strncmp(magic, GGUF_MAGIC, sizeof(magic)) == 0) { + sparse_deriv = GGML_DENSE_INFERENCE; + } else if (strncmp(magic, GGUF_POWERINFER_MAGIC, sizeof(magic)) == 0) { + sparse_deriv = GGML_SPARSE_INFERENCE; + fprintf(stderr, "%s: PowerInfer derived model detected. 
Sparse inference will be used.\n", __func__); + } else { + fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic); + fclose(file); + return NULL; } } bool ok = true; struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + ctx->sparse_deriv = sparse_deriv; // read the header { @@ -18492,6 +19900,10 @@ int gguf_get_n_tensors(const struct gguf_context * ctx) { return ctx->header.n_tensors; } +enum ggml_sparse_deriv gguf_get_sparse_deriv(const struct gguf_context * ctx) { + return ctx->sparse_deriv; +} + int gguf_find_tensor(const struct gguf_context * ctx, const char * name) { // return -1 if tensor not found int tensorfound = -1; @@ -19096,4 +20508,18 @@ int ggml_cpu_has_vsx(void) { #endif } +void ggml_set_backend(struct ggml_tensor * tensor, enum ggml_backend_type backend) { + if (backend == GGML_BACKEND_CPU) { + tensor->backend = GGML_BACKEND_CPU; + return; + } + if (backend == GGML_BACKEND_GPU || backend == GGML_BACKEND_GPU_SPLIT) { + #if defined(GGML_USE_CUBLAS) + tensor->backend = backend; + return; + #endif + } + GGML_ASSERT(false && "invalid backend"); +} + //////////////////////////////////////////////////////////////////////////////// diff --git a/ggml.h b/ggml.h index 8e6b64606..b430c05fd 100644 --- a/ggml.h +++ b/ggml.h @@ -207,6 +207,14 @@ #include #include #include +#ifdef __cplusplus + #include + using std::atomic_int; + using std::memory_order; + using std::memory_order_acquire; +#else /* not __cplusplus */ + #include +#endif /* __cplusplus */ #define GGML_FILE_MAGIC 0x67676d6c // "ggml" #define GGML_FILE_VERSION 1 @@ -232,6 +240,7 @@ #define GGML_EXIT_ABORTED 1 #define GGUF_MAGIC "GGUF" +#define GGUF_POWERINFER_MAGIC "PWRI" #define GGUF_VERSION 3 @@ -336,6 +345,11 @@ extern "C" { GGML_BACKEND_GPU_SPLIT = 20, }; + enum ggml_sparse_deriv { + GGML_DENSE_INFERENCE = 0, + GGML_SPARSE_INFERENCE = 1, + }; + // model file types enum ggml_ftype { GGML_FTYPE_UNKNOWN = -1, @@ -382,6 +396,7 @@ extern "C" { GGML_OP_GROUP_NORM, GGML_OP_MUL_MAT, + GGML_OP_AXPY, GGML_OP_OUT_PROD, GGML_OP_SCALE, @@ -504,6 +519,7 @@ extern "C" { struct ggml_tensor * src[GGML_MAX_SRC]; // performance + atomic_int is_finish; int perf_runs; int64_t perf_cycles; int64_t perf_time_us; @@ -520,6 +536,9 @@ extern "C" { char padding[12]; }; + + static const int64_t GGML_NE_WILDCARD = -1; + static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); // the compute plan that needs to be prepared for ggml_graph_compute() @@ -573,6 +592,22 @@ extern "C" { void * data; }; + struct ggml_context { + size_t mem_size; + void * mem_buffer; + bool mem_buffer_owned; + bool no_alloc; + bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers + + int n_objects; + + struct ggml_object * objects_begin; + struct ggml_object * objects_end; + + struct ggml_scratch scratch; + struct ggml_scratch scratch_save; + }; + struct ggml_init_params { // memory pool size_t mem_size; // bytes @@ -600,6 +635,7 @@ extern "C" { // work buffer for all threads size_t wsize; void * wdata; + atomic_int *aic; }; // misc @@ -618,6 +654,8 @@ extern "C" { GGML_API void ggml_print_object (const struct ggml_object * obj); GGML_API void ggml_print_objects(const struct ggml_context * ctx); + GGML_API + GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); @@ -727,6 +765,7 @@ extern "C" { GGML_API void * ggml_get_data (const struct ggml_tensor 
* tensor); GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); + GGML_API int32_t * ggml_get_data_i32(const struct ggml_tensor * tensor); GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); @@ -735,6 +774,9 @@ extern "C" { GGML_ATTRIBUTE_FORMAT(2, 3) GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...); + GGML_API void ggml_set_backend(struct ggml_tensor * tensor, enum ggml_backend_type backend); + + // // operations on tensors with backpropagation // @@ -753,6 +795,12 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor *ggml_add_idx( + struct ggml_context *ctx, + struct ggml_tensor *a, + struct ggml_tensor *b, + struct ggml_tensor *idx); + GGML_API struct ggml_tensor * ggml_add_inplace( struct ggml_context * ctx, struct ggml_tensor * a, @@ -1027,6 +1075,25 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor *ggml_mul_mat_idx( + struct ggml_context *ctx, + struct ggml_tensor *a, + struct ggml_tensor *b, + struct ggml_tensor *idx, + struct ggml_tensor *d); + GGML_API struct ggml_tensor *ggml_mul_mat_special( + struct ggml_context *ctx, + struct ggml_tensor *a, + struct ggml_tensor *b, + struct ggml_tensor *idx, + struct ggml_tensor *d, + struct ggml_tensor *ref); + GGML_API struct ggml_tensor *ggml_axpy( + struct ggml_context *ctx, + struct ggml_tensor *a, + struct ggml_tensor *b, + struct ggml_tensor *c, + struct ggml_tensor *d); // A: m columns, n rows, // B: p columns, n rows, @@ -2013,6 +2080,7 @@ extern "C" { }; GGML_API struct gguf_context * gguf_init_empty(void); + GGML_API struct gguf_context * gguf_init_empty_sparse(void); GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); //GGML_API struct gguf_context * gguf_init_from_buffer(..); @@ -2049,6 +2117,7 @@ extern "C" { GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id); GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i); + GGML_API enum ggml_sparse_deriv gguf_get_sparse_deriv(const struct gguf_context * ctx); GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 7f63361bd..cb31e5278 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -115,6 +115,10 @@ class MODEL_TENSOR(IntEnum): FFN_NORM = auto() ATTN_Q_NORM = auto() ATTN_K_NORM = auto() + FFN_DOWN_T = auto() + FC_1 = auto() + FC_2 = auto() + MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -155,6 +159,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + MODEL_TENSOR.FFN_DOWN_T: "blk.{bid}.ffn_down_t", + MODEL_TENSOR.FC_1: "blk.{bid}.fc1", + MODEL_TENSOR.FC_2: "blk.{bid}.fc2", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -173,6 +180,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_DOWN_T, + MODEL_TENSOR.FC_1, + MODEL_TENSOR.FC_2, ], MODEL_ARCH.GPTNEOX: [ MODEL_TENSOR.TOKEN_EMBD, diff --git a/gguf-py/gguf/tensor_mapping.py 
b/gguf-py/gguf/tensor_mapping.py index 22ad8b8fc..2c8130500 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -194,6 +194,14 @@ class TensorNameMap: MODEL_TENSOR.ROPE_FREQS: ( "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon ), + + MODEL_TENSOR.FC_1: ( + "model.layers.{bid}.fc1", + ), + + MODEL_TENSOR.FC_2: ( + "model.layers.{bid}.fc2", + ), } mapping: dict[str, tuple[MODEL_TENSOR, str]] diff --git a/llama.cpp b/llama.cpp index 01522fdb4..49d890561 100644 --- a/llama.cpp +++ b/llama.cpp @@ -338,6 +338,9 @@ enum llm_tensor { LLM_TENSOR_FFN_NORM, LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_MLP_PRED_FC1, + LLM_TENSOR_MLP_PRED_FC2, + LLM_TENSOR_FFN_DOWN_T, }; static std::map> LLM_TENSOR_NAMES = { @@ -358,6 +361,9 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN_T, "blk.%d.ffn_down_t" }, + { LLM_TENSOR_MLP_PRED_FC1, "blk.%d.fc1" }, + { LLM_TENSOR_MLP_PRED_FC2, "blk.%d.fc2" }, }, }, { @@ -391,6 +397,9 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN_T, "blk.%d.ffn_down_t" }, + { LLM_TENSOR_MLP_PRED_FC1, "blk.%d.fc1" }, + { LLM_TENSOR_MLP_PRED_FC2, "blk.%d.fc2" }, }, }, { @@ -1196,10 +1205,25 @@ struct llama_layer { struct ggml_tensor * ffn_gate; // w1 struct ggml_tensor * ffn_down; // w2 struct ggml_tensor * ffn_up; // w3 + struct ggml_tensor * ffn_down_t; + + // ff sliced on gpu + struct ggml_tensor * ffn_gate_gpu; + struct ggml_tensor * ffn_down_gpu; + struct ggml_tensor * ffn_up_gpu; // ff bias struct ggml_tensor * ffn_down_b; // b2 struct ggml_tensor * ffn_up_b; // b3 + + // mlp predictor weights + struct ggml_tensor * mlp_pre_w1; + struct ggml_tensor * mlp_pre_w2; + + // gpu double-index + // TODO: need to fill this in for all layers + struct ggml_tensor * gpu_idx; + struct ggml_tensor * gpu_bucket; }; struct llama_kv_cell { @@ -1297,6 +1321,9 @@ struct llama_vocab { } }; +struct llama_mlp_model_loader; +struct llama_augmentation_model_loader; + struct llama_model { e_model type = MODEL_UNKNOWN; llm_arch arch = LLM_ARCH_UNKNOWN; @@ -1304,6 +1331,8 @@ struct llama_model { std::string name = "n/a"; + ggml_sparse_deriv sparse_deriv; + llama_hparams hparams = {}; llama_vocab vocab; @@ -1329,6 +1358,10 @@ struct llama_model { // model memory mapped file std::unique_ptr mapping; + // aux model loaders for dynamically loaded/transformed model weights + std::unique_ptr mlp_model_loader; + std::unique_ptr aug_model_loader; + // objects representing data potentially being locked in memory llama_mlock mlock_buf; llama_mlock mlock_mmap; @@ -1702,6 +1735,8 @@ struct llama_model_loader { int n_tensors = 0; int n_created = 0; + ggml_sparse_deriv sparse_deriv; + int64_t n_elements = 0; size_t n_bytes = 0; @@ -1729,7 +1764,7 @@ struct llama_model_loader { n_kv = gguf_get_n_kv(ctx_gguf); n_tensors = gguf_get_n_tensors(ctx_gguf); - + sparse_deriv = gguf_get_sparse_deriv(ctx_gguf); fver = (enum llama_fver ) gguf_get_version(ctx_gguf); for (int i = 0; i < n_tensors; i++) { @@ -1897,7 +1932,8 @@ struct llama_model_loader { bool is_ok = true; for (size_t i = 0; i < ne.size(); ++i) { if (ne[i] != cur->ne[i]) { - is_ok = false; + // allow for -1 in ne for wildcard dimensions + is_ok = ne[i] == -1; break; } } @@ -2537,6 +2573,328 @@ static void 
llm_load_print_meta(llama_model_loader & ml, llama_model & model) { if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } } + +struct llama_mlp_model_loader { + int n_tensors = 0; + size_t n_bytes = 0; // tensor data bytes + + const std::string fname; + llama_file file; + int fver; + + bool use_mmap = false; // only supports mmap yet + std::unique_ptr mapping; + struct ggml_context * ctx_meta = nullptr; + + llama_mlp_model_loader(const std::string & fname, bool use_mmap) : fname(fname), use_mmap(use_mmap), file(fname.c_str(), "rb") { + GGML_ASSERT(use_mmap); + + // verify magic and version + uint32_t magic = file.read_u32(); + // TODO: assert on file magic once we have a stable format + GGML_ASSERT(magic == 0xDEADBEEF && "invalid file magic" || true); + + fver = file.read_u32(); + GGML_ASSERT(fver == 1 && "unsupported file version"); + + n_tensors = file.read_u32(); + + // allocate memadata/data for mlp tensors + // TODO: support allocating buffer for tensor data (when mmap is not used) + size_t per_tensor_meta_size = GGML_PAD(sizeof(struct ggml_tensor), GGML_MEM_ALIGN) + GGML_OBJECT_SIZE; + size_t tensor_meta_size = n_tensors * per_tensor_meta_size; + struct ggml_init_params params = { + /*.mem_size =*/ tensor_meta_size, + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + ctx_meta = ggml_init(params); + + // memory-map the mlp weights file + mapping.reset(new llama_mmap(&file, /* prefetch */ 0, ggml_is_numa())); + } + + int apply_tensors_to_base_model(llama_model * model) { + // TODO: assert fp is at the end of headers + if (n_tensors != model -> layers.size() * 2) { + LLAMA_LOG_ERROR("%s: error: the number of mlp adapters does not match the layer of model\n", __func__); + return 1; + } + LLAMA_LOG_INFO("%s: applying gpu_idx adapter from '%s' - please wait ...\n", __func__, fname.c_str()); + const int64_t t_start_mlp_us = ggml_time_us(); + + for (llama_layer &model_layer : model -> layers) { + ggml_tensor *mlp_fc1_tensor = load_mlp_tensor_from_stream(); + ggml_tensor *mlp_fc2_tensor = load_mlp_tensor_from_stream(); +#ifdef GGML_USE_CUBLAS + // ggml_set_backend(mlp_fc1_tensor, GGML_BACKEND_GPU); + // ggml_cuda_transform_tensor(mlp_fc1_tensor->data, mlp_fc1_tensor); + + // gpu bucket to GPU + ggml_set_backend(mlp_fc2_tensor, GGML_BACKEND_GPU); + ggml_cuda_transform_tensor(mlp_fc2_tensor->data, mlp_fc2_tensor); +#endif // GGML_USE_CUBLAS + if (mlp_fc1_tensor == nullptr || mlp_fc2_tensor == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to load mlp tensors\n", __func__); + return 1; + } + + // load model layer and check dimensions + // ggml_tensor *model_up_t = model_layer.ffn_up; + // GGML_ASSERT(model_up_t != nullptr); + // if (model_up_t->ne[0] != mlp_fc1_tensor->ne[0] || + // model_up_t->ne[1] != mlp_fc2_tensor->ne[1]) { + // LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 + // " and %" PRId64 + // ");" + // " are you sure that this adapter is for this model?\n", + // __func__, model_up_t->ne[0], mlp_fc1_tensor->ne[1]); + // return 1; + // } + + // GGML_ASSERT(model_layer.mlp_pre_w1 == nullptr && model_layer.mlp_pre_w2 == nullptr); + model_layer.gpu_idx = mlp_fc1_tensor; + model_layer.gpu_bucket = mlp_fc2_tensor; + int *data1 = (int *)mlp_fc1_tensor->data; + int *data2 = (int *)mlp_fc2_tensor->data; + + LLAMA_LOG_INFO("."); + } + + const int64_t t_mlp_us = ggml_time_us() - t_start_mlp_us; + LLAMA_LOG_INFO(" done (%.2f ms)\n", t_mlp_us / 1000.0); + + return 0; + 
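+ // usage sketch (hypothetical caller, shown only for orientation):
+ //   llama_mlp_model_loader loader(fname, /*use_mmap=*/ true);
+ //   if (loader.apply_tensors_to_base_model(&model) != 0) { /* report error and bail out */ }
+ // on success every layer carries gpu_idx (from the fc1 stream tensor) and gpu_bucket (fc2)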
} + + // Consumes the stream and returns a new mlp tensor. + // Returns nullptr on error. + // TODO: mmap mlp model file + ggml_tensor *load_mlp_tensor_from_stream() { + uint32_t n_dims = file.read_u32(); + uint32_t name_length = file.read_u32(); + uint32_t ftype = file.read_u32(); + + uint32_t ne[2] = {1, 1}; + for (int i = 0; i < n_dims; ++i) { + ne[i] = file.read_u32(); + } + + std::string tensor_name; + { + char buf[1024]; + file.read_raw(buf, name_length); + tensor_name = std::string(buf, name_length); + } + + // const std::string mlp_suffix = ".mlp"; + // size_t pos = tensor_name.rfind(mlp_suffix); + // if (pos == std::string::npos) { + // LLAMA_LOG_ERROR("%s: error: '%s' is not a mlp tensor\n", __func__, + // tensor_name.c_str()); + // return nullptr; + // } + + // std::string mlp_type = tensor_name.substr(pos + mlp_suffix.length()); + // std::string base_name = tensor_name; + // base_name.erase(pos); + // LLAMA_LOG_INFO("%s: %s => %s (mlp type %s) (", __func__, tensor_name.c_str(), + // base_name.c_str(), mlp_type.c_str()); + // for (int i = 0; i < n_dims; ++i) { + // LLAMA_LOG_INFO("%d ", ne[i]); + // } + // LLAMA_LOG_INFO(")\n"); + // LLAMA_LOG_INFO("tensor name %s\n", tensor_name.c_str()); + + // create ggml tensor + ggml_type wtype; + switch (ftype) { + case 0: + wtype = GGML_TYPE_F32; + break; + case 1: + wtype = GGML_TYPE_F16; + break; + case 18: + wtype = GGML_TYPE_I32; + break; + default: { + LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n", __func__, ftype); + return nullptr; + } + } + ggml_tensor *mlp_tensor; + // if (n_dims != 2) { + // LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); + // return nullptr; + // } + mlp_tensor = ggml_new_tensor_2d(ctx_meta, wtype, ne[0], ne[1]); + // ggml_set_name(mlp_tensor, ""); + + // load tensor data + size_t offset = file.tell(); + size_t tensor_data_size = ggml_nbytes(mlp_tensor); + offset = (offset + 31) & -32; + file.seek(offset, SEEK_SET); + // point to the mmaped mlp model file + mlp_tensor -> data = mapping -> addr + static_cast(offset); + file.seek(tensor_data_size, SEEK_CUR); + return mlp_tensor; + } +}; + +// to dynamically load/transform llama model weights +struct llama_augmentation_model_loader { + struct ggml_context * aux_ctx = nullptr; + + llama_augmentation_model_loader(llama_model *model) { + // TODO: check precondition - MLP loaded + + // check augmentation fields to load + // 1. gpu_idx; + // 2. gpu_bucket; + // 3. 
transformed ffn_down; + const int64_t ggml_aux_tensor_size = 4 * (100 * 100 + 5120*40*4 * ggml_tensor_overhead() + (int64_t)13824*5120*40*4); + printf("%ld\n", ggml_aux_tensor_size); + struct ggml_init_params params = { + /*.mem_size =*/ ggml_aux_tensor_size, + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ false, + }; + aux_ctx = ggml_init(params); + } + + ggml_tensor * create_striped_mat_to_gpu(const struct ggml_tensor *src, struct ggml_tensor * gpu_bucket) { + if (src == NULL) { + return NULL; + } + // allocate and copy selected weights to gpu + #ifdef GGML_USE_CUBLAS + int64_t row_len = src->ne[0]; + int64_t gpu_rows = gpu_bucket->ne[0]; + if (gpu_rows == 0) + return NULL; + + ggml_set_no_alloc(aux_ctx, true); + ggml_tensor * gpu_dst = ggml_new_tensor_2d(aux_ctx, src->type, row_len, gpu_rows); + ggml_set_backend(gpu_dst, GGML_BACKEND_GPU); + ggml_cuda_alloc_tensor(gpu_dst); + + // init two 1d views on host and device + ggml_tensor * host_mat_row = ggml_new_tensor_1d(aux_ctx, src->type, row_len); + static ggml_tensor * device_mat_row = ggml_dup_tensor(aux_ctx, host_mat_row); + ggml_set_backend(device_mat_row, GGML_BACKEND_GPU); + ggml_cuda_alloc_tensor(device_mat_row); + *ggml_cuda_get_data_pp(device_mat_row) = *ggml_cuda_get_data_pp(gpu_dst); + + // read raw data and copy to device depending on gpu_idx + const enum ggml_type type = src->type; + const int ne0 = src->ne[0]; + const size_t row_data_size = ne0*ggml_type_size(type)/ggml_blck_size(type); + for (int i = 0; i < gpu_rows; i++) { + int32_t host_i = ((int32_t *)gpu_bucket->data)[i]; + host_mat_row -> data = src -> data + host_i * row_data_size; + char ** gpu_data_pp = reinterpret_cast(ggml_cuda_get_data_pp(device_mat_row)); + // printf("gpu_data_p: %p\n", *gpu_data_pp); + ggml_cuda_cpy_1d(device_mat_row, host_mat_row); + *gpu_data_pp = *gpu_data_pp + row_data_size; + } + ggml_set_no_alloc(aux_ctx, false); + + return gpu_dst; + #else + printf("As you do not support CUDA. 
Split to GPU is not allowed.\n"); + return NULL; + #endif + } + + void slice_ffn_mat_to_gpu(llama_layer & layer) { + std::vector work_buffer; + ggml_cgraph * tmp_sum_gf = ggml_new_graph(aux_ctx); + ggml_tensor * gpu_idx = layer.gpu_idx; + + // calculate the size of tensor to be copied + ggml_tensor * sum_t = ggml_sum(aux_ctx, gpu_idx); + ggml_build_forward_expand(tmp_sum_gf, sum_t); + ggml_graph_compute_helper(work_buffer, tmp_sum_gf, 2); + int64_t gpu_rows = *ggml_get_data_i32(sum_t); + + + int64_t gpu_index_len = gpu_idx->ne[0]; + // ggml_tensor * gpu_bucket = ggml_new_tensor_1d(aux_ctx, GGML_TYPE_I32, gpu_rows); + // make bucket a reverse index back to unstriped mat + // int32_t * pbucket_data = (int32_t *)gpu_bucket->data; + // for (int i = 0; i < gpu_index_len; i++) { + // if (ggml_get_data_i32(gpu_idx)[i] == 0) { + // continue; + // } + // *pbucket_data = i; + // ++pbucket_data; + // } + // layer.gpu_bucket = gpu_bucket; + ggml_tensor *gpu_bucket = layer.gpu_bucket; + layer.ffn_gate_gpu = create_striped_mat_to_gpu(layer.ffn_gate, gpu_bucket); + layer.ffn_up_gpu = create_striped_mat_to_gpu(layer.ffn_up, gpu_bucket); + layer.ffn_down_gpu = create_striped_mat_to_gpu(layer.ffn_down_t, gpu_bucket); + } + + int apply_augmentation_to_base_model(llama_model * model) { + LLAMA_LOG_INFO("%s: applying augmentation to model - please wait ...\n", __func__); + const int64_t t_start_aug_us = ggml_time_us(); + std::vector work_buffer; + + // transpose ffn_down to use axpy + // ggml_cgraph * tmp_transpose_gf = ggml_new_graph(aux_ctx); + // for (llama_layer &model_layer : model -> layers) { + // // gpu_w2 transpose load + // ggml_tensor * ffn_down_t = ggml_cont(aux_ctx, ggml_transpose(aux_ctx, model_layer.ffn_down)); + // ggml_build_forward_expand(tmp_transpose_gf, ffn_down_t); + // model_layer.ffn_down_t = ffn_down_t; + // LLAMA_LOG_INFO("."); + // } + // ggml_graph_compute_helper(work_buffer, tmp_transpose_gf, 2); + // for (llama_layer &model_layer : model -> layers) { + // model_layer.ffn_down_t->op = GGML_OP_NONE; + // model_layer.ffn_down_t->src[0] = NULL; + // model_layer.ffn_down_t->src[1] = NULL; + // model_layer.ffn_down_t->src[2] = NULL; + // } + + // load gpu_idx and slice mat to gpu + for (llama_layer &model_layer : model -> layers) { + // gpu_idx load + if (model_layer.gpu_idx == NULL && model_layer.gpu_bucket == NULL) { + ggml_tensor * gpu_idx = ggml_new_tensor_1d(aux_ctx, GGML_TYPE_I32, model_layer.mlp_pre_w2 -> ne[1]); + ggml_set_zero(gpu_idx); + model_layer.gpu_idx = gpu_idx; + ggml_tensor * gpu_bucket = ggml_new_tensor_1d(aux_ctx, GGML_TYPE_I32, 0); + model_layer.gpu_bucket = gpu_bucket; + } + slice_ffn_mat_to_gpu(model_layer); + LLAMA_LOG_INFO("."); + } + + LLAMA_LOG_INFO(" done (%.2f ms)\n", (ggml_time_us() - t_start_aug_us) / 1000.0); + return 0; + } +}; + +static bool should_offload_mlp_at_layer(int layer_idx) { + char * n_offload = getenv("N_OFFLOAD_MLP"); + if (n_offload == nullptr) { + return false; + } + return layer_idx < atoi(n_offload); +} + +static bool should_offload_attention_at_layer(int layer_idx) { + char * n_offload = getenv("N_OFFLOAD_ATTN"); + if (n_offload == nullptr) { + return false; + } + return layer_idx < atoi(n_offload); +} + static void llm_load_tensors( llama_model_loader & ml, llama_model & model, @@ -2599,8 +2957,29 @@ static void llm_load_tensors( llama_backend_offload_split = GGML_BACKEND_GPU; #endif + // deprecated + auto ffn_b = [] (ggml_backend_type backend) -> ggml_backend_type { + const bool ffn_offloading = false; + if (ffn_offloading) { + 
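+ // dead branch: ffn_offloading is hard-coded to false above, so every tensor routed
+ // through the deprecated ffn_b() helper stays on GGML_BACKEND_CPU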
return backend; + } + return GGML_BACKEND_CPU; + }; + // prepare memory for the weights size_t vram_weights = 0; + auto create_tensor = [&] (const std::string & name, const std::vector & ne, ggml_backend_type backend) -> ggml_tensor * { + ggml_tensor * created_tensor = ml.create_tensor(ctx, name, ne, backend); + if (created_tensor == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to create tensor '%s'\n", __func__, name); + return nullptr; + } + if (created_tensor->backend == GGML_BACKEND_GPU || created_tensor->backend == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(created_tensor); + } + return created_tensor; + }; + { const int64_t n_embd = hparams.n_embd; const int64_t n_embd_gqa = hparams.n_embd_gqa(); @@ -2612,7 +2991,7 @@ static void llm_load_tensors( case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { @@ -2634,15 +3013,8 @@ static void llm_load_tensors( backend_output = GGML_BACKEND_CPU; } - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } + model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); } const uint32_t n_ff = hparams.n_ff; @@ -2652,30 +3024,31 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + const ggml_backend_type attention_backend = should_offload_attention_at_layer(i) ? llama_backend_offload : GGML_BACKEND_CPU; + const ggml_backend_type mlp_backend = should_offload_mlp_at_layer(i) ? 
llama_backend_offload : GGML_BACKEND_CPU; + const ggml_backend_type ffn_backend = GGML_BACKEND_CPU; auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, attention_backend); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, attention_backend); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, attention_backend); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, attention_backend); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, attention_backend); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, ffn_backend); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, ffn_backend); + if (model.sparse_deriv == GGML_SPARSE_INFERENCE) { + layer.ffn_down_t = create_tensor(tn(LLM_TENSOR_FFN_DOWN_T, "weight", i), {n_embd, n_ff}, ffn_backend); + layer.mlp_pre_w1 = create_tensor(tn(LLM_TENSOR_MLP_PRED_FC1, "weight", i), {n_embd, GGML_NE_WILDCARD}, mlp_backend); + layer.mlp_pre_w2 = create_tensor(tn(LLM_TENSOR_MLP_PRED_FC2, "weight", i), {GGML_NE_WILDCARD, n_ff}, mlp_backend); + } else { + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, ffn_backend); } + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, ffn_backend); + } } break; case LLM_ARCH_BAICHUAN: @@ -2730,11 +3103,11 @@ static void llm_load_tensors( layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, ffn_b(backend)); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 
backend_split); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, ffn_b(backend_split)); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, ffn_b(backend_split)); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, ffn_b(backend_split)); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2748,7 +3121,7 @@ static void llm_load_tensors( { // TODO: CPU-only for now - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { @@ -2770,56 +3143,41 @@ static void llm_load_tensors( backend_output = GGML_BACKEND_CPU; } - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - vram_weights += ggml_nbytes(model.output_norm_b); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } + model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); } const uint32_t n_ff = hparams.n_ff; - const int i_gpu_start = n_layer - n_gpu_layers; - model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + const ggml_backend_type attention_backend = should_offload_attention_at_layer(i) ? llama_backend_offload : GGML_BACKEND_CPU; + const ggml_backend_type mlp_backend = should_offload_mlp_at_layer(i) ? 
llama_backend_offload : GGML_BACKEND_CPU; + const ggml_backend_type ffn_backend = GGML_BACKEND_CPU; auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, attention_backend); + layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, attention_backend); if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) { - layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend); - layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(layer.attn_norm_2); - vram_weights += ggml_nbytes(layer.attn_norm_2_b); - } + layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, attention_backend); + layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, attention_backend); } - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + - ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + - ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, attention_backend); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, attention_backend); + // Either ffn_down or ffn_down_t is used, depending on the model type + if (model.sparse_deriv == GGML_SPARSE_INFERENCE) { + layer.ffn_down_t = create_tensor(tn(LLM_TENSOR_FFN_DOWN_T, "weight", i), {n_embd, n_ff}, ffn_backend); + layer.mlp_pre_w1 = create_tensor(tn(LLM_TENSOR_MLP_PRED_FC1, "weight", i), {n_embd, GGML_NE_WILDCARD}, mlp_backend); + layer.mlp_pre_w2 = create_tensor(tn(LLM_TENSOR_MLP_PRED_FC2, "weight", i), {GGML_NE_WILDCARD, n_ff}, mlp_backend); + } else { + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, ffn_backend); } + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, ffn_backend); } } break; case LLM_ARCH_STARCODER: @@ -2881,14 +3239,14 @@ static void llm_load_tensors( layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, ffn_b(backend)); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, ffn_b(backend)); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, 
"weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, ffn_b(backend_split)); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, ffn_b(backend)); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, ffn_b(backend_split)); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, ffn_b(backend)); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2957,12 +3315,12 @@ static void llm_load_tensors( layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, ffn_b(backend_split)); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, ffn_b(backend)); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, ffn_b(backend_split)); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, ffn_b(backend)); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, ffn_b(backend)); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, ffn_b(backend)); layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend); @@ -3031,14 +3389,14 @@ static void llm_load_tensors( layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, ffn_b(backend)); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, ffn_b(backend)); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, 
tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, ffn_b(backend_split)); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, ffn_b(backend)); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, ffn_b(backend_split)); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, ffn_b(backend)); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -3102,10 +3460,10 @@ static void llm_load_tensors( layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, ffn_b(backend)); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, ffn_b(backend_split)); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, ffn_b(backend_split)); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -3177,12 +3535,12 @@ static void llm_load_tensors( layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, ffn_b(backend)); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, ffn_b(backend)); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, ffn_b(backend_split)); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, ffn_b(backend_split)); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, ffn_b(backend_split)); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -3263,6 +3621,7 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con llama_model_loader ml(fname, params.use_mmap); model.hparams.vocab_only = params.vocab_only; + model.sparse_deriv = ml.sparse_deriv; llm_load_arch (ml, model); llm_load_hparams(ml, model); @@ -3399,7 +3758,7 @@ static void llm_build_k_shift( } } -static void llm_build_kv_store( +static std::pair 
llm_build_kv_store( struct ggml_context * ctx, const llama_hparams & hparams, const llama_kv_cache & kv, @@ -3428,8 +3787,12 @@ static void llm_build_kv_store( cb(v_cache_view, "v_cache_view", il); // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); - ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view)); + ggml_tensor * k_cpy = ggml_cpy(ctx, k_cur, k_cache_view); + ggml_tensor * v_cpy = ggml_cpy(ctx, v_cur_t, v_cache_view); + //ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); + //ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view)); + + return {k_cpy, v_cpy}; } static struct ggml_tensor * llm_build_norm( @@ -3538,7 +3901,8 @@ static struct ggml_tensor * llm_build_ffn( cb(cur, "ffn_gate_par", il); } - cur = ggml_mul_mat(ctx, down, cur); + // cur = ggml_mul_mat(ctx, down, cur); + cur = ggml_axpy(ctx, down, cur, NULL, NULL); if (down_b) { cb(cur, "ffn_down", il); } @@ -3550,6 +3914,130 @@ static struct ggml_tensor * llm_build_ffn( return cur; } +static struct ggml_tensor * llm_build_ffn_sparse( + struct ggml_context * ctx, + struct ggml_tensor * cur, + struct ggml_tensor * up, + struct ggml_tensor * up_b, + struct ggml_tensor * gate, + struct ggml_tensor * gate_b, + struct ggml_tensor * down, + struct ggml_tensor * down_b, + struct ggml_tensor * down_t, + struct ggml_tensor * pre_w1, + struct ggml_tensor * pre_w2, + struct ggml_tensor * pred_inpl, + struct ggml_tensor * gpu_index, + struct ggml_tensor * gpu_bucket, + struct ggml_tensor * gate_gpu, + struct ggml_tensor * down_gpu, + struct ggml_tensor * up_gpu, + llm_ffn_op_type type_op, + llm_ffn_gate_type type_gate, + const llm_build_cb & cb, + int il) { + // TODO: no gpu slicing for now + // GGML_ASSERT(gate_gpu == nullptr && down_gpu == nullptr && up_gpu == nullptr && gpu_bucket == nullptr); + + ggml_tensor *idx = nullptr; + ggml_tensor *idx_g = nullptr; + ggml_tensor *cur_c = nullptr; + ggml_tensor *third = nullptr; + + if (pred_inpl->backend != pre_w1->backend) { + if (pre_w1->backend == GGML_BACKEND_CPU) { + pred_inpl = ggml_dup(ctx, pred_inpl); + } else { + // NOOP for now + } + } + + // prepare sparse idx + idx = ggml_mul_mat(ctx, pre_w1, pred_inpl); + // no offlaoad + cb(idx, "mlp_pre_w1", il); + idx = ggml_relu(ctx, idx); + cb(idx, "relu_pre", il); + idx = ggml_mul_mat(ctx, pre_w2, idx); + cb(idx, "mlp_pre_w2", il); + + + // FFN up + third = cur; + struct ggml_tensor * tmp = ggml_mul_mat_idx(ctx, up, cur, idx, gpu_index); + cb(tmp, "ffn_up_sparse", il); + struct ggml_tensor * tmp2 = ggml_mul_mat_special(ctx, up_gpu, cur, idx, gpu_bucket, up); + if (tmp2 != NULL) { + ggml_cuda_assign_buffers_no_alloc(tmp2); + cb(tmp2, "ffn_up_sparse_gpu", il); + } + tmp = ggml_add(ctx, tmp, tmp2); + + + if (up_b) { + tmp = ggml_add(ctx, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (gate) { + // TODO: only support par for now + GGML_ASSERT(type_gate == LLM_FFN_PAR); + third = cur; + cur = ggml_mul_mat_idx(ctx, gate, cur, idx, gpu_index); + cb(cur, "ffn_gate", il); + tmp2 = ggml_mul_mat_special(ctx, gate_gpu, third, idx, gpu_bucket, gate); + if (tmp2 != NULL) { + ggml_cuda_assign_buffers_no_alloc(tmp2); + cb(tmp2, "ffn_up_sparse_gpu", il); + } + cur = ggml_add(ctx, cur, tmp2); + + if (gate_b) { + cur = ggml_add(ctx, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + } else { + cur = tmp; + } + + switch (type_op) { + case LLM_FFN_RELU: + { + cur = ggml_relu(ctx, cur); + cb(cur, "ffn_relu", il); + } break; + default: 
+ // only support relu for now + GGML_ASSERT(type_op == LLM_FFN_RELU); + } + + if (type_gate == LLM_FFN_PAR) { + cur = ggml_mul(ctx, cur, tmp); + cb(cur, "ffn_gate_par", il); + } + + third = cur; + cur = ggml_axpy(ctx, down_gpu, cur, idx, gpu_bucket); + if (cur != NULL) { + ggml_cuda_assign_buffers_no_alloc(cur); + cb(cur, "ffn_down", il); + } + tmp = ggml_axpy(ctx, down_t, third, idx, gpu_index); + cb(tmp, "ffn_down_gpu", il); + cur = ggml_add(ctx, cur, tmp); + + if (down_b) { + cur = ggml_add(ctx, cur, down_b); + cb(cur, "ffn_down", il); + } + + return cur; +} + + +static ggml_tensor * k_cpy = nullptr; +static ggml_tensor * v_cpy = nullptr; + // if max_alibi_bias > 0 then apply ALiBi static struct ggml_tensor * llm_build_kqv( struct ggml_context * ctx, @@ -3582,6 +4070,9 @@ static struct ggml_tensor * llm_build_kqv( ggml_element_size(kv.k)*n_embd_head, ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il); cb(k, "k", il); + if (k_cpy != nullptr) { + k->src[1] = k_cpy; + } struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); @@ -3611,6 +4102,9 @@ static struct ggml_tensor * llm_build_kqv( ggml_element_size(kv.v)*n_ctx*n_embd_head, ggml_element_size(kv.v)*n_ctx*n_embd_gqa*il); cb(v, "v", il); + if (v_cpy != nullptr) { + v->src[1] = v_cpy; + } struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); cb(kqv, "kqv", il); @@ -3633,6 +4127,10 @@ static struct ggml_tensor * llm_build_kqv( return cur; } +const llm_build_cb no_offload_cb = [](struct ggml_tensor * cur, const char * name, int nl) { + ggml_set_name(cur, name); +}; + struct llm_build_context { const llama_model & model; const llama_hparams & hparams; @@ -3664,7 +4162,7 @@ struct llm_build_context { const bool do_rope_shift; - const llm_build_cb & cb; + llm_build_cb cb; llama_buffer & buf_compute; @@ -3714,6 +4212,7 @@ struct llm_build_context { /*.mem_buffer =*/ buf_compute.data, /*.no_alloc =*/ true, }; + ctx0 = ggml_init(params); } @@ -3788,7 +4287,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + std::tie(k_cpy, v_cpy) = llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kqv(ctx0, hparams, kv_self, model.layers[il].wo, NULL, @@ -3804,14 +4303,156 @@ struct llm_build_context { cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + no_offload_cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + if (1) { + cur = llm_build_ffn_sparse(ctx0, cur, model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); + model.layers[il].ffn_down_t, + model.layers[il].mlp_pre_w1, + model.layers[il].mlp_pre_w2, + ffn_inp, // as for now, llama's pred use the same input as the ffn + model.layers[il].gpu_idx, + model.layers[il].gpu_bucket, model.layers[il].ffn_gate_gpu, model.layers[il].ffn_down_gpu, model.layers[il].ffn_up_gpu, // TODO: disable gpu offloading as for now + LLM_FFN_RELU, LLM_FFN_PAR, no_offload_cb, il); + } else { + // fallback to dense + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down_t, NULL, + LLM_FFN_RELU, LLM_FFN_PAR, cb, il); + } + // cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = 
llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_llama_dense() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + std::tie(k_cpy, v_cpy) = llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + no_offload_cb(cur, "ffn_norm", il); + + if (0) { + cur = llm_build_ffn_sparse(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + model.layers[il].ffn_down_t, + model.layers[il].mlp_pre_w1, + model.layers[il].mlp_pre_w2, + ffn_inp, // as for now, llama's pred use the same input as the ffn + model.layers[il].gpu_idx, + model.layers[il].gpu_bucket, model.layers[il].ffn_gate_gpu, model.layers[il].ffn_down_gpu, model.layers[il].ffn_up_gpu, // TODO: disable gpu offloading as for now + LLM_FFN_RELU, LLM_FFN_PAR, no_offload_cb, il); + } else { + // fallback to dense + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, 
NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down_t, NULL, + LLM_FFN_RELU, LLM_FFN_PAR, cb, il); + } + // cb(cur, "ffn_out", il); } cur = ggml_add(ctx0, cur, ffn_inp); @@ -3982,6 +4623,7 @@ struct llm_build_context { if (do_rope_shift) { llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); } + // cb = no_offload_cb; for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -3990,7 +4632,7 @@ struct llm_build_context { model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il); - cb(attn_norm, "attn_norm", il); + // cb(attn_norm, "attn_norm", il); // self-attention { @@ -4032,7 +4674,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + std::tie(k_cpy, v_cpy) = llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kqv(ctx0, hparams, kv_self, model.layers[il].wo, NULL, @@ -4043,12 +4685,26 @@ struct llm_build_context { struct ggml_tensor * ffn_inp = cur; // feed forward - { + attn_norm->src[3] = ffn_inp; + // cur->ne[1] is the input length. we use dense ffn at prompting phase for bettern perf + if (llama_use_sparse_inference(&model)) { + cur = llm_build_ffn_sparse(ctx0, attn_norm, + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + model.layers[il].ffn_down_t, + model.layers[il].mlp_pre_w1, + model.layers[il].mlp_pre_w2, + inpL, + model.layers[il].gpu_idx, + model.layers[il].gpu_bucket, model.layers[il].ffn_gate_gpu, model.layers[il].ffn_down_gpu, model.layers[il].ffn_up_gpu, // TODO: disable gpu offloading as for now + LLM_FFN_RELU, LLM_FFN_SEQ, no_offload_cb, il); + } else { cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result model.layers[il].ffn_up, NULL, NULL, NULL, - model.layers[il].ffn_down, NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + model.layers[il].ffn_down_t, NULL, + LLM_FFN_RELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -4079,6 +4735,10 @@ struct llm_build_context { return gf; } + + + + struct ggml_cgraph * build_starcoder() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -5282,6 +5942,10 @@ static struct ggml_cgraph * llama_build_graph( case LLM_ARCH_LLAMA: { result = llm.build_llama(); + // if (llm.n_tokens < 80) + // result = llm.build_llama(); + // else + // result = llm.build_llama_dense(); } break; case LLM_ARCH_BAICHUAN: { @@ -5495,7 +6159,7 @@ static int llama_decode_internal( const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) { - n_threads = 1; + n_threads = 8; } #if GGML_USE_MPI @@ -7685,6 +8349,9 @@ static ggml_type get_k_quant_type( else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; } + else if (name.find("fc1.weight") != std::string::npos || name.find("fc2.weight") != std::string::npos) { + new_type = GGML_TYPE_Q5_0; + } // This can be used to reduce the size of the Q5_K_S model. 
// The associated PPL increase is fully in line with the size reduction //else { @@ -7775,7 +8442,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } const size_t align = GGUF_DEFAULT_ALIGNMENT; - struct gguf_context * ctx_out = gguf_init_empty(); + struct gguf_context * ctx_out = ml.sparse_deriv == GGML_SPARSE_INFERENCE ? gguf_init_empty_sparse() : gguf_init_empty(); // copy the KV pairs from the input file gguf_set_kv (ctx_out, ml.ctx_gguf); @@ -8628,6 +9295,10 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) { return model->vocab.type; } +bool llama_use_sparse_inference(const struct llama_model * model) { + return model->sparse_deriv == GGML_SPARSE_INFERENCE; +} + int llama_n_vocab(const struct llama_model * model) { return model->vocab.id_to_token.size(); } @@ -8702,6 +9373,27 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha } } +int llama_model_apply_mlp_from_file(struct llama_model * model, const char * path_mlp, bool use_mmap) { + llama_mlp_model_loader * mlp_ml = new llama_mlp_model_loader(path_mlp, use_mmap); + if (mlp_ml -> apply_tensors_to_base_model(model) > 0) { + LLAMA_LOG_ERROR("%s: failed to apply mlp adapter\n", __func__); + return 1; + } + model -> mlp_model_loader = std::unique_ptr(mlp_ml); + return 0; +} + +// Apply postprocessing steps for PowerInfer derived models +int llama_model_apply_augmentation(struct llama_model * model) { + llama_augmentation_model_loader * aug_ml = new llama_augmentation_model_loader(model); + if (aug_ml -> apply_augmentation_to_base_model(model) > 0) { + LLAMA_LOG_ERROR("%s: failed to apply augmentation adapter\n", __func__); + return 1; + } + model -> aug_model_loader = std::unique_ptr(aug_ml); + return 0; +} + int llama_get_kv_cache_token_count(const struct llama_context * ctx) { return ctx->kv_self.head; } diff --git a/llama.h b/llama.h index e8dc04bb5..bd4a416f2 100644 --- a/llama.h +++ b/llama.h @@ -293,6 +293,7 @@ extern "C" { LLAMA_API int llama_n_ctx (const struct llama_context * ctx); LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); + LLAMA_API bool llama_use_sparse_inference(const struct llama_model * model); LLAMA_API int llama_n_vocab (const struct llama_model * model); LLAMA_API int llama_n_ctx_train(const struct llama_model * model); @@ -340,6 +341,13 @@ extern "C" { const char * path_base_model, int n_threads); + LLAMA_API int llama_model_apply_mlp_from_file( + struct llama_model * model, + const char * path_mlp, + bool use_mmap); + + LLAMA_API int llama_model_apply_augmentation(struct llama_model * model); + // // KV cache // diff --git a/scripts/export-gpu-split.py b/scripts/export-gpu-split.py new file mode 100644 index 000000000..c028f6a3e --- /dev/null +++ b/scripts/export-gpu-split.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 + +import argparse +import torch +import torch.nn as tnn +from pathlib import Path +import os +import re +import struct +from typing import Any, BinaryIO +import numpy as np +import pickle + +class ReluMLP(tnn.Module): + def __init__(self, input_dim, hidden_dim, output_dim): + super(ReluMLP, self).__init__() + self.fc1 = tnn.Linear(input_dim, hidden_dim, bias=False) + self.relu = tnn.ReLU() + self.fc2 = tnn.Linear(hidden_dim, output_dim, bias=False) + + def forward(self, x): + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + return x + + +def _load_mlp_model(model_file: Path): + model = torch.load(model_file) + # hidden_size, input_size = 
model.get("fc1.weight").shape + # output_size, _ = model.get("fc2.weight").shape + # mlp = ReluMLP(input_size, hidden_size, output_size) + # mlp.load_state_dict(model) + return model + + +def load_mlp_predictors(models_base: Path): + # TODO: might need a specification file to indicate which models to load. + # But for now, let's assume it is a plain directory of models_{0, ... , n_layers - 1}.pt + *_, files = next(os.walk(models_base)) + return [_load_mlp_model(models_base / f"activation_{i}.pt") for i in range(len(files))] + + +def write_file_header(fout: BinaryIO, n_tensors: int) -> None: + fout.write(b"gglp"[::-1]) # magic (GGml mLP) + fout.write(struct.pack("i", 1)) # file version + # TODO: If we found we need more common parameters, we can add them here. + fout.write(struct.pack("i", n_tensors)) + + +def write_tensor_header( + fout: BinaryIO, key: str, shape: tuple[int, ...], dtype: np.dtype +) -> None: + _NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1, "int32": 18} + bkey = key.encode("utf-8") + fout.write( + struct.pack("iii", len(shape), len(bkey), _NUMPY_TYPE_TO_FTYPE[dtype.name]) + ) + fout.write(struct.pack("i" * len(shape), *shape)) + fout.write(bkey) + # Aligns to 32 bytes + fout.seek((fout.tell() + 31) & -32) + + +# TODO: need to add more details in key name to indicate the network, layer number, etc. +def _translate_mlp_key(key: str) -> str: + match = re.match(r"^(fc\d+).weight$", key) + if not match or len(match.groups()) != 1: + raise ValueError(f"Unexpected key: {key}") + return f"{match.group(1)}.weight.mlp" + + +def append_mlp_model(fout: BinaryIO, model: ReluMLP) -> None: + model_dict = model.state_dict() + for k, v in model_dict.items(): + key = _translate_mlp_key(k) + # torch.nn.Linear stores the weight matrix as (output_dim, input_dim), so does GGML. 
+ weights = v.half().detach().numpy() + # GGML stores the weight matrix as (input_dim, output_dim) + dims = weights.shape[::-1] + print( + f"{k} => {key} {weights.shape} {weights.dtype} {weights.nbytes/1024/1024} MiB" + ) + # TODO: add option to write in float32 + write_tensor_header(fout, key, dims, np.dtype("float16")) + weights.tofile(fout) + +def append_gpu_idx(fout: BinaryIO, activation, select_count) -> None: + values, indices = torch.topk(activation, k=int(select_count)) + gpu_idx = torch.zeros_like(activation) + gpu_idx[indices] = 1 + gpu_idx = gpu_idx.numpy().astype(np.int32) + weights = gpu_idx + dims = gpu_idx.shape[::-1] + key = "gpu_idx" + print( + f"{key} => {key} {weights.shape} {weights.dtype} {weights.nbytes/1024/1024} MiB" + ) + write_tensor_header(fout, key, dims, np.dtype("int32")) + weights.tofile(fout) + + indices = indices.numpy().astype(np.int32) + weights = indices + dims = weights.shape[::-1] + key = "gpu_bucket" + print( + f"{key} => {key} {weights.shape} {weights.dtype} {weights.nbytes/1024/1024} MiB" + ) + write_tensor_header(fout, key, dims, np.dtype("int32")) + weights = np.sort(weights) + weights.tofile(fout) + +def main(predictors_path: str, output_path: str, solver_path: str): + predictors = load_mlp_predictors(Path(predictors_path)) # predictor => activation acount + n_tensors = len(predictors) * 2 # gpu_idx and gpu_bucket + print(f"found {len(predictors)} MLP adapters with {n_tensors} tensors") + with open(solver_path, "rb") as f: + loaded_lst = pickle.load(f) + # print(f"check solver {loaded_lst}") + with open(output_path, "wb") as fout: + fout.truncate() + write_file_header(fout, n_tensors=n_tensors) + for i, activation in enumerate(predictors): + print(f"appending gpu idx layer-{i}") + append_gpu_idx(fout, activation, loaded_lst[i]) + # append_gpu_idx(fout, activation, (32768*0.0)) + + print(f"converted MLP adapters from {predictors_path} to {output_path}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("predictors_path", help="path to the MLP predictors") + parser.add_argument( + "output_path", + help="path to the output GGML adapter", + default="./ggml-mlp-adapters.bin", + ) + parser.add_argument("solver", help="path to the solver") + + args = parser.parse_args() + main(args.predictors_path, args.output_path, args.solver)
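
Note on the adapter file emitted by scripts/export-gpu-split.py: its layout is fully determined by write_file_header and write_tensor_header above (reversed "gglp" magic, a version and tensor count, then per tensor an "iii" header of n_dims / key length / ftype, the reversed shape, the key bytes, 32-byte alignment, and the raw data). The reader below is a minimal sketch written against that layout; it is not part of the patch, its function name is illustrative, and the ftype-to-dtype map (0 = float32, 1 = float16, 18 = int32) is copied from the writer.

#!/usr/bin/env python3
# Sketch of a reader for the adapter file produced by export-gpu-split.py.
# Not part of the patch; layout inferred from write_file_header / write_tensor_header.

import struct
import sys

import numpy as np

_FTYPE_TO_DTYPE = {0: np.dtype("float32"), 1: np.dtype("float16"), 18: np.dtype("int32")}


def read_gpu_split(path):
    # keys repeat per layer (see the TODO about key names in the exporter),
    # so collect (key, array) pairs instead of a dict
    tensors = []
    with open(path, "rb") as fin:
        magic = fin.read(4)
        assert magic == b"gglp"[::-1], f"bad magic: {magic!r}"
        (version,) = struct.unpack("i", fin.read(4))
        (n_tensors,) = struct.unpack("i", fin.read(4))
        print(f"version={version} n_tensors={n_tensors}")
        for _ in range(n_tensors):
            n_dims, key_len, ftype = struct.unpack("iii", fin.read(12))
            shape = struct.unpack("i" * n_dims, fin.read(4 * n_dims))
            key = fin.read(key_len).decode("utf-8")
            fin.seek((fin.tell() + 31) & -32)  # tensor data is 32-byte aligned
            dtype = _FTYPE_TO_DTYPE[ftype]
            data = np.fromfile(fin, dtype=dtype, count=int(np.prod(shape)))
            # dims were written reversed (GGML order); restore the original shape
            tensors.append((key, data.reshape(shape[::-1])))
    return tensors


if __name__ == "__main__":
    for name, t in read_gpu_split(sys.argv[1]):
        print(f"{name}: shape={t.shape} dtype={t.dtype}")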
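
The data flow of llm_build_ffn_sparse, as it is called from build_llama, can be summarised outside the graph-building code. The NumPy sketch below is illustrative only: it assumes a neuron counts as active when its predictor score is positive (the actual criterion lives inside ggml_mul_mat_idx / ggml_axpy and is not visible in this hunk), it ignores the gpu_idx / gpu_bucket CPU-GPU split so the whole FFN runs in one place, and it feeds the predictor the same input as the FFN for brevity, whereas the patch gives the predictor ffn_inp (the pre-norm residual) and the normed activations to up/gate. All names and shapes are assumptions made for the example.

import numpy as np


def relu(x):
    return np.maximum(x, 0.0)


def sparse_ffn(x, w_up, w_gate, w_down, pre_w1, pre_w2, threshold=0.0):
    """Single-token, single-device sketch of the predictor-gated FFN.
    x: (n_embd,), w_up/w_gate: (n_ff, n_embd), w_down: (n_embd, n_ff),
    pre_w1: (n_pred, n_embd), pre_w2: (n_ff, n_pred)."""
    # 1. the small ReluMLP predictor (mlp_pre_w1 / mlp_pre_w2) scores every FFN neuron
    score = pre_w2 @ relu(pre_w1 @ x)          # (n_ff,)
    active = score > threshold                 # assumed activation criterion

    # 2. only the active rows of up and gate are evaluated (ggml_mul_mat_idx)
    up = np.zeros(w_up.shape[0])
    up[active] = w_up[active] @ x
    gate = np.zeros(w_gate.shape[0])
    gate[active] = w_gate[active] @ x
    h = relu(gate) * up                        # LLM_FFN_RELU + LLM_FFN_PAR

    # 3. down projection as a sum over active neurons only (the ggml_axpy path)
    return w_down[:, active] @ h[active]       # (n_embd,)


# toy usage with random weights
rng = np.random.default_rng(0)
n_embd, n_ff, n_pred = 8, 32, 16
out = sparse_ffn(rng.standard_normal(n_embd),
                 rng.standard_normal((n_ff, n_embd)),
                 rng.standard_normal((n_ff, n_embd)),
                 rng.standard_normal((n_embd, n_ff)),
                 rng.standard_normal((n_pred, n_embd)),
                 rng.standard_normal((n_ff, n_pred)))
print(out.shape)  # (8,)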