Merge branch 'master' into update_flake_lock_action

Author: Philip Taron, 2024-06-24 08:26:19 -07:00 (committed by GitHub)
commit c7e3cd08ce
32 changed files with 23685 additions and 23125 deletions


@@ -33,15 +33,13 @@ jobs:
          - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-         # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
-         #                     have disabled them for now until the reason why
-         #                     is understood.
          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-         - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+         # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
+         #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:


@@ -30,7 +30,7 @@ jobs:
    strategy:
      matrix:
-       sanitizer: [ADDRESS, THREAD, UNDEFINED]
+       sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
        build_type: [RelWithDebInfo]
        include:
          - build_type: Release


@@ -144,9 +144,6 @@ option(LLAMA_BUILD_SERVER "llama: build server example"
option(LLAMA_LASX "llama: enable lasx" ON)
option(LLAMA_LSX "llama: enable lsx" ON)

-# add perf arguments
-option(LLAMA_PERF "llama: enable perf" OFF)

# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)

@@ -870,10 +867,6 @@ if (LLAMA_CPU_HBM)
    target_link_libraries(ggml PUBLIC memkind)
endif()

-if (LLAMA_PERF)
-    add_compile_definitions(GGML_PERF)
-endif()

function(get_flags CCID CCVER)
    set(C_FLAGS "")
    set(CXX_FLAGS "")


@@ -344,9 +344,6 @@ ifdef LLAMA_GPROF
	MK_CFLAGS   += -pg
	MK_CXXFLAGS += -pg
endif
-ifdef LLAMA_PERF
-	MK_CPPFLAGS += -DGGML_PERF
-endif

# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures

File diff suppressed because it is too large.


@@ -152,7 +152,6 @@ struct gpt_params {
    bool prompt_cache_all = false; // save user input and generations to prompt cache
    bool prompt_cache_ro  = false; // open the prompt cache read-only and do not update it
-   bool embedding        = false; // get only sentence embedding
    bool escape           = true;  // escape "\n", "\r", "\t", "\'", "\"", and "\\"
    bool multiline_input  = false; // reverse the usage of `\`
    bool simple_io        = false; // improves compatibility with subprocesses and limited consoles

@@ -179,6 +178,12 @@ struct gpt_params {
    std::string mmproj = "";        // path to multimodal projector
    std::vector<std::string> image; // path to image file(s)

+   // embedding
+   bool embedding = false;       // get only sentence embedding
+   int32_t embd_normalize = 2;   // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+   std::string embd_out = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
+   std::string embd_sep = "\n";  // separator of embeddings

    // server params
    int32_t port         = 8080; // server listens on this network port
    int32_t timeout_read = 600;  // http read timeout in seconds

@@ -377,7 +382,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
// Embedding utils
//
-void llama_embd_normalize(const float * inp, float * out, int n);
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);


@@ -65,7 +65,8 @@ class Model:
    # subclasses should define this!
    model_arch: gguf.MODEL_ARCH

-   def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
+   def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool,
+                model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
        if type(self) is Model:
            raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
        self.dir_model = dir_model

@@ -80,7 +81,7 @@ class Model:
        if not self.is_safetensors:
            self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
        self.hparams = Model.load_hparams(self.dir_model)
-       self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+       self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
        self.tensor_names = None
        if self.ftype == gguf.LlamaFileType.GUESSED:

@@ -96,7 +97,8 @@ class Model:
            ftype_lw: str = ftype_up.lower()
            # allow templating the file name with the output ftype, useful with the "auto" ftype
            self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
-       self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+       self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
+                                          split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)

    @classmethod
    def __init_subclass__(cls):

@@ -332,6 +334,8 @@ class Model:
        self.gguf_writer.close()

    def write_vocab(self):
+       if len(self.gguf_writer.tensors) != 1:
+           raise ValueError('Splitting the vocabulary is not supported')
        self.gguf_writer.write_header_to_file(self.fname_out)
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.close()
@@ -1404,6 +1408,48 @@ class LlamaModel(Model):
                raise ValueError(f"Unprocessed experts: {experts}")


@Model.register("BitnetForCausalLM")
class BitnetModel(Model):
    model_arch = gguf.MODEL_ARCH.BITNET

    def set_vocab(self):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
        self.gguf_writer.add_rope_scaling_factor(1.0)

    def weight_quant(self, weight):
        dtype = weight.dtype
        weight = weight.float()
        s = 1 / weight.abs().mean().clamp(min=1e-5)
        weight = (weight * s).round().clamp(-1, 1) / s
        scale = weight.abs().max().unsqueeze(0)
        weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
        weight = torch.sign(weight).type(dtype)
        return weight.type(dtype), scale.type(torch.float32)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        new_name = self.map_tensor_name(name)

        if any(self.match_model_tensor_name(new_name, key, bid) for key in [
            gguf.MODEL_TENSOR.ATTN_Q,
            gguf.MODEL_TENSOR.ATTN_K,
            gguf.MODEL_TENSOR.ATTN_V,
            gguf.MODEL_TENSOR.ATTN_OUT,
            gguf.MODEL_TENSOR.FFN_UP,
            gguf.MODEL_TENSOR.FFN_DOWN,
            gguf.MODEL_TENSOR.FFN_GATE,
        ]):
            # transform weight into 1/0/-1 (in fp32)
            weight_torch, scale_torch = self.weight_quant(data_torch)
            yield (new_name, weight_torch)
            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
        else:
            yield (new_name, data_torch)
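As an illustrative aside (not part of the commit): `weight_quant` above collapses each eligible tensor to ternary values with a single per-tensor scale. A minimal NumPy sketch of the same arithmetic, using a made-up three-element tensor:

```python
import numpy as np

def weight_quant_sketch(w: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    # mirrors BitnetModel.weight_quant, but with NumPy instead of torch
    s = 1.0 / max(float(np.abs(w).mean()), 1e-5)   # inverse of the mean absolute value
    q = np.clip(np.round(w * s), -1, 1) / s        # snap each weight to {-1/s, 0, +1/s}
    scale = np.abs(q).max(keepdims=True)           # single per-tensor scale
    return np.sign(q), scale                       # ternary weights in {-1, 0, +1} plus the scale

print(weight_quant_sketch(np.array([0.3, -0.7, 0.05])))
# -> (array([ 1., -1.,  0.]), array([0.35]))  (values approximate)
```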
@Model.register("GrokForCausalLM")
class GrokModel(Model):
    model_arch = gguf.MODEL_ARCH.GROK

@@ -2729,6 +2775,124 @@ class DeepseekV2Model(Model):
                raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("T5ForConditionalGeneration")
@Model.register("T5WithLMHeadModel")
class T5Model(Model):
model_arch = gguf.MODEL_ARCH.T5
def set_vocab(self):
# to avoid TypeError: Descriptors cannot be created directly
# exception when importing sentencepiece_model_pb2
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
from sentencepiece import SentencePieceProcessor
from sentencepiece import sentencepiece_model_pb2 as model
tokenizer_path = self.dir_model / 'spiece.model'
if not tokenizer_path.is_file():
raise FileNotFoundError(f"File not found: {tokenizer_path}")
sentencepiece_model = model.ModelProto()
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
tokenizer = SentencePieceProcessor()
tokenizer.LoadFromFile(str(tokenizer_path))
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
for token_id in range(tokenizer.vocab_size()):
piece = tokenizer.IdToPiece(token_id)
text = piece.encode("utf-8")
score = tokenizer.GetScore(token_id)
toktype = SentencePieceTokenTypes.NORMAL
if tokenizer.IsUnknown(token_id):
toktype = SentencePieceTokenTypes.UNKNOWN
elif tokenizer.IsControl(token_id):
toktype = SentencePieceTokenTypes.CONTROL
elif tokenizer.IsUnused(token_id):
toktype = SentencePieceTokenTypes.UNUSED
elif tokenizer.IsByte(token_id):
toktype = SentencePieceTokenTypes.BYTE
tokens[token_id] = text
scores[token_id] = score
toktypes[token_id] = toktype
added_tokens_file = self.dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
added_tokens_json = json.load(f)
for key in added_tokens_json:
token_id = added_tokens_json[key]
if (token_id >= vocab_size):
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
tokens[token_id] = key.encode("utf-8")
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
if vocab_size > len(tokens):
pad_count = vocab_size - len(tokens)
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
for i in range(1, pad_count + 1):
tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.UNUSED)
self.gguf_writer.add_tokenizer_model("t5")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
self.gguf_writer.add_add_space_prefix(add_prefix)
self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
if precompiled_charsmap:
self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
self.gguf_writer.add_add_bos_token(False)
self.gguf_writer.add_add_eos_token(True)
def set_gguf_parameters(self):
self.gguf_writer.add_name("T5")
self.gguf_writer.add_context_length(self.hparams["n_positions"])
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
self.gguf_writer.add_block_count(self.hparams["num_layers"])
self.gguf_writer.add_head_count(self.hparams["num_heads"])
self.gguf_writer.add_key_length(self.hparams["d_kv"])
self.gguf_writer.add_value_length(self.hparams["d_kv"])
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
self.gguf_writer.add_file_type(self.ftype)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
# Sometimes T5 and Flan-T5 based models contain "encoder.embed_tokens.weight" tensor or
# "decoder.embed_tokens.weight" tensors that are duplicates of "shared.weight" tensor
# To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight".
if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight":
logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
return []
return [(self.map_tensor_name(name), data_torch)]
###### CONVERSION LOGIC ######

@@ -2814,10 +2978,44 @@ def parse_args() -> argparse.Namespace:
        "--verbose", action="store_true",
        help="increase output verbosity",
    )
parser.add_argument(
"--split-max-tensors", type=int, default=0,
help="max tensors in each split",
)
parser.add_argument(
"--split-max-size", type=str, default="0",
help="max size per split N(M|G)",
)
parser.add_argument(
"--dry-run", action="store_true",
help="only print out a split plan and exit, without writing any new files",
)
parser.add_argument(
"--no-tensor-first-split", action="store_true",
help="do not add tensors to the first split (disabled by default)"
)
return parser.parse_args()
def split_str_to_n_bytes(split_str: str) -> int:
    if split_str.endswith("K"):
        n = int(split_str[:-1]) * 1000
    elif split_str.endswith("M"):
        n = int(split_str[:-1]) * 1000 * 1000
    elif split_str.endswith("G"):
        n = int(split_str[:-1]) * 1000 * 1000 * 1000
    elif split_str.isnumeric():
        n = int(split_str)
    else:
        raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")

    if n < 0:
        raise ValueError(f"Invalid split size: {split_str}, must be positive")

    return n
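A quick illustration of how the `--split-max-size` value is parsed by `split_str_to_n_bytes` (decimal multipliers, so "K", "M" and "G" are powers of 1000, not 1024); the inputs below are just examples:

```python
print(split_str_to_n_bytes("0"))      # 0  (no size limit)
print(split_str_to_n_bytes("250M"))   # 250000000
print(split_str_to_n_bytes("4G"))     # 4000000000
```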
def main() -> None:
    args = parse_args()

@@ -2850,6 +3048,10 @@ def main() -> None:
        "auto": gguf.LlamaFileType.GUESSED,
    }

+   if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"):
+       logger.error("Error: Cannot use temp file when splitting")
+       sys.exit(1)
+
    if args.outfile is not None:
        fname_out = args.outfile
    else:
@@ -2867,7 +3069,10 @@ def main() -> None:
        logger.error(f"Model {hparams['architectures'][0]} is not supported")
        sys.exit(1)

-   model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
+   model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file,
+                                args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors,
+                                split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
+                                small_first_shard=args.no_tensor_first_split)

    logger.info("Set model parameters")
    model_instance.set_gguf_parameters()
@@ -2878,13 +3083,13 @@ def main() -> None:
    model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)

    if args.vocab_only:
-       logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
+       logger.info("Exporting model vocab...")
        model_instance.write_vocab()
+       logger.info("Model vocab successfully exported.")
    else:
-       logger.info(f"Exporting model to '{model_instance.fname_out}'")
+       logger.info("Exporting model...")
        model_instance.write()
-       logger.info("Model successfully exported.")
+       logger.info(f"Model successfully exported to '{model_instance.fname_out}'")


if __name__ == '__main__':


@@ -19,3 +19,43 @@ llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
```

The above command will output space-separated float values.
## extra parameters
### --embd-normalize $integer$
| $integer$ | description | formula |
|-----------|---------------------|---------|
| $-1$ | none |
| $0$ | max absolute int16 | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$
| $1$ | taxicab | $\Large{x_i \over\sum \lvert x_i\rvert}$
| $2$ | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$
| $>2$ | p-norm | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$
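A small Python sketch of the formulas above may help; this is an illustration only (plain lists of floats, not the llama.cpp API):

```python
import math

def normalize(x: list[float], embd_norm: int = 2) -> list[float]:
    if embd_norm < 0:                                  # -1: no normalisation
        return list(x)
    if embd_norm == 0:                                 # max absolute int16
        denom = max(abs(v) for v in x) / 32760.0
    elif embd_norm == 1:                               # taxicab (L1)
        denom = sum(abs(v) for v in x)
    elif embd_norm == 2:                               # euclidean (L2), the default
        denom = math.sqrt(sum(v * v for v in x))
    else:                                              # p-norm, with p = embd_norm
        denom = sum(abs(v) ** embd_norm for v in x) ** (1 / embd_norm)
    return [v / denom for v in x] if denom > 0 else list(x)
```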
### --embd-output-format $'string'$
| $'string'$ | description | |
|------------|------------------------------|--|
| '' | same as before | (default)
| 'array' | single embeddings | $[[x_1,...,x_n]]$
| | multiple embeddings | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
| 'json' | openai style |
| 'json+' | add cosine similarity matrix |
### --embd-separator $"string"$
| $"string"$ | |
|--------------|-|
| "\n" | (default)
| "<#embSep#>" | for exemple
| "<#sep#>" | other exemple
## examples
### Unix-based systems (Linux, macOS, etc.):
```bash
./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
```
### Windows:
```powershell
embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
```
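For reference, the overall shape of the `--embd-output-format 'json'` output produced by the printing code in this change looks roughly like the sample below (the vector values are shortened and invented purely for illustration); `'json+'` appends a `cosineSimilarity` matrix after the `data` array:

```
{
  "object": "list",
  "data": [
    { "object": "embedding", "index": 0, "embedding": [0.0163021,-0.0051027,0.0209035] },
    { "object": "embedding", "index": 1, "embedding": [0.0147042,-0.0069115,0.0198029] }
  ]
}
```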


@@ -7,13 +7,19 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

-static std::vector<std::string> split_lines(const std::string & s) {
-    std::string line;
+static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
    std::vector<std::string> lines;
-    std::stringstream ss(s);
-    while (std::getline(ss, line)) {
-        lines.push_back(line);
+    size_t start = 0;
+    size_t end = s.find(separator);
+
+    while (end != std::string::npos) {
+        lines.push_back(s.substr(start, end - start));
+        start = end + separator.length();
+        end = s.find(separator, start);
    }
+
+    lines.push_back(s.substr(start)); // Add the last part
+
    return lines;
}
@@ -24,7 +30,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
    }
}

-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
    // clear previous kv_cache values (irrelevant for embeddings)
    llama_kv_cache_clear(ctx);
@@ -44,13 +50,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");

        float * out = output + batch.seq_id[i][0] * n_embd;
-        //TODO: I would also add a parameter here to enable normalization or not.
-        /*fprintf(stdout, "unnormalized_embedding:");
-        for (int hh = 0; hh < n_embd; hh++) {
-            fprintf(stdout, "%9.6f ", embd[hh]);
-        }
-        fprintf(stdout, "\n");*/
-        llama_embd_normalize(embd, out, n_embd);
+        llama_embd_normalize(embd, out, n_embd, embd_norm);
    }
}
@@ -110,7 +110,7 @@ int main(int argc, char ** argv) {
    }

    // split the prompt into lines
-    std::vector<std::string> prompts = split_lines(params.prompt);
+    std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);

    // max batch size
    const uint64_t n_batch = params.n_batch;
@@ -170,7 +170,7 @@ int main(int argc, char ** argv) {
        // encode if at capacity
        if (batch.n_tokens + n_toks > n_batch) {
            float * out = emb + p * n_embd;
-            batch_decode(ctx, batch, out, s, n_embd);
+            batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
            llama_batch_clear(batch);
            p += s;
            s = 0;
@@ -183,29 +183,78 @@
    // final batch
    float * out = emb + p * n_embd;
-    batch_decode(ctx, batch, out, s, n_embd);
+    batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);

-    // print the first part of the embeddings or for a single prompt, the full embedding
-    fprintf(stdout, "\n");
-    for (int j = 0; j < n_prompts; j++) {
-        fprintf(stdout, "embedding %d: ", j);
-        for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
-            fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
-        }
-        fprintf(stdout, "\n");
-    }
-
-    // print cosine similarity matrix
-    if (n_prompts > 1) {
-        fprintf(stdout, "\n");
-        printf("cosine similarity matrix:\n\n");
-        for (int i = 0; i < n_prompts; i++) {
-            for (int j = 0; j < n_prompts; j++) {
-                float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                fprintf(stdout, "%6.2f ", sim);
-            }
-            fprintf(stdout, "\n");
-        }
-    }
+    if (params.embd_out.empty()) {
+        // print the first part of the embeddings or for a single prompt, the full embedding
+        fprintf(stdout, "\n");
+        for (int j = 0; j < n_prompts; j++) {
+            fprintf(stdout, "embedding %d: ", j);
+            for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+                if (params.embd_normalize == 0) {
+                    fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+                } else {
+                    fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+                }
+            }
+            fprintf(stdout, "\n");
+        }
+
+        // print cosine similarity matrix
+        if (n_prompts > 1) {
+            fprintf(stdout, "\n");
+            printf("cosine similarity matrix:\n\n");
+            for (int i = 0; i < n_prompts; i++) {
+                fprintf(stdout, "%6.6s ", prompts[i].c_str());
+            }
+            fprintf(stdout, "\n");
+            for (int i = 0; i < n_prompts; i++) {
+                for (int j = 0; j < n_prompts; j++) {
+                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    fprintf(stdout, "%6.2f ", sim);
+                }
+                fprintf(stdout, "%1.10s", prompts[i].c_str());
+                fprintf(stdout, "\n");
+            }
+        }
+    }
+
+    if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
+        const bool notArray = params.embd_out != "array";
+
+        fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
+        for (int j = 0;;) { // at least one iteration (one prompt)
+            if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
+            fprintf(stdout, "[");
+            for (int i = 0;;) { // at least one iteration (n_embd > 0)
+                fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                i++;
+                if (i < n_embd) fprintf(stdout, ","); else break;
+            }
+            fprintf(stdout, notArray ? "]\n }" : "]");
+            j++;
+            if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
+        }
+        fprintf(stdout, notArray ? "\n ]" : "]\n");
+
+        if (params.embd_out == "json+" && n_prompts > 1) {
+            fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
+            for (int i = 0;;) { // at least two iteration (n_prompts > 1)
+                fprintf(stdout, " [");
+                for (int j = 0;;) { // at least two iteration (n_prompts > 1)
+                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    fprintf(stdout, "%6.2f", sim);
+                    j++;
+                    if (j < n_prompts) fprintf(stdout, ", "); else break;
+                }
+                fprintf(stdout, " ]");
+                i++;
+                if (i < n_prompts) fprintf(stdout, ",\n"); else break;
+            }
+            fprintf(stdout, "\n ]");
+        }
+
+        if (notArray) fprintf(stdout, "\n}\n");
    }

    // clean up


@@ -634,12 +634,12 @@ return html`
          <div>
            <div class="grammar">
              <label for="template"></label>
-             <textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON-Scheme + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
+             <textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON Schema + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
            </div>
            <div class="grammar-columns">
              <div class="json-schema-controls">
                <input type="text" name="prop-order" placeholder="Order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
-               <button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON-Scheme</button>
+               <button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
              </div>
            </div>
          </div>


@@ -643,7 +643,7 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
    static constexpr int qi = QI3_S;
};

-static int get_mmq_x_max_host(const int cc) {
+static constexpr int get_mmq_x_max_host(int cc) {
#ifdef CUDA_USE_TENSOR_CORES
    return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
#else

@@ -652,7 +652,7 @@ static int get_mmq_x_max_host(const int cc) {
}

// Round rows to this value for --split-mode row:
-static int get_mmq_y_host(const int cc) {
+static constexpr int get_mmq_y_host(int cc) {
    return cc >= CC_VOLTA ? 128 : 64;
}


@@ -20,6 +20,20 @@ struct mma_int_A_I16K4 {
        GGML_CUDA_ASSUME(ret < K);
        return ret;
    }
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
#if defined(INT8_MMA_AVAILABLE)
const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
: "+r"(x[0]), "+r"(x[1])
: "l"(xs));
#else
#pragma unroll
for (int l = 0; l < ne; ++l) {
x[l] = xs0[get_i(l)*stride + get_k(l)];
}
#endif // defined(INT8_MMA_AVAILABLE)
}
};

struct mma_int_A_I16K8 {

@@ -42,6 +56,20 @@ struct mma_int_A_I16K8 {
        GGML_CUDA_ASSUME(ret < K);
        return ret;
    }
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
#if defined(INT8_MMA_AVAILABLE)
const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
asm("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
: "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
: "l"(xs));
#else
#pragma unroll
for (int l = 0; l < ne; ++l) {
x[l] = xs0[get_i(l)*stride + get_k(l)];
}
#endif // defined(INT8_MMA_AVAILABLE)
}
};

struct mma_int_B_J8K4 {

@@ -64,6 +92,20 @@ struct mma_int_B_J8K4 {
        GGML_CUDA_ASSUME(ret < K);
        return ret;
    }
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
#if defined(INT8_MMA_AVAILABLE) && false // Loading as 4 byte values is faster
const int * xs = xs0 + (threadIdx.x%J)*stride;
asm("ldmatrix.sync.aligned.m8n8.x1.b16 {%0}, [%1];"
: "+r"(x[0])
: "l"(xs));
#else
#pragma unroll
for (int l = 0; l < ne; ++l) {
x[l] = xs0[get_j(l)*stride + get_k(l)];
}
#endif // defined(INT8_MMA_AVAILABLE)
}
};

struct mma_int_B_J8K8 {

@@ -86,6 +128,20 @@ struct mma_int_B_J8K8 {
        GGML_CUDA_ASSUME(ret < K);
        return ret;
    }
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
#if defined(INT8_MMA_AVAILABLE) && false // Loading as 4 byte values is faster
const int * xs = xs0 + (threadIdx.x%J)*stride + ((threadIdx.x/J)*(K/2)) % K;
asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
: "+r"(x[0]), "+r"(x[1])
: "l"(xs));
#else
#pragma unroll
for (int l = 0; l < ne; ++l) {
x[l] = xs0[get_j(l)*stride + get_k(l)];
}
#endif // defined(INT8_MMA_AVAILABLE)
}
};

struct mma_int_C_I16J8 {

File diff suppressed because it is too large.

File diff suppressed because it is too large.

File diff suppressed because it is too large.

ggml.c — 1352 changed lines. File diff suppressed because it is too large.

ggml.h — 35 changed lines.

@@ -591,11 +591,7 @@ extern "C" {
        struct ggml_tensor * grad;
        struct ggml_tensor * src[GGML_MAX_SRC];

-       // performance
-       int     perf_runs;
-       int64_t perf_cycles;
-       int64_t perf_time_us;
+       // source tensor and offset for views
        struct ggml_tensor * view_src;
        size_t               view_offs;

@@ -605,7 +601,7 @@ extern "C" {
        void * extra; // extra things e.g. for ggml-cuda.cu

-       char padding[8];
+       // char padding[4];
    };

    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

@@ -652,11 +648,6 @@ extern "C" {
        struct ggml_hash_set visited_hash_table;

        enum ggml_cgraph_eval_order order;
-
-       // performance
-       int     perf_runs;
-       int64_t perf_cycles;
-       int64_t perf_time_us;
    };

    // scratch buffer

@@ -673,28 +664,6 @@ extern "C" {
        bool no_alloc; // don't allocate memory for the tensor data
    };

-   // compute types
-
-   // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
-   // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
-   enum ggml_task_type {
-       GGML_TASK_TYPE_INIT = 0,
-       GGML_TASK_TYPE_COMPUTE,
-       GGML_TASK_TYPE_FINALIZE,
-   };
-
-   struct ggml_compute_params {
-       enum ggml_task_type type;
-
-       // ith = thread index, nth = number of threads
-       int ith, nth;
-
-       // work buffer for all threads
-       size_t wsize;
-       void * wdata;
-   };
-
    // numa strategies
    enum ggml_numa_strategy {
        GGML_NUMA_STRATEGY_DISABLED = 0,


@@ -49,6 +49,7 @@ class Keys:
        EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
        POOLING_TYPE = "{arch}.pooling_type"
        LOGIT_SCALE = "{arch}.logit_scale"
+       DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"

    class Attention:
        HEAD_COUNT = "{arch}.attention.head_count"

@@ -62,6 +63,7 @@ class Keys:
        CAUSAL = "{arch}.attention.causal"
        Q_LORA_RANK = "{arch}.attention.q_lora_rank"
        KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
+       REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"

    class Rope:
        DIMENSION_COUNT = "{arch}.rope.dimension_count"

@@ -73,6 +75,11 @@ class Keys:
        SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
        SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"

+   class Split:
+       LLM_KV_SPLIT_NO = "split.no"
+       LLM_KV_SPLIT_COUNT = "split.count"
+       LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"
+
    class SSM:
        CONV_KERNEL = "{arch}.ssm.conv_kernel"
        INNER_SIZE = "{arch}.ssm.inner_size"
@@ -80,33 +87,35 @@
        TIME_STEP_RANK = "{arch}.ssm.time_step_rank"

    class Tokenizer:
        MODEL = "tokenizer.ggml.model"
        PRE = "tokenizer.ggml.pre"
        LIST = "tokenizer.ggml.tokens"
        TOKEN_TYPE = "tokenizer.ggml.token_type"
        TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count"  # for BERT-style token types
        SCORES = "tokenizer.ggml.scores"
        MERGES = "tokenizer.ggml.merges"
        BOS_ID = "tokenizer.ggml.bos_token_id"
        EOS_ID = "tokenizer.ggml.eos_token_id"
        UNK_ID = "tokenizer.ggml.unknown_token_id"
        SEP_ID = "tokenizer.ggml.seperator_token_id"
        PAD_ID = "tokenizer.ggml.padding_token_id"
        CLS_ID = "tokenizer.ggml.cls_token_id"
        MASK_ID = "tokenizer.ggml.mask_token_id"
        ADD_BOS = "tokenizer.ggml.add_bos_token"
        ADD_EOS = "tokenizer.ggml.add_eos_token"
        ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
+       REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
+       PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
        HF_JSON = "tokenizer.huggingface.json"
        RWKV = "tokenizer.rwkv.world"
        CHAT_TEMPLATE = "tokenizer.chat_template"
        CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
        CHAT_TEMPLATES = "tokenizer.chat_templates"
        # FIM/Infill special tokens constants
        PREFIX_ID = "tokenizer.ggml.prefix_token_id"
        SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
        MIDDLE_ID = "tokenizer.ggml.middle_token_id"
        EOT_ID = "tokenizer.ggml.eot_token_id"

#
@@ -115,91 +124,123 @@
class MODEL_ARCH(IntEnum):
    LLAMA = auto()
    FALCON = auto()
    BAICHUAN = auto()
    GROK = auto()
    GPT2 = auto()
    GPTJ = auto()
    GPTNEOX = auto()
    MPT = auto()
    STARCODER = auto()
    REFACT = auto()
    BERT = auto()
    NOMIC_BERT = auto()
    JINA_BERT_V2 = auto()
    BLOOM = auto()
    STABLELM = auto()
    QWEN = auto()
    QWEN2 = auto()
    QWEN2MOE = auto()
    PHI2 = auto()
    PHI3 = auto()
    PLAMO = auto()
    CODESHELL = auto()
    ORION = auto()
    INTERNLM2 = auto()
    MINICPM = auto()
    GEMMA = auto()
    STARCODER2 = auto()
    MAMBA = auto()
    XVERSE = auto()
    COMMAND_R = auto()
    DBRX = auto()
    OLMO = auto()
    ARCTIC = auto()
    DEEPSEEK2 = auto()
BITNET = auto()
T5 = auto()
class MODEL_TENSOR(IntEnum):
    TOKEN_EMBD = auto()
    TOKEN_EMBD_NORM = auto()
    TOKEN_TYPES = auto()
    POS_EMBD = auto()
    OUTPUT = auto()
    OUTPUT_NORM = auto()
    ROPE_FREQS = auto()
    ROPE_FACTORS_LONG = auto()
    ROPE_FACTORS_SHORT = auto()
    ATTN_Q = auto()
    ATTN_K = auto()
    ATTN_V = auto()
    ATTN_QKV = auto()
    ATTN_OUT = auto()
    ATTN_NORM = auto()
    ATTN_NORM_2 = auto()
    ATTN_OUT_NORM = auto()
    ATTN_ROT_EMBD = auto()
    FFN_GATE_INP = auto()
    FFN_GATE_INP_SHEXP = auto()
    FFN_NORM = auto()
    FFN_GATE = auto()
    FFN_DOWN = auto()
    FFN_UP = auto()
    FFN_ACT = auto()
    FFN_NORM_EXP = auto()
    FFN_GATE_EXP = auto()
    FFN_DOWN_EXP = auto()
    FFN_UP_EXP = auto()
    FFN_GATE_SHEXP = auto()
    FFN_DOWN_SHEXP = auto()
    FFN_UP_SHEXP = auto()
    ATTN_Q_NORM = auto()
    ATTN_K_NORM = auto()
    LAYER_OUT_NORM = auto()
    SSM_IN = auto()
    SSM_CONV1D = auto()
    SSM_X = auto()
    SSM_DT = auto()
    SSM_A = auto()
    SSM_D = auto()
    SSM_OUT = auto()
    ATTN_Q_A = auto()
    ATTN_Q_B = auto()
    ATTN_KV_A_MQA = auto()
    ATTN_KV_B = auto()
    ATTN_Q_A_NORM = auto()
    ATTN_KV_A_NORM = auto()
FFN_SUB_NORM = auto()
ATTN_SUB_NORM = auto()
DEC_ATTN_NORM = auto()
DEC_ATTN_Q = auto()
DEC_ATTN_K = auto()
DEC_ATTN_V = auto()
DEC_ATTN_OUT = auto()
DEC_ATTN_REL_B = auto()
DEC_CROSS_ATTN_NORM = auto()
DEC_CROSS_ATTN_Q = auto()
DEC_CROSS_ATTN_K = auto()
DEC_CROSS_ATTN_V = auto()
DEC_CROSS_ATTN_OUT = auto()
DEC_CROSS_ATTN_REL_B = auto()
DEC_FFN_NORM = auto()
DEC_FFN_GATE = auto()
DEC_FFN_DOWN = auto()
DEC_FFN_UP = auto()
DEC_OUTPUT_NORM = auto()
ENC_ATTN_NORM = auto()
ENC_ATTN_Q = auto()
ENC_ATTN_K = auto()
ENC_ATTN_V = auto()
ENC_ATTN_OUT = auto()
ENC_ATTN_REL_B = auto()
ENC_FFN_NORM = auto()
ENC_FFN_GATE = auto()
ENC_FFN_DOWN = auto()
ENC_FFN_UP = auto()
ENC_OUTPUT_NORM = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {

@@ -237,57 +278,89 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.OLMO: "olmo",
    MODEL_ARCH.ARCTIC: "arctic",
    MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.BITNET: "bitnet",
MODEL_ARCH.T5: "t5",
}

TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.TOKEN_EMBD: "token_embd",
    MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
    MODEL_TENSOR.TOKEN_TYPES: "token_types",
    MODEL_TENSOR.POS_EMBD: "position_embd",
    MODEL_TENSOR.OUTPUT_NORM: "output_norm",
    MODEL_TENSOR.OUTPUT: "output",
    MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
    MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
    MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
    MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
    MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
    MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
    MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
    MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
    MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
    MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
    MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
    MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
    MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
    MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
    MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
    MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
    MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
    MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
    MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
    MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
    MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
    MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
    MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
    MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
    MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
    MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
    MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
    MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
    MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
    MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
    MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
    MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
    MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
    MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
    MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
    MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
    MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
    MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
    MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
    MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
    MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
    MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q",
MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k",
MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v",
MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o",
MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b",
MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm",
MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q",
MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k",
MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v",
MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o",
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm",
MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate",
MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down",
MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up",
MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm",
MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm",
MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q",
MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k",
MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v",
MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o",
MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b",
MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm",
MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate",
MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
}
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {

@@ -808,6 +881,53 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN_SHEXP,
        MODEL_TENSOR.FFN_UP_SHEXP,
    ],
MODEL_ARCH.BITNET: [
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_SUB_NORM,
MODEL_TENSOR.FFN_SUB_NORM,
],
MODEL_ARCH.T5: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.DEC_ATTN_NORM,
MODEL_TENSOR.DEC_ATTN_Q,
MODEL_TENSOR.DEC_ATTN_K,
MODEL_TENSOR.DEC_ATTN_V,
MODEL_TENSOR.DEC_ATTN_OUT,
MODEL_TENSOR.DEC_ATTN_REL_B,
MODEL_TENSOR.DEC_CROSS_ATTN_NORM,
MODEL_TENSOR.DEC_CROSS_ATTN_Q,
MODEL_TENSOR.DEC_CROSS_ATTN_K,
MODEL_TENSOR.DEC_CROSS_ATTN_V,
MODEL_TENSOR.DEC_CROSS_ATTN_OUT,
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B,
MODEL_TENSOR.DEC_FFN_NORM,
MODEL_TENSOR.DEC_FFN_GATE,
MODEL_TENSOR.DEC_FFN_DOWN,
MODEL_TENSOR.DEC_FFN_UP,
MODEL_TENSOR.DEC_OUTPUT_NORM,
MODEL_TENSOR.ENC_ATTN_NORM,
MODEL_TENSOR.ENC_ATTN_Q,
MODEL_TENSOR.ENC_ATTN_K,
MODEL_TENSOR.ENC_ATTN_V,
MODEL_TENSOR.ENC_ATTN_OUT,
MODEL_TENSOR.ENC_ATTN_REL_B,
MODEL_TENSOR.ENC_FFN_NORM,
MODEL_TENSOR.ENC_FFN_GATE,
MODEL_TENSOR.ENC_FFN_DOWN,
MODEL_TENSOR.ENC_FFN_UP,
MODEL_TENSOR.ENC_OUTPUT_NORM,
],
    # TODO
}


@@ -7,6 +7,7 @@ import struct
import tempfile
from dataclasses import dataclass
from enum import Enum, auto
+from pathlib import Path
from io import BufferedWriter
from typing import IO, Any, Sequence, Mapping
from string import ascii_letters, digits

@@ -31,6 +32,9 @@ from .quants import quant_shape_from_byte_shape
logger = logging.getLogger(__name__)

+SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"
+

@dataclass
class TensorInfo:
    shape: Sequence[int]
@@ -55,11 +59,11 @@ class WriterState(Enum):

class GGUFWriter:
-   fout: BufferedWriter | None
-   path: os.PathLike[str] | str | None
+   fout: list[BufferedWriter] | None
+   path: Path | None
    temp_file: tempfile.SpooledTemporaryFile[bytes] | None
-   tensors: dict[str, TensorInfo]
-   kv_data: dict[str, GGUFValue]
+   tensors: list[dict[str, TensorInfo]]
+   kv_data: list[dict[str, GGUFValue]]
    state: WriterState

    _simple_value_packing = {
        GGUFValueType.UINT8: "B",
@@ -76,26 +80,38 @@
    }

    def __init__(
-       self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False,
-       endianess: GGUFEndian = GGUFEndian.LITTLE,
+       self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE,
+       split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False
    ):
        self.fout = None
-       self.path = path
+       self.path = Path(path) if path else None
        self.arch = arch
        self.endianess = endianess
        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
        self.use_temp_file = use_temp_file
        self.temp_file = None
-       self.tensors = dict()
-       self.kv_data = dict()
+       self.tensors = [{}]
+       self.kv_data = [{}]
+       self.split_max_tensors = split_max_tensors
+       self.split_max_size = split_max_size
+       self.dry_run = dry_run
+       self.small_first_shard = small_first_shard
        logger.info("gguf: This GGUF file is for {0} Endian only".format(
            "Big" if self.endianess == GGUFEndian.BIG else "Little",
        ))
        self.state = WriterState.NO_FILE

+       if self.small_first_shard:
+           self.tensors.append({})
+
        self.add_architecture()
-   def open_output_file(self, path: os.PathLike[str] | str | None = None) -> None:
+   def format_shard_names(self, path: Path) -> list[Path]:
+       if len(self.tensors) == 1:
+           return [path]
+       return [path.with_name(SHARD_NAME_FORMAT.format(path.stem, i + 1, len(self.tensors))) for i in range(len(self.tensors))]
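As an illustrative note on the sharding scheme (not part of the diff): with `SHARD_NAME_FORMAT` defined above, a writer that ends up with three tensor groups turns an output path such as `model.gguf` into numbered shard files. A hypothetical Python session:

```python
from pathlib import Path

SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"

path = Path("model.gguf")
n_shards = 3
print([path.with_name(SHARD_NAME_FORMAT.format(path.stem, i + 1, n_shards)) for i in range(n_shards)])
# -> [PosixPath('model-00001-of-00003.gguf'), PosixPath('model-00002-of-00003.gguf'), PosixPath('model-00003-of-00003.gguf')]
```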
def open_output_file(self, path: Path | None = None) -> None:
        if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path):
            # allow calling this multiple times as long as the path is the same
            return
@@ -106,22 +122,58 @@
        self.path = path

        if self.path is not None:
-           if self.fout is not None:
-               self.fout.close()
-           self.fout = open(self.path, "wb")
+           filenames = self.print_plan()
+           self.fout = [open(filename, "wb") for filename in filenames]
            self.state = WriterState.EMPTY

-   def write_header_to_file(self, path: os.PathLike[str] | str | None = None) -> None:
+   def print_plan(self) -> list[Path]:
logger.info("Writing the following files:")
assert self.path is not None
filenames = self.format_shard_names(self.path)
assert len(filenames) == len(self.tensors)
for name, tensors in zip(filenames, self.tensors):
logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}")
if self.dry_run:
logger.info("Dry run, not writing files")
exit()
return filenames
def add_shard_kv_data(self) -> None:
if len(self.tensors) == 1:
return
total_tensors = sum(len(t) for t in self.tensors)
assert self.fout is not None
total_splits = len(self.fout)
self.kv_data.extend({} for _ in range(len(self.kv_data), total_splits))
for i, kv_data in enumerate(self.kv_data):
kv_data[Keys.Split.LLM_KV_SPLIT_NO] = GGUFValue(i, GGUFValueType.UINT16)
kv_data[Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(total_splits, GGUFValueType.UINT16)
kv_data[Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(total_tensors, GGUFValueType.INT32)
def write_header_to_file(self, path: Path | None = None) -> None:
if len(self.tensors) == 1 and (self.split_max_tensors != 0 or self.split_max_size != 0):
logger.warning("Model fails split requirements, not splitting")
        self.open_output_file(path)

        if self.state is not WriterState.EMPTY:
            raise ValueError(f'Expected output file to be empty, got {self.state}')

-       self._write_packed("<I", GGUF_MAGIC, skip_pack_prefix = True)
-       self._write_packed("I", GGUF_VERSION)
-       self._write_packed("Q", len(self.tensors))
-       self._write_packed("Q", len(self.kv_data))
-       self.flush()
+       assert self.fout is not None
+       assert len(self.fout) == len(self.tensors)
+       assert len(self.kv_data) == 1
+
+       self.add_shard_kv_data()
for fout, tensors, kv_data in zip(self.fout, self.tensors, self.kv_data):
fout.write(self._pack("<I", GGUF_MAGIC, skip_pack_prefix = True))
fout.write(self._pack("I", GGUF_VERSION))
fout.write(self._pack("Q", len(tensors)))
fout.write(self._pack("Q", len(kv_data)))
fout.flush()
        self.state = WriterState.HEADER

    def write_kv_data_to_file(self) -> None:

@@ -129,13 +181,15 @@
            raise ValueError(f'Expected output file to contain the header, got {self.state}')
        assert self.fout is not None

-       kv_data = bytearray()
-
-       for key, val in self.kv_data.items():
-           kv_data += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
-           kv_data += self._pack_val(val.value, val.type, add_vtype=True)
-
-       self.fout.write(kv_data)
+       for fout, kv_data in zip(self.fout, self.kv_data):
+           kv_bytes = bytearray()
+
+           for key, val in kv_data.items():
+               kv_bytes += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
+               kv_bytes += self._pack_val(val.value, val.type, add_vtype=True)
+
+           fout.write(kv_bytes)

        self.flush()
        self.state = WriterState.KV_DATA
@ -144,28 +198,29 @@ class GGUFWriter:
raise ValueError(f'Expected output file to contain KV data, got {self.state}') raise ValueError(f'Expected output file to contain KV data, got {self.state}')
assert self.fout is not None assert self.fout is not None
ti_data = bytearray() for fout, tensors in zip(self.fout, self.tensors):
offset_tensor = 0 ti_data = bytearray()
offset_tensor = 0
for name, ti in self.tensors.items(): for name, ti in tensors.items():
ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False) ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False)
n_dims = len(ti.shape) n_dims = len(ti.shape)
ti_data += self._pack("I", n_dims) ti_data += self._pack("I", n_dims)
for i in range(n_dims): for j in range(n_dims):
ti_data += self._pack("Q", ti.shape[n_dims - 1 - i]) ti_data += self._pack("Q", ti.shape[n_dims - 1 - j])
ti_data += self._pack("I", ti.dtype) ti_data += self._pack("I", ti.dtype)
ti_data += self._pack("Q", offset_tensor) ti_data += self._pack("Q", offset_tensor)
offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment) offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment)
self.fout.write(ti_data) fout.write(ti_data)
self.flush() fout.flush()
self.state = WriterState.TI_DATA self.state = WriterState.TI_DATA
def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None: def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None:
if key in self.kv_data: if any(key in kv_data for kv_data in self.kv_data):
raise ValueError(f'Duplicated key name {key!r}') raise ValueError(f'Duplicated key name {key!r}')
self.kv_data[key] = GGUFValue(value=val, type=vtype) self.kv_data[0][key] = GGUFValue(value=val, type=vtype)
def add_uint8(self, key: str, val: int) -> None: def add_uint8(self, key: str, val: int) -> None:
self.add_key_value(key,val, GGUFValueType.UINT8) self.add_key_value(key,val, GGUFValueType.UINT8)
@ -206,9 +261,6 @@ class GGUFWriter:
self.add_key_value(key, val, GGUFValueType.STRING) self.add_key_value(key, val, GGUFValueType.STRING)
def add_array(self, key: str, val: Sequence[Any]) -> None: def add_array(self, key: str, val: Sequence[Any]) -> None:
if not isinstance(val, Sequence):
raise ValueError("Value must be a sequence for array type")
self.add_key_value(key, val, GGUFValueType.ARRAY) self.add_key_value(key, val, GGUFValueType.ARRAY)
@staticmethod @staticmethod
@ -222,7 +274,7 @@ class GGUFWriter:
if self.state is not WriterState.NO_FILE: if self.state is not WriterState.NO_FILE:
raise ValueError(f'Expected output file to be not yet opened, got {self.state}') raise ValueError(f'Expected output file to be not yet opened, got {self.state}')
if name in self.tensors: if any(name in tensors for tensors in self.tensors):
raise ValueError(f'Duplicated tensor name {name!r}') raise ValueError(f'Duplicated tensor name {name!r}')
if raw_dtype is None: if raw_dtype is None:
@ -247,7 +299,18 @@ class GGUFWriter:
if tensor_dtype == np.uint8: if tensor_dtype == np.uint8:
tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype) tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
self.tensors[name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes) # make sure there is at least one tensor before splitting
if len(self.tensors[-1]) > 0:
if ( # split when over tensor limit
self.split_max_tensors != 0
and len(self.tensors[-1]) >= self.split_max_tensors
) or ( # split when over size limit
self.split_max_size != 0
and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size
):
self.tensors.append({})
self.tensors[-1][name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes)
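The split decision above can be read in isolation: a new shard is only started once the current shard holds at least one tensor, and either the tensor-count or the byte-size limit would be exceeded. A small sketch of the same rule with hypothetical names:

def needs_new_shard(n_tensors_in_shard: int, bytes_in_shard: int, next_tensor_bytes: int,
                    split_max_tensors: int, split_max_size: int) -> bool:
    if n_tensors_in_shard == 0:
        return False  # never split before the current shard has at least one tensor
    over_tensor_limit = split_max_tensors != 0 and n_tensors_in_shard >= split_max_tensors
    over_size_limit = split_max_size != 0 and bytes_in_shard + next_tensor_bytes > split_max_size
    return over_tensor_limit or over_size_limit

# needs_new_shard(128, 0, 0, split_max_tensors=128, split_max_size=0) -> True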
def add_tensor( def add_tensor(
self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
@ -264,7 +327,7 @@ class GGUFWriter:
self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype) self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype)
if self.temp_file is None: if self.temp_file is None:
self.tensors[name].tensor = tensor self.tensors[-1][name].tensor = tensor
return return
tensor.tofile(self.temp_file) tensor.tofile(self.temp_file)
@ -282,9 +345,24 @@ class GGUFWriter:
if self.endianess == GGUFEndian.BIG: if self.endianess == GGUFEndian.BIG:
tensor.byteswap(inplace=True) tensor.byteswap(inplace=True)
self.write_padding(self.fout, self.fout.tell())
tensor.tofile(self.fout) file_id = -1
self.write_padding(self.fout, tensor.nbytes) for i, tensors in enumerate(self.tensors):
if len(tensors) > 0:
file_id = i
break
fout = self.fout[file_id]
# pop the first tensor info
# TODO: cleaner way to get the first key
first_tensor_name = [name for name, _ in zip(self.tensors[file_id].keys(), range(1))][0]
ti = self.tensors[file_id].pop(first_tensor_name)
assert ti.nbytes == tensor.nbytes
self.write_padding(fout, fout.tell())
tensor.tofile(fout)
self.write_padding(fout, tensor.nbytes)
self.state = WriterState.WEIGHTS self.state = WriterState.WEIGHTS
@ -293,31 +371,43 @@ class GGUFWriter:
assert self.fout is not None assert self.fout is not None
self.write_padding(self.fout, self.fout.tell()) for fout in self.fout:
self.write_padding(fout, fout.tell())
if self.temp_file is None: if self.temp_file is None:
shard_bar = None
bar = None bar = None
if progress: if progress:
from tqdm import tqdm from tqdm import tqdm
total_bytes = sum(t.nbytes for t in self.tensors.values()) total_bytes = sum(ti.nbytes for t in self.tensors for ti in t.values())
if len(self.fout) > 1:
shard_bar = tqdm(desc=f"Shard (0/{len(self.fout)})", total=None, unit="byte", unit_scale=True)
bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
# relying on the fact that Python dicts preserve insertion order (since 3.7) for i, (fout, tensors) in enumerate(zip(self.fout, self.tensors)):
for ti in self.tensors.values(): if shard_bar is not None:
assert ti.tensor is not None # can only iterate once over the tensors shard_bar.set_description(f"Shard ({i + 1}/{len(self.fout)})")
assert ti.tensor.nbytes == ti.nbytes total = sum(ti.nbytes for ti in tensors.values())
ti.tensor.tofile(self.fout) shard_bar.reset(total=(total if total > 0 else None))
if bar is not None:
bar.update(ti.nbytes) # relying on the fact that Python dicts preserve insertion order (since 3.7)
self.write_padding(self.fout, ti.nbytes) for ti in tensors.values():
ti.tensor = None assert ti.tensor is not None # can only iterate once over the tensors
assert ti.tensor.nbytes == ti.nbytes
ti.tensor.tofile(fout)
if shard_bar is not None:
shard_bar.update(ti.nbytes)
if bar is not None:
bar.update(ti.nbytes)
self.write_padding(fout, ti.nbytes)
ti.tensor = None
else: else:
self.temp_file.seek(0) self.temp_file.seek(0)
shutil.copyfileobj(self.temp_file, self.fout) shutil.copyfileobj(self.temp_file, self.fout[0 if not self.small_first_shard else 1])
self.flush() self.flush()
self.temp_file.close() self.temp_file.close()
@ -325,11 +415,13 @@ class GGUFWriter:
def flush(self) -> None: def flush(self) -> None:
assert self.fout is not None assert self.fout is not None
self.fout.flush() for fout in self.fout:
fout.flush()
def close(self) -> None: def close(self) -> None:
if self.fout is not None: if self.fout is not None:
self.fout.close() for fout in self.fout:
fout.close()
self.fout = None self.fout = None
def add_architecture(self) -> None: def add_architecture(self) -> None:
@ -400,6 +492,9 @@ class GGUFWriter:
def add_parallel_residual(self, use: bool) -> None: def add_parallel_residual(self, use: bool) -> None:
self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use) self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
def add_decoder_start_token_id(self, id: int) -> None:
self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id)
def add_head_count(self, count: int) -> None: def add_head_count(self, count: int) -> None:
self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count) self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
@ -448,6 +543,9 @@ class GGUFWriter:
def add_kv_lora_rank(self, length: int) -> None: def add_kv_lora_rank(self, length: int) -> None:
self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length) self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)
def add_relative_attn_buckets_count(self, value: int) -> None:
self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value)
def add_pooling_type(self, value: PoolingType) -> None: def add_pooling_type(self, value: PoolingType) -> None:
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value) self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
@ -538,6 +636,12 @@ class GGUFWriter:
def add_add_space_prefix(self, value: bool) -> None: def add_add_space_prefix(self, value: bool) -> None:
self.add_bool(Keys.Tokenizer.ADD_PREFIX, value) self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
def add_remove_extra_whitespaces(self, value: bool) -> None:
self.add_bool(Keys.Tokenizer.REMOVE_EXTRA_WS, value)
def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)
def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
if not isinstance(value, str): if not isinstance(value, str):
template_default = None template_default = None
@ -599,9 +703,12 @@ class GGUFWriter:
kv_data += self._pack("Q", len(encoded_val)) kv_data += self._pack("Q", len(encoded_val))
kv_data += encoded_val kv_data += encoded_val
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val: elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
ltype = GGUFValueType.get_type(val[0]) if isinstance(val, bytes):
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]): ltype = GGUFValueType.UINT8
raise ValueError("All items in a GGUF array should be of the same type") else:
ltype = GGUFValueType.get_type(val[0])
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
raise ValueError("All items in a GGUF array should be of the same type")
kv_data += self._pack("I", ltype) kv_data += self._pack("I", ltype)
kv_data += self._pack("Q", len(val)) kv_data += self._pack("Q", len(val))
for item in val: for item in val:
@ -611,6 +718,13 @@ class GGUFWriter:
return kv_data return kv_data
def _write_packed(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None: @staticmethod
assert self.fout is not None def format_n_bytes_to_str(num: int) -> str:
self.fout.write(self._pack(fmt, value, skip_pack_prefix)) if num == 0:
return "negligible - metadata only"
fnum = float(num)
for unit in ("", "K", "M", "G"):
if abs(fnum) < 1000.0:
return f"{fnum:3.1f}{unit}"
fnum /= 1000.0
return f"{fnum:.1f}T - over 1TB, split recommended"
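A few illustrative calls, assuming the writer class from the installed gguf package exposes this static helper:

from gguf import GGUFWriter

assert GGUFWriter.format_n_bytes_to_str(0) == "negligible - metadata only"
assert GGUFWriter.format_n_bytes_to_str(512) == "512.0"
assert GGUFWriter.format_n_bytes_to_str(7_400_000_000) == "7.4G"
assert GGUFWriter.format_n_bytes_to_str(2_000_000_000_000) == "2.0T - over 1TB, split recommended"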
View file
@ -24,6 +24,7 @@ class TensorNameMap:
"backbone.embedding", # mamba "backbone.embedding", # mamba
"backbone.embeddings", # mamba-hf "backbone.embeddings", # mamba-hf
"transformer.in_out_embed", # Grok "transformer.in_out_embed", # Grok
"shared", # t5
), ),
# Token type embeddings # Token type embeddings
@ -413,6 +414,128 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_KV_A_NORM: ( MODEL_TENSOR.ATTN_KV_A_NORM: (
"model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2 "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
), ),
MODEL_TENSOR.ATTN_SUB_NORM: (
"model.layers.{bid}.self_attn.inner_attn_ln", # bitnet
),
MODEL_TENSOR.FFN_SUB_NORM: (
"model.layers.{bid}.mlp.ffn_layernorm", # bitnet
),
MODEL_TENSOR.DEC_ATTN_NORM: (
"decoder.block.{bid}.layer.0.layer_norm", # t5
),
MODEL_TENSOR.DEC_ATTN_Q: (
"decoder.block.{bid}.layer.0.SelfAttention.q", # t5
),
MODEL_TENSOR.DEC_ATTN_K: (
"decoder.block.{bid}.layer.0.SelfAttention.k", # t5
),
MODEL_TENSOR.DEC_ATTN_V: (
"decoder.block.{bid}.layer.0.SelfAttention.v", # t5
),
MODEL_TENSOR.DEC_ATTN_OUT: (
"decoder.block.{bid}.layer.0.SelfAttention.o", # t5
),
MODEL_TENSOR.DEC_ATTN_REL_B: (
"decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_NORM: (
"decoder.block.{bid}.layer.1.layer_norm", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_Q: (
"decoder.block.{bid}.layer.1.EncDecAttention.q", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_K: (
"decoder.block.{bid}.layer.1.EncDecAttention.k", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_V: (
"decoder.block.{bid}.layer.1.EncDecAttention.v", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_OUT: (
"decoder.block.{bid}.layer.1.EncDecAttention.o", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: (
"decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5
),
MODEL_TENSOR.DEC_FFN_NORM: (
"decoder.block.{bid}.layer.2.layer_norm", # t5
),
MODEL_TENSOR.DEC_FFN_GATE: (
"decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5
),
MODEL_TENSOR.DEC_FFN_UP: (
"decoder.block.{bid}.layer.2.DenseReluDense.wi", # t5
"decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5
),
MODEL_TENSOR.DEC_FFN_DOWN: (
"decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5
),
MODEL_TENSOR.DEC_OUTPUT_NORM: (
"decoder.final_layer_norm", # t5
),
MODEL_TENSOR.ENC_ATTN_NORM: (
"encoder.block.{bid}.layer.0.layer_norm", # t5
),
MODEL_TENSOR.ENC_ATTN_Q: (
"encoder.block.{bid}.layer.0.SelfAttention.q", # t5
),
MODEL_TENSOR.ENC_ATTN_K: (
"encoder.block.{bid}.layer.0.SelfAttention.k", # t5
),
MODEL_TENSOR.ENC_ATTN_V: (
"encoder.block.{bid}.layer.0.SelfAttention.v", # t5
),
MODEL_TENSOR.ENC_ATTN_OUT: (
"encoder.block.{bid}.layer.0.SelfAttention.o", # t5
),
MODEL_TENSOR.ENC_ATTN_REL_B: (
"encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
),
MODEL_TENSOR.ENC_FFN_NORM: (
"encoder.block.{bid}.layer.1.layer_norm", # t5
),
MODEL_TENSOR.ENC_FFN_GATE: (
"encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5
),
MODEL_TENSOR.ENC_FFN_UP: (
"encoder.block.{bid}.layer.1.DenseReluDense.wi", # t5
"encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5
),
MODEL_TENSOR.ENC_FFN_DOWN: (
"encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
),
MODEL_TENSOR.ENC_OUTPUT_NORM: (
"encoder.final_layer_norm", # t5
),
} }
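The new entries follow the existing {bid} template convention: one GGUF base name maps to one or more per-block Hugging Face names. A minimal resolution sketch with a hypothetical two-entry subset of the table (the GGUF-side names here are illustrative):

# hypothetical subset: GGUF per-block name template -> HF name templates
T5_BLOCK_MAPPINGS = {
    "dec.blk.{bid}.attn_q": ("decoder.block.{bid}.layer.0.SelfAttention.q",),
    "enc.blk.{bid}.attn_q": ("encoder.block.{bid}.layer.0.SelfAttention.q",),
}

def resolve(hf_name: str, n_blocks: int = 64) -> str | None:
    for gguf_tmpl, hf_tmpls in T5_BLOCK_MAPPINGS.items():
        for hf_tmpl in hf_tmpls:
            for bid in range(n_blocks):
                if hf_name == hf_tmpl.format(bid=bid):
                    return gguf_tmpl.format(bid=bid)
    return None

# resolve("decoder.block.3.layer.0.SelfAttention.q") -> "dec.blk.3.attn_q"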
# architecture-specific block mappings # architecture-specific block mappings
View file
@ -208,7 +208,9 @@ def translate_tensor_name(name):
'ssm_d': 'State space model skip connection', 'ssm_d': 'State space model skip connection',
'ssm_dt': 'State space model time step', 'ssm_dt': 'State space model time step',
'ssm_out': 'State space model output projection', 'ssm_out': 'State space model output projection',
'blk': 'Block' 'blk': 'Block',
'enc': 'Encoder',
'dec': 'Decoder',
} }
expanded_words = [] expanded_words = []
@ -291,6 +293,10 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
tensor_group_name = "base" tensor_group_name = "base"
if tensor_components[0] == 'blk': if tensor_components[0] == 'blk':
tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}" tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}"
elif tensor_components[0] in ['enc', 'dec'] and tensor_components[1] == 'blk':
tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}.{tensor_components[2]}"
elif tensor_components[0] in ['enc', 'dec']:
tensor_group_name = f"{tensor_components[0]}"
# Check if new Tensor Group # Check if new Tensor Group
if tensor_group_name not in tensor_groups: if tensor_group_name not in tensor_groups:
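With the two new branches, encoder/decoder tensors get their own groups in the markdown dump. A standalone sketch of the grouping rule and the names it produces:

def tensor_group(name: str) -> str:
    parts = name.split(".")
    if parts[0] == "blk":
        return f"{parts[0]}.{parts[1]}"
    if parts[0] in ("enc", "dec") and len(parts) > 1 and parts[1] == "blk":
        return f"{parts[0]}.{parts[1]}.{parts[2]}"
    if parts[0] in ("enc", "dec"):
        return parts[0]
    return "base"

# tensor_group("dec.blk.7.attn_q.weight")  -> "dec.blk.7"
# tensor_group("enc.output_norm.weight")   -> "enc"
# tensor_group("token_embd.weight")        -> "base"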
llama.cpp
View file
@ -225,6 +225,7 @@ enum llm_arch {
LLM_ARCH_OLMO, LLM_ARCH_OLMO,
LLM_ARCH_ARCTIC, LLM_ARCH_ARCTIC,
LLM_ARCH_DEEPSEEK2, LLM_ARCH_DEEPSEEK2,
LLM_ARCH_BITNET,
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
}; };
@ -263,6 +264,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_OLMO, "olmo" }, { LLM_ARCH_OLMO, "olmo" },
{ LLM_ARCH_ARCTIC, "arctic" }, { LLM_ARCH_ARCTIC, "arctic" },
{ LLM_ARCH_DEEPSEEK2, "deepseek2" }, { LLM_ARCH_DEEPSEEK2, "deepseek2" },
{ LLM_ARCH_BITNET, "bitnet" },
{ LLM_ARCH_UNKNOWN, "(unknown)" }, { LLM_ARCH_UNKNOWN, "(unknown)" },
}; };
@ -500,6 +502,8 @@ enum llm_tensor {
LLM_TENSOR_ATTN_KV_B, LLM_TENSOR_ATTN_KV_B,
LLM_TENSOR_ATTN_Q_A_NORM, LLM_TENSOR_ATTN_Q_A_NORM,
LLM_TENSOR_ATTN_KV_A_NORM, LLM_TENSOR_ATTN_KV_A_NORM,
LLM_TENSOR_ATTN_SUB_NORM,
LLM_TENSOR_FFN_SUB_NORM,
}; };
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = { static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@ -1113,6 +1117,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
}, },
}, },
{
LLM_ARCH_BITNET,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_SUB_NORM, "blk.%d.attn_sub_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_SUB_NORM, "blk.%d.ffn_sub_norm" },
},
},
{ {
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
{ {
@ -2118,6 +2140,8 @@ struct llama_layer {
struct ggml_tensor * attn_out_norm_b; struct ggml_tensor * attn_out_norm_b;
struct ggml_tensor * attn_q_a_norm; struct ggml_tensor * attn_q_a_norm;
struct ggml_tensor * attn_kv_a_norm; struct ggml_tensor * attn_kv_a_norm;
struct ggml_tensor * attn_sub_norm;
struct ggml_tensor * ffn_sub_norm;
// attention // attention
struct ggml_tensor * wq; struct ggml_tensor * wq;
@ -2185,6 +2209,15 @@ struct llama_layer {
// long rope factors // long rope factors
struct ggml_tensor * rope_long = nullptr; struct ggml_tensor * rope_long = nullptr;
struct ggml_tensor * rope_short = nullptr; struct ggml_tensor * rope_short = nullptr;
// bitnet scale
struct ggml_tensor * wq_scale;
struct ggml_tensor * wk_scale;
struct ggml_tensor * wv_scale;
struct ggml_tensor * wo_scale;
struct ggml_tensor * ffn_gate_scale;
struct ggml_tensor * ffn_up_scale;
struct ggml_tensor * ffn_down_scale;
}; };
struct llama_kv_cell { struct llama_kv_cell {
@ -4710,6 +4743,15 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_BITNET:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 26: model.type = e_model::MODEL_3B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
default: (void)0; default: (void)0;
} }
@ -6655,6 +6697,44 @@ static bool llm_load_tensors(
} }
} }
} break; } break;
case LLM_ARCH_BITNET:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
// output
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
}
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.attn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd});
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1});
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff});
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1});
}
} break;
default: default:
throw std::runtime_error("unknown architecture"); throw std::runtime_error("unknown architecture");
} }
@ -7295,7 +7375,10 @@ static struct ggml_tensor * llm_build_kqv(
ggml_build_forward_expand(graph, cur); ggml_build_forward_expand(graph, cur);
cur = ggml_mul_mat(ctx, wo, cur); if (wo) {
cur = ggml_mul_mat(ctx, wo, cur);
}
if (wo_b) { if (wo_b) {
cb(cur, "kqv_wo", il); cb(cur, "kqv_wo", il);
} }
@ -11709,6 +11792,153 @@ struct llm_build_context {
return gf; return gf;
} }
struct ggml_cgraph * build_bitnet() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
// B1.K
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
// B1.V
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
nullptr, nullptr,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cur = llm_build_norm(ctx0, cur, hparams,
model.layers[il].attn_sub_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_sub_norm", il);
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
if (model.layers[il].bo) {
cur = ggml_add(ctx0, cur, model.layers[il].bo);
}
cb(cur, "attn_o_out", il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
if (model.layers[il].ffn_gate_inp == nullptr) {
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
struct ggml_tensor *tmp = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_up_scale);
cb(tmp, "ffn_up", il);
cur = ggml_mul_mat(ctx0, model.layers[il].ffn_gate, cur);
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_gate_scale);
cb(cur, "ffn_gate", il);
cur = ggml_silu(ctx0, cur);
cb(cur, "ffn_silu", il);
cur = ggml_mul(ctx0, cur, tmp);
cb(cur, "ffn_gate_par", il);
cur = llm_build_norm(ctx0, cur, hparams,
model.layers[il].ffn_sub_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_sub_norm", il);
cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
cb(cur, "ffn_down", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
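The graph above follows the BitNet pattern throughout: every linear layer keeps its quantized weight and a separate per-tensor scale, so each matmul is immediately followed by a ggml_mul with that scale, and an extra RMS sub-norm sits before the attention output projection and before ffn_down. A rough numpy sketch of the per-layer FFN pattern (shapes, scales and data are made up for illustration):

import numpy as np

def rms_norm(x: np.ndarray, weight: np.ndarray, eps: float = 1e-5) -> np.ndarray:
    return x / np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps) * weight

def scaled_linear(x: np.ndarray, w_quant: np.ndarray, w_scale: float) -> np.ndarray:
    # the stored weight carries no scale, so the scale is applied after the matmul
    return (x @ w_quant.T) * w_scale

n_embd, n_ff, n_tokens = 8, 32, 4
x = np.random.randn(n_tokens, n_embd).astype(np.float32)
w_up   = np.sign(np.random.randn(n_ff, n_embd)).astype(np.float32)   # stand-in for a ternary weight
w_gate = np.sign(np.random.randn(n_ff, n_embd)).astype(np.float32)
w_down = np.sign(np.random.randn(n_embd, n_ff)).astype(np.float32)
sub_norm = np.ones(n_ff, dtype=np.float32)

up   = scaled_linear(x, w_up, 0.02)
gate = scaled_linear(x, w_gate, 0.03)
silu = gate * (1.0 / (1.0 + np.exp(-gate)))
h    = rms_norm(silu * up, sub_norm)          # the ffn_sub_norm before the down projection
out  = scaled_linear(h, w_down, 0.01)
print(out.shape)  # (4, 8)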
}; };
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) { static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@ -11932,6 +12162,10 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_deepseek2(); result = llm.build_deepseek2();
} break; } break;
case LLM_ARCH_BITNET:
{
result = llm.build_bitnet();
} break;
default: default:
GGML_ASSERT(false); GGML_ASSERT(false);
} }
@ -12551,12 +12785,6 @@ static int llama_decode_internal(
} }
} }
#ifdef GGML_PERF
// print timing information per ggml operation (for debugging purposes)
// requires GGML_PERF to be defined
ggml_graph_print(gf);
#endif
// plot the computation graph in dot format (for debugging purposes) // plot the computation graph in dot format (for debugging purposes)
//if (n_past%100 == 0) { //if (n_past%100 == 0) {
// ggml_graph_dump_dot(gf, NULL, "llama.dot"); // ggml_graph_dump_dot(gf, NULL, "llama.dot");
@ -16760,6 +16988,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_BERT: case LLM_ARCH_BERT:
case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_STABLELM: case LLM_ARCH_STABLELM:
case LLM_ARCH_BITNET:
case LLM_ARCH_QWEN: case LLM_ARCH_QWEN:
case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2:
case LLM_ARCH_QWEN2MOE: case LLM_ARCH_QWEN2MOE:
View file
@ -786,7 +786,7 @@ extern "C" {
// Get the number of threads used for prompt and batch processing (multiple token). // Get the number of threads used for prompt and batch processing (multiple token).
LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx); LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
// Set whether the model is in embeddings model or not // Set whether the model is in embeddings mode or not
// If true, embeddings will be returned but logits will not // If true, embeddings will be returned but logits will not
LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings); LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
View file
@ -249,9 +249,8 @@ class tinyBLAS {
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
} }
void matmul(int64_t m, int64_t n, int task) { void matmul(int64_t m, int64_t n) {
if (task == GGML_TASK_TYPE_COMPUTE) mnpack(0, m, 0, n);
mnpack(0, m, 0, n);
} }
private: private:
@ -458,9 +457,8 @@ class tinyBLAS_Q0_ARM {
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
} }
void matmul(int64_t m, int64_t n, int task) { void matmul(int64_t m, int64_t n) {
if (task == GGML_TASK_TYPE_COMPUTE) mnpack(0, m, 0, n);
mnpack(0, m, 0, n);
} }
private: private:
@ -596,9 +594,8 @@ class tinyBLAS_Q0_AVX {
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
} }
void matmul(int64_t m, int64_t n, int task) { void matmul(int64_t m, int64_t n) {
if (task == GGML_TASK_TYPE_COMPUTE) mnpack(0, m, 0, n);
mnpack(0, m, 0, n);
} }
private: private:
@ -829,7 +826,7 @@ class tinyBLAS_Q0_AVX {
* For example, for single-threaded single-precision GEMM you can say * For example, for single-threaded single-precision GEMM you can say
* *
* llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, * llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
* 0, 1, GGML_TASK_TYPE_COMPUTE, * 0, 1,
* GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32); * GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
* *
* @param m is rows in `A` and `C` * @param m is rows in `A` and `C`
@ -843,14 +840,13 @@ class tinyBLAS_Q0_AVX {
* @param ldc is row stride of `C` * @param ldc is row stride of `C`
* @param ith is thread id (must be less than `nth`) * @param ith is thread id (must be less than `nth`)
* @param nth is number of threads (must be greater than zero) * @param nth is number of threads (must be greater than zero)
* @param task is GGML task type
* @param Atype is GGML data type of `A` * @param Atype is GGML data type of `A`
* @param Btype is GGML data type of `B` * @param Btype is GGML data type of `B`
* @param Ctype is GGML data type of `C` * @param Ctype is GGML data type of `C`
* @return true if this function was able to service the matmul request * @return true if this function was able to service the matmul request
*/ */
bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C, bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) { int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {
assert(m >= 0); assert(m >= 0);
assert(n >= 0); assert(n >= 0);
@ -877,7 +873,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
tb.matmul(m, n, task); tb.matmul(m, n);
return true; return true;
#elif defined(__AVX__) || defined(__AVX2__) #elif defined(__AVX__) || defined(__AVX2__)
if (k % 8) if (k % 8)
@ -887,7 +883,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
tb.matmul(m, n, task); tb.matmul(m, n);
return true; return true;
#elif defined(__ARM_NEON) #elif defined(__ARM_NEON)
if (n < 4) if (n < 4)
@ -899,7 +895,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
tb.matmul(m, n, task); tb.matmul(m, n);
return true; return true;
#else #else
return false; return false;
@ -917,7 +913,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
tb.matmul(m, n, task); tb.matmul(m, n);
return true; return true;
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
if (k % 8) if (k % 8)
@ -929,7 +925,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
tb.matmul(m, n, task); tb.matmul(m, n);
return true; return true;
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
if (n < 8) if (n < 8)
@ -943,7 +939,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const ggml_fp16_t *)B, ldb, (const ggml_fp16_t *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
tb.matmul(m, n, task); tb.matmul(m, n);
return true; return true;
#elif defined(__ARM_NEON) && !defined(_MSC_VER) #elif defined(__ARM_NEON) && !defined(_MSC_VER)
if (k % 4) if (k % 4)
@ -955,7 +951,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
tb.matmul(m, n, task); tb.matmul(m, n);
return true; return true;
#else #else
return false; return false;
@ -971,7 +967,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
tb.matmul(m, n, task); tb.matmul(m, n);
return true; return true;
#elif defined(__ARM_FEATURE_DOTPROD) #elif defined(__ARM_FEATURE_DOTPROD)
tinyBLAS_Q0_ARM<block_q8_0> tb{ tinyBLAS_Q0_ARM<block_q8_0> tb{
@ -979,7 +975,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
tb.matmul(m, n, task); tb.matmul(m, n);
return true; return true;
#else #else
return false; return false;
@ -995,7 +991,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
tb.matmul(m, n, task); tb.matmul(m, n);
return true; return true;
#elif defined(__ARM_FEATURE_DOTPROD) #elif defined(__ARM_FEATURE_DOTPROD)
tinyBLAS_Q0_ARM<block_q4_0> tb{ tinyBLAS_Q0_ARM<block_q4_0> tb{
@ -1003,7 +999,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; ith, nth};
tb.matmul(m, n, task); tb.matmul(m, n);
return true; return true;
#else #else
return false; return false;
@ -1025,7 +1021,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(void)ldc; (void)ldc;
(void)ith; (void)ith;
(void)nth; (void)nth;
(void)task;
(void)Atype; (void)Atype;
(void)Btype; (void)Btype;
(void)Ctype; (void)Ctype;
View file
@ -7,7 +7,7 @@ extern "C" {
bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t, bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
const void *, int64_t, void *, int64_t, int, int, const void *, int64_t, void *, int64_t, int, int,
int, int, int, int); int, int, int);
#ifdef __cplusplus #ifdef __cplusplus
} }
View file
@ -785,6 +785,10 @@ struct test_cpy : public test_case {
return VARS_TO_STR3(type_src, type_dst, ne); return VARS_TO_STR3(type_src, type_dst, ne);
} }
double max_nmse_err() override {
return 1e-6;
}
size_t op_size(ggml_tensor * t) override { size_t op_size(ggml_tensor * t) override {
return ggml_nbytes(t) + ggml_nbytes(t->src[0]); return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
} }
View file
@ -13,7 +13,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32;
shared FLOAT_TYPE tmp[BLOCK_SIZE]; shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() { void main() {
const uint row = gl_WorkGroupID.x; const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
const uint tid = gl_LocalInvocationID.x; const uint tid = gl_LocalInvocationID.x;
uint a_offset, b_offset, d_offset; uint a_offset, b_offset, d_offset;
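The row index now also incorporates the z dimension of the dispatch, presumably so that row counts above the per-dimension workgroup limit (65535 on most implementations) can spill into gl_WorkGroupID.z. The same arithmetic as a quick sketch (the workgroup counts are made up):

def flat_row(wg_x: int, num_wg_x: int, wg_z: int) -> int:
    # mirrors: row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z
    return wg_x + num_wg_x * wg_z

# with num_wg_x = 65535, workgroup (x=10, z=1) covers row 65545,
# which a purely one-dimensional dispatch could not reach
assert flat_row(10, 65535, 1) == 65545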
View file
@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE tmp[32]; shared FLOAT_TYPE tmp[32];
void main() { void main() {
const uint row = gl_WorkGroupID.x; const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
uint a_offset, b_offset, d_offset; uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset); get_offsets(a_offset, b_offset, d_offset);
View file
@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE tmp[32]; shared FLOAT_TYPE tmp[32];
void main() { void main() {
const uint row = gl_WorkGroupID.x; const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
uint a_offset, b_offset, d_offset; uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset); get_offsets(a_offset, b_offset, d_offset);
View file
@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE tmp[32]; shared FLOAT_TYPE tmp[32];
void main() { void main() {
const uint row = gl_WorkGroupID.x; const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
uint a_offset, b_offset, d_offset; uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset); get_offsets(a_offset, b_offset, d_offset);
View file
@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE tmp[32]; shared FLOAT_TYPE tmp[32];
void main() { void main() {
const uint row = gl_WorkGroupID.x; const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
uint a_offset, b_offset, d_offset; uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset); get_offsets(a_offset, b_offset, d_offset);
View file
@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE tmp[32]; shared FLOAT_TYPE tmp[32];
void main() { void main() {
const uint row = gl_WorkGroupID.x; const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
uint a_offset, b_offset, d_offset; uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset); get_offsets(a_offset, b_offset, d_offset);