Merge branch 'master' into update_flake_lock_action

commit c7e3cd08ce

32 changed files with 23685 additions and 23125 deletions
.github/workflows/docker.yml (vendored, 6 changes)

@@ -33,15 +33,13 @@ jobs:
           - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
-          # have disabled them for now until the reason why
-          # is understood.
           - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
+          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
           - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
     steps:
.github/workflows/server.yml (vendored, 2 changes)

@@ -30,7 +30,7 @@ jobs:

    strategy:
      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
        build_type: [RelWithDebInfo]
        include:
          - build_type: Release
@@ -144,9 +144,6 @@ option(LLAMA_BUILD_SERVER "llama: build server example"
 option(LLAMA_LASX "llama: enable lasx" ON)
 option(LLAMA_LSX  "llama: enable lsx"  ON)

-# add perf arguments
-option(LLAMA_PERF "llama: enable perf" OFF)
-
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)

@@ -870,10 +867,6 @@ if (LLAMA_CPU_HBM)
     target_link_libraries(ggml PUBLIC memkind)
 endif()

-if (LLAMA_PERF)
-    add_compile_definitions(GGML_PERF)
-endif()
-
 function(get_flags CCID CCVER)
     set(C_FLAGS "")
     set(CXX_FLAGS "")
Makefile (3 changes)

@@ -344,9 +344,6 @@ ifdef LLAMA_GPROF
 	MK_CFLAGS   += -pg
 	MK_CXXFLAGS += -pg
 endif
-ifdef LLAMA_PERF
-	MK_CPPFLAGS += -DGGML_PERF
-endif

 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
File diff suppressed because it is too large.
@@ -152,7 +152,6 @@ struct gpt_params {
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro  = false; // open the prompt cache read-only and do not update it

-    bool embedding        = false; // get only sentence embedding
     bool escape           = true;  // escape "\n", "\r", "\t", "\'", "\"", and "\\"
     bool multiline_input  = false; // reverse the usage of `\`
     bool simple_io        = false; // improves compatibility with subprocesses and limited consoles

@@ -179,6 +178,12 @@ struct gpt_params {
     std::string mmproj = "";        // path to multimodal projector
     std::vector<std::string> image; // path to image file(s)

+    // embedding
+    bool embedding = false;         // get only sentence embedding
+    int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    std::string embd_out = "";      // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
+    std::string embd_sep = "\n";    // separator of embeddings
+
     // server params
     int32_t port         = 8080; // server listens on this network port
     int32_t timeout_read = 600;  // http read timeout in seconds

@@ -377,7 +382,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size
 // Embedding utils
 //

-void llama_embd_normalize(const float * inp, float * out, int n);
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

 float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
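The new `embd_norm` argument (default 2) selects how `llama_embd_normalize` scales the raw embedding. The implementation itself is not reproduced in this view, so the following is only a hedged Python sketch of the variants documented in examples/embedding/README.md further down in this diff; the function name is made up for illustration.

```python
import math

def normalize_embedding(x: list[float], embd_norm: int = 2) -> list[float]:
    # -1: none, 0: scale max |x_i| to 32760 (int16 range), 1: taxicab (L1),
    # 2: euclidean (L2), >2: p-norm -- mirrors the table in examples/embedding/README.md
    if embd_norm == -1:
        return list(x)
    if embd_norm == 0:
        norm = max(abs(v) for v in x) / 32760.0
    elif embd_norm == 1:
        norm = sum(abs(v) for v in x)
    elif embd_norm == 2:
        norm = math.sqrt(sum(v * v for v in x))
    else:
        norm = sum(abs(v) ** embd_norm for v in x) ** (1.0 / embd_norm)
    return [v / norm if norm > 0 else 0.0 for v in x]
```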
@@ -65,7 +65,8 @@ class Model:
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH

-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool,
+                 model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
         self.dir_model = dir_model

@@ -80,7 +81,7 @@ class Model:
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
         if self.ftype == gguf.LlamaFileType.GUESSED:

@@ -96,7 +97,8 @@ class Model:
             ftype_lw: str = ftype_up.lower()
             # allow templating the file name with the output ftype, useful with the "auto" ftype
             self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
-        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
+                                           split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)

     @classmethod
     def __init_subclass__(cls):

@@ -332,6 +334,8 @@ class Model:
         self.gguf_writer.close()

     def write_vocab(self):
+        if len(self.gguf_writer.tensors) != 1:
+            raise ValueError('Splitting the vocabulary is not supported')
         self.gguf_writer.write_header_to_file(self.fname_out)
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.close()
@@ -1404,6 +1408,48 @@ class LlamaModel(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("BitnetForCausalLM")
+class BitnetModel(Model):
+    model_arch = gguf.MODEL_ARCH.BITNET
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+        self.gguf_writer.add_rope_scaling_factor(1.0)
+
+    def weight_quant(self, weight):
+        dtype = weight.dtype
+        weight = weight.float()
+        s = 1 / weight.abs().mean().clamp(min=1e-5)
+        weight = (weight * s).round().clamp(-1, 1) / s
+        scale = weight.abs().max().unsqueeze(0)
+        weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
+        weight = torch.sign(weight).type(dtype)
+        return weight.type(dtype), scale.type(torch.float32)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        new_name = self.map_tensor_name(name)
+
+        if any(self.match_model_tensor_name(new_name, key, bid) for key in [
+            gguf.MODEL_TENSOR.ATTN_Q,
+            gguf.MODEL_TENSOR.ATTN_K,
+            gguf.MODEL_TENSOR.ATTN_V,
+            gguf.MODEL_TENSOR.ATTN_OUT,
+            gguf.MODEL_TENSOR.FFN_UP,
+            gguf.MODEL_TENSOR.FFN_DOWN,
+            gguf.MODEL_TENSOR.FFN_GATE,
+        ]):
+            # transform weight into 1/0/-1 (in fp32)
+            weight_torch, scale_torch = self.weight_quant(data_torch)
+            yield (new_name, weight_torch)
+            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
+        else:
+            yield (new_name, data_torch)
+
+
 @Model.register("GrokForCausalLM")
 class GrokModel(Model):
     model_arch = gguf.MODEL_ARCH.GROK
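As a quick illustration of what `weight_quant` produces (this snippet is not part of the commit), the same arithmetic applied to a toy tensor collapses the weights to {-1, 0, +1} plus a single fp32 scale, i.e. the ternary BitNet representation:

```python
import torch

# Toy reproduction of BitnetModel.weight_quant on a 2x3 matrix.
w = torch.tensor([[0.40, -0.02, -0.55], [0.10, 0.00, 0.90]])

s = 1 / w.abs().mean().clamp(min=1e-5)          # scale so the mean |w| maps to 1
q = (w * s).round().clamp(-1, 1) / s            # round-to-nearest, clipped to one step
scale = q.abs().max().unsqueeze(0)              # single scale kept alongside the ternary weights
q = torch.where(q.abs() < 1e-6, torch.zeros_like(q), q)  # zero out near-zero entries
q = torch.sign(q)                               # final ternary matrix of -1/0/+1

print(q)      # tensor([[ 1.,  0., -1.], [ 0.,  0.,  1.]])
print(scale)  # one-element tensor with the per-tensor scale
```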
@@ -2729,6 +2775,124 @@ class DeepseekV2Model(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("T5ForConditionalGeneration")
+@Model.register("T5WithLMHeadModel")
+class T5Model(Model):
+    model_arch = gguf.MODEL_ARCH.T5
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+        assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if (token_id >= vocab_size):
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("T5")
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        # Sometimes T5 and Flan-T5 based models contain "encoder.embed_tokens.weight" tensor or
+        # "decoder.embed_tokens.weight" tensors that are duplicates of "shared.weight" tensor
+        # To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight".
+        if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight":
+            logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 ###### CONVERSION LOGIC ######
@@ -2814,10 +2978,44 @@ def parse_args() -> argparse.Namespace:
         "--verbose", action="store_true",
         help="increase output verbosity",
     )
+    parser.add_argument(
+        "--split-max-tensors", type=int, default=0,
+        help="max tensors in each split",
+    )
+    parser.add_argument(
+        "--split-max-size", type=str, default="0",
+        help="max size per split N(M|G)",
+    )
+    parser.add_argument(
+        "--dry-run", action="store_true",
+        help="only print out a split plan and exit, without writing any new files",
+    )
+    parser.add_argument(
+        "--no-tensor-first-split", action="store_true",
+        help="do not add tensors to the first split (disabled by default)"
+    )

     return parser.parse_args()


+def split_str_to_n_bytes(split_str: str) -> int:
+    if split_str.endswith("K"):
+        n = int(split_str[:-1]) * 1000
+    elif split_str.endswith("M"):
+        n = int(split_str[:-1]) * 1000 * 1000
+    elif split_str.endswith("G"):
+        n = int(split_str[:-1]) * 1000 * 1000 * 1000
+    elif split_str.isnumeric():
+        n = int(split_str)
+    else:
+        raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
+
+    if n < 0:
+        raise ValueError(f"Invalid split size: {split_str}, must be positive")
+
+    return n
+
+
 def main() -> None:
     args = parse_args()
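A small usage sketch (not from the diff) of how the new helper interprets size strings; note it uses decimal multiples, and it also accepts a "K" suffix that the `--split-max-size` help text does not mention:

```python
# Assuming split_str_to_n_bytes from above is importable or pasted locally:
for spec in ("0", "250000", "300M", "4G"):
    print(spec, "->", split_str_to_n_bytes(spec), "bytes")
# 0 -> 0 bytes
# 250000 -> 250000 bytes
# 300M -> 300000000 bytes
# 4G -> 4000000000 bytes
```

On the command line these values feed `--split-max-size`, typically together with `--dry-run` to preview the resulting split plan before any files are written.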
@@ -2850,6 +3048,10 @@ def main() -> None:
         "auto": gguf.LlamaFileType.GUESSED,
     }

+    if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"):
+        logger.error("Error: Cannot use temp file when splitting")
+        sys.exit(1)
+
     if args.outfile is not None:
         fname_out = args.outfile
     else:
@@ -2867,7 +3069,10 @@ def main() -> None:
         logger.error(f"Model {hparams['architectures'][0]} is not supported")
         sys.exit(1)

-    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
+    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file,
+                                 args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors,
+                                 split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
+                                 small_first_shard=args.no_tensor_first_split)

     logger.info("Set model parameters")
     model_instance.set_gguf_parameters()
@@ -2878,13 +3083,13 @@ def main() -> None:
     model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)

     if args.vocab_only:
-        logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
+        logger.info("Exporting model vocab...")
         model_instance.write_vocab()
+        logger.info("Model vocab successfully exported.")
     else:
-        logger.info(f"Exporting model to '{model_instance.fname_out}'")
+        logger.info("Exporting model...")
         model_instance.write()
-        logger.info(f"Model successfully exported to '{model_instance.fname_out}'")
+        logger.info("Model successfully exported.")


 if __name__ == '__main__':
@@ -19,3 +19,43 @@ llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
 ```

 The above command will output space-separated float values.
+
+## extra parameters
+### --embd-normalize $integer$
+
+| $integer$ | description         | formula |
+|-----------|---------------------|---------|
+| $-1$      | none                |
+| $0$       | max absolute int16  | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$
+| $1$       | taxicab             | $\Large{x_i \over\sum \lvert x_i\rvert}$
+| $2$       | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$
+| $>2$      | p-norm              | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$
+
+### --embd-output-format $'string'$
+
+| $'string'$ | description                  |           |
+|------------|------------------------------|-----------|
+| ''         | same as before               | (default)
+| 'array'    | single embeddings            | $[[x_1,...,x_n]]$
+|            | multiple embeddings          | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
+| 'json'     | openai style                 |
+| 'json+'    | add cosine similarity matrix |
+
+### --embd-separator $"string"$
+
+| $"string"$   |                 |
+|--------------|-----------------|
+| "\n"         | (default)
+| "<#embSep#>" | for example
+| "<#sep#>"    | another example
+
+## examples
+### Unix-based systems (Linux, macOS, etc.):
+
+```bash
+./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
+```
+
+### Windows:
+
+```powershell
+embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>$null
+```
@@ -7,13 +7,19 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static std::vector<std::string> split_lines(const std::string & s) {
-    std::string line;
+static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
     std::vector<std::string> lines;
-    std::stringstream ss(s);
-    while (std::getline(ss, line)) {
-        lines.push_back(line);
+    size_t start = 0;
+    size_t end = s.find(separator);
+
+    while (end != std::string::npos) {
+        lines.push_back(s.substr(start, end - start));
+        start = end + separator.length();
+        end = s.find(separator, start);
     }
+
+    lines.push_back(s.substr(start)); // Add the last part
+
     return lines;
 }

@@ -24,7 +30,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
     }
 }

-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);

@@ -44,13 +50,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
         GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");

         float * out = output + batch.seq_id[i][0] * n_embd;
-        //TODO: I would also add a parameter here to enable normalization or not.
-        /*fprintf(stdout, "unnormalized_embedding:");
-        for (int hh = 0; hh < n_embd; hh++) {
-            fprintf(stdout, "%9.6f ", embd[hh]);
-        }
-        fprintf(stdout, "\n");*/
-        llama_embd_normalize(embd, out, n_embd);
+        llama_embd_normalize(embd, out, n_embd, embd_norm);
     }
 }

@@ -110,7 +110,7 @@ int main(int argc, char ** argv) {
     }

     // split the prompt into lines
-    std::vector<std::string> prompts = split_lines(params.prompt);
+    std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);

     // max batch size
     const uint64_t n_batch = params.n_batch;

@@ -170,7 +170,7 @@ int main(int argc, char ** argv) {
         // encode if at capacity
         if (batch.n_tokens + n_toks > n_batch) {
             float * out = emb + p * n_embd;
-            batch_decode(ctx, batch, out, s, n_embd);
+            batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
             llama_batch_clear(batch);
             p += s;
             s = 0;

@@ -183,15 +183,20 @@ int main(int argc, char ** argv) {

     // final batch
     float * out = emb + p * n_embd;
-    batch_decode(ctx, batch, out, s, n_embd);
+    batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);

+    if (params.embd_out.empty()) {
     // print the first part of the embeddings or for a single prompt, the full embedding
     fprintf(stdout, "\n");
     for (int j = 0; j < n_prompts; j++) {
         fprintf(stdout, "embedding %d: ", j);
         for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+            if (params.embd_normalize == 0) {
+                fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+            } else {
             fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+            }
         }
         fprintf(stdout, "\n");
     }

@@ -199,14 +204,58 @@ int main(int argc, char ** argv) {
     if (n_prompts > 1) {
         fprintf(stdout, "\n");
         printf("cosine similarity matrix:\n\n");
+        for (int i = 0; i < n_prompts; i++) {
+            fprintf(stdout, "%6.6s ", prompts[i].c_str());
+        }
+        fprintf(stdout, "\n");
         for (int i = 0; i < n_prompts; i++) {
             for (int j = 0; j < n_prompts; j++) {
                 float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
                 fprintf(stdout, "%6.2f ", sim);
             }
+            fprintf(stdout, "%1.10s", prompts[i].c_str());
             fprintf(stdout, "\n");
         }
     }
+    }
+
+    if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
+        const bool notArray = params.embd_out != "array";
+
+        fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
+        for (int j = 0;;) { // at least one iteration (one prompt)
+            if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
+            fprintf(stdout, "[");
+            for (int i = 0;;) { // at least one iteration (n_embd > 0)
+                fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                i++;
+                if (i < n_embd) fprintf(stdout, ","); else break;
+            }
+            fprintf(stdout, notArray ? "]\n }" : "]");
+            j++;
+            if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
+        }
+        fprintf(stdout, notArray ? "\n ]" : "]\n");
+
+        if (params.embd_out == "json+" && n_prompts > 1) {
+            fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
+            for (int i = 0;;) { // at least two iteration (n_prompts > 1)
+                fprintf(stdout, " [");
+                for (int j = 0;;) { // at least two iteration (n_prompts > 1)
+                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    fprintf(stdout, "%6.2f", sim);
+                    j++;
+                    if (j < n_prompts) fprintf(stdout, ", "); else break;
+                }
+                fprintf(stdout, " ]");
+                i++;
+                if (i < n_prompts) fprintf(stdout, ",\n"); else break;
+            }
+            fprintf(stdout, "\n ]");
+        }
+
+        if (notArray) fprintf(stdout, "\n}\n");
+    }
+
     // clean up
     llama_print_timings(ctx);
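For orientation (this snippet is not part of the diff), the OpenAI-style output produced by the new `--embd-output-format json+` path above could be consumed like this; the binary path and prompts are placeholders:

```python
import json
import subprocess

# Run the embedding example and parse its JSON output; llama.cpp's own log output
# goes to stderr, so stdout should contain only the JSON document.
result = subprocess.run(
    ["./embedding", "-m", "./path/to/model.gguf",
     "-p", "Castle<#sep#>Dog", "--embd-separator", "<#sep#>",
     "--embd-output-format", "json+", "--log-disable"],
    capture_output=True, text=True, check=True)

doc = json.loads(result.stdout)
embeddings = [item["embedding"] for item in doc["data"]]
similarity = doc.get("cosineSimilarity")  # only present for "json+" with more than one prompt
print(len(embeddings), similarity)
```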
@@ -634,12 +634,12 @@ return html`
             <div>
               <div class="grammar">
                 <label for="template"></label>
-                <textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON-Scheme + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
+                <textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON Schema + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
               </div>
               <div class="grammar-columns">
                 <div class="json-schema-controls">
                   <input type="text" name="prop-order" placeholder="Order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
-                  <button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON-Scheme</button>
+                  <button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
                 </div>
               </div>
             </div>
@@ -643,7 +643,7 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
     static constexpr int qi = QI3_S;
 };

-static int get_mmq_x_max_host(const int cc) {
+static constexpr int get_mmq_x_max_host(int cc) {
 #ifdef CUDA_USE_TENSOR_CORES
     return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
 #else

@@ -652,7 +652,7 @@ static int get_mmq_x_max_host(const int cc) {
 }

 // Round rows to this value for --split-mode row:
-static int get_mmq_y_host(const int cc) {
+static constexpr int get_mmq_y_host(int cc) {
     return cc >= CC_VOLTA ? 128 : 64;
 }
@@ -20,6 +20,20 @@ struct mma_int_A_I16K4 {
         GGML_CUDA_ASSUME(ret < K);
         return ret;
     }
+
+    __device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
+#if defined(INT8_MMA_AVAILABLE)
+        const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
+        asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
+            : "+r"(x[0]), "+r"(x[1])
+            : "l"(xs));
+#else
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            x[l] = xs0[get_i(l)*stride + get_k(l)];
+        }
+#endif // defined(INT8_MMA_AVAILABLE)
+    }
 };

 struct mma_int_A_I16K8 {

@@ -42,6 +56,20 @@ struct mma_int_A_I16K8 {
         GGML_CUDA_ASSUME(ret < K);
         return ret;
     }
+
+    __device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
+#if defined(INT8_MMA_AVAILABLE)
+        const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
+        asm("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
+            : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
+            : "l"(xs));
+#else
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            x[l] = xs0[get_i(l)*stride + get_k(l)];
+        }
+#endif // defined(INT8_MMA_AVAILABLE)
+    }
 };

 struct mma_int_B_J8K4 {

@@ -64,6 +92,20 @@ struct mma_int_B_J8K4 {
         GGML_CUDA_ASSUME(ret < K);
         return ret;
     }
+
+    __device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
+#if defined(INT8_MMA_AVAILABLE) && false // Loading as 4 byte values is faster
+        const int * xs = xs0 + (threadIdx.x%J)*stride;
+        asm("ldmatrix.sync.aligned.m8n8.x1.b16 {%0}, [%1];"
+            : "+r"(x[0])
+            : "l"(xs));
+#else
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            x[l] = xs0[get_j(l)*stride + get_k(l)];
+        }
+#endif // defined(INT8_MMA_AVAILABLE)
+    }
 };

 struct mma_int_B_J8K8 {

@@ -86,6 +128,20 @@ struct mma_int_B_J8K8 {
         GGML_CUDA_ASSUME(ret < K);
         return ret;
     }
+
+    __device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
+#if defined(INT8_MMA_AVAILABLE) && false // Loading as 4 byte values is faster
+        const int * xs = xs0 + (threadIdx.x%J)*stride + ((threadIdx.x/J)*(K/2)) % K;
+        asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
+            : "+r"(x[0]), "+r"(x[1])
+            : "l"(xs));
+#else
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            x[l] = xs0[get_j(l)*stride + get_k(l)];
+        }
+#endif // defined(INT8_MMA_AVAILABLE)
+    }
 };

 struct mma_int_C_I16J8 {
ggml-cuda/mmq.cuh (1204 changes): file diff suppressed because it is too large.

ggml-vulkan-shaders.hpp (39661 changes): file diff suppressed because it is too large.

ggml-vulkan.cpp (2053 changes): file diff suppressed because it is too large.
ggml.h (35 changes)

@@ -591,11 +591,7 @@ extern "C" {
         struct ggml_tensor * grad;
         struct ggml_tensor * src[GGML_MAX_SRC];

-        // performance
-        int     perf_runs;
-        int64_t perf_cycles;
-        int64_t perf_time_us;
-
+        // source tensor and offset for views
         struct ggml_tensor * view_src;
         size_t               view_offs;

@@ -605,7 +601,7 @@ extern "C" {

         void * extra; // extra things e.g. for ggml-cuda.cu

-        char padding[8];
+        // char padding[4];
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

@@ -652,11 +648,6 @@ extern "C" {
         struct ggml_hash_set visited_hash_table;

         enum ggml_cgraph_eval_order order;
-
-        // performance
-        int     perf_runs;
-        int64_t perf_cycles;
-        int64_t perf_time_us;
     };

     // scratch buffer

@@ -673,28 +664,6 @@ extern "C" {
         bool no_alloc; // don't allocate memory for the tensor data
     };

-
-    // compute types
-
-    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
-    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
-    enum ggml_task_type {
-        GGML_TASK_TYPE_INIT = 0,
-        GGML_TASK_TYPE_COMPUTE,
-        GGML_TASK_TYPE_FINALIZE,
-    };
-
-    struct ggml_compute_params {
-        enum ggml_task_type type;
-
-        // ith = thread index, nth = number of threads
-        int ith, nth;
-
-        // work buffer for all threads
-        size_t wsize;
-        void * wdata;
-    };
-
     // numa strategies
     enum ggml_numa_strategy {
         GGML_NUMA_STRATEGY_DISABLED = 0,
@@ -49,6 +49,7 @@ class Keys:
         EXPERT_WEIGHTS_SCALE   = "{arch}.expert_weights_scale"
         POOLING_TYPE           = "{arch}.pooling_type"
         LOGIT_SCALE            = "{arch}.logit_scale"
+        DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"

     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"

@@ -62,6 +63,7 @@ class Keys:
         CAUSAL            = "{arch}.attention.causal"
         Q_LORA_RANK       = "{arch}.attention.q_lora_rank"
         KV_LORA_RANK      = "{arch}.attention.kv_lora_rank"
+        REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"

     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"

@@ -73,6 +75,11 @@ class Keys:
         SCALING_FINETUNED    = "{arch}.rope.scaling.finetuned"
         SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"

+    class Split:
+        LLM_KV_SPLIT_NO            = "split.no"
+        LLM_KV_SPLIT_COUNT         = "split.count"
+        LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"
+
     class SSM:
         CONV_KERNEL = "{arch}.ssm.conv_kernel"
         INNER_SIZE  = "{arch}.ssm.inner_size"

@@ -97,6 +104,8 @@ class Keys:
         ADD_BOS              = "tokenizer.ggml.add_bos_token"
         ADD_EOS              = "tokenizer.ggml.add_eos_token"
         ADD_PREFIX           = "tokenizer.ggml.add_space_prefix"
+        REMOVE_EXTRA_WS      = "tokenizer.ggml.remove_extra_whitespaces"
+        PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
         HF_JSON              = "tokenizer.huggingface.json"
         RWKV                 = "tokenizer.rwkv.world"
         CHAT_TEMPLATE        = "tokenizer.chat_template"

@@ -149,6 +158,8 @@ class MODEL_ARCH(IntEnum):
     OLMO      = auto()
     ARCTIC    = auto()
     DEEPSEEK2 = auto()
+    BITNET    = auto()
+    T5        = auto()


 class MODEL_TENSOR(IntEnum):

@@ -200,6 +211,36 @@ class MODEL_TENSOR(IntEnum):
     ATTN_KV_B            = auto()
    ATTN_Q_A_NORM        = auto()
     ATTN_KV_A_NORM       = auto()
+    FFN_SUB_NORM         = auto()
+    ATTN_SUB_NORM        = auto()
+    DEC_ATTN_NORM        = auto()
+    DEC_ATTN_Q           = auto()
+    DEC_ATTN_K           = auto()
+    DEC_ATTN_V           = auto()
+    DEC_ATTN_OUT         = auto()
+    DEC_ATTN_REL_B       = auto()
+    DEC_CROSS_ATTN_NORM  = auto()
+    DEC_CROSS_ATTN_Q     = auto()
+    DEC_CROSS_ATTN_K     = auto()
+    DEC_CROSS_ATTN_V     = auto()
+    DEC_CROSS_ATTN_OUT   = auto()
+    DEC_CROSS_ATTN_REL_B = auto()
+    DEC_FFN_NORM         = auto()
+    DEC_FFN_GATE         = auto()
+    DEC_FFN_DOWN         = auto()
+    DEC_FFN_UP           = auto()
+    DEC_OUTPUT_NORM      = auto()
+    ENC_ATTN_NORM        = auto()
+    ENC_ATTN_Q           = auto()
+    ENC_ATTN_K           = auto()
+    ENC_ATTN_V           = auto()
+    ENC_ATTN_OUT         = auto()
+    ENC_ATTN_REL_B       = auto()
+    ENC_FFN_NORM         = auto()
+    ENC_FFN_GATE         = auto()
+    ENC_FFN_DOWN         = auto()
+    ENC_FFN_UP           = auto()
+    ENC_OUTPUT_NORM      = auto()


 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {

@@ -237,6 +278,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.OLMO:      "olmo",
     MODEL_ARCH.ARCTIC:    "arctic",
     MODEL_ARCH.DEEPSEEK2: "deepseek2",
+    MODEL_ARCH.BITNET:    "bitnet",
+    MODEL_ARCH.T5:        "t5",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {

@@ -288,6 +331,36 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.ATTN_KV_B:            "blk.{bid}.attn_kv_b",
     MODEL_TENSOR.ATTN_Q_A_NORM:        "blk.{bid}.attn_q_a_norm",
     MODEL_TENSOR.ATTN_KV_A_NORM:       "blk.{bid}.attn_kv_a_norm",
+    MODEL_TENSOR.ATTN_SUB_NORM:        "blk.{bid}.attn_sub_norm",
+    MODEL_TENSOR.FFN_SUB_NORM:         "blk.{bid}.ffn_sub_norm",
+    MODEL_TENSOR.DEC_ATTN_NORM:        "dec.blk.{bid}.attn_norm",
+    MODEL_TENSOR.DEC_ATTN_Q:           "dec.blk.{bid}.attn_q",
+    MODEL_TENSOR.DEC_ATTN_K:           "dec.blk.{bid}.attn_k",
+    MODEL_TENSOR.DEC_ATTN_V:           "dec.blk.{bid}.attn_v",
+    MODEL_TENSOR.DEC_ATTN_OUT:         "dec.blk.{bid}.attn_o",
+    MODEL_TENSOR.DEC_ATTN_REL_B:       "dec.blk.{bid}.attn_rel_b",
+    MODEL_TENSOR.DEC_CROSS_ATTN_NORM:  "dec.blk.{bid}.cross_attn_norm",
+    MODEL_TENSOR.DEC_CROSS_ATTN_Q:     "dec.blk.{bid}.cross_attn_q",
+    MODEL_TENSOR.DEC_CROSS_ATTN_K:     "dec.blk.{bid}.cross_attn_k",
+    MODEL_TENSOR.DEC_CROSS_ATTN_V:     "dec.blk.{bid}.cross_attn_v",
+    MODEL_TENSOR.DEC_CROSS_ATTN_OUT:   "dec.blk.{bid}.cross_attn_o",
+    MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
+    MODEL_TENSOR.DEC_FFN_NORM:         "dec.blk.{bid}.ffn_norm",
+    MODEL_TENSOR.DEC_FFN_GATE:         "dec.blk.{bid}.ffn_gate",
+    MODEL_TENSOR.DEC_FFN_DOWN:         "dec.blk.{bid}.ffn_down",
+    MODEL_TENSOR.DEC_FFN_UP:           "dec.blk.{bid}.ffn_up",
+    MODEL_TENSOR.DEC_OUTPUT_NORM:      "dec.output_norm",
+    MODEL_TENSOR.ENC_ATTN_NORM:        "enc.blk.{bid}.attn_norm",
+    MODEL_TENSOR.ENC_ATTN_Q:           "enc.blk.{bid}.attn_q",
+    MODEL_TENSOR.ENC_ATTN_K:           "enc.blk.{bid}.attn_k",
+    MODEL_TENSOR.ENC_ATTN_V:           "enc.blk.{bid}.attn_v",
+    MODEL_TENSOR.ENC_ATTN_OUT:         "enc.blk.{bid}.attn_o",
+    MODEL_TENSOR.ENC_ATTN_REL_B:       "enc.blk.{bid}.attn_rel_b",
+    MODEL_TENSOR.ENC_FFN_NORM:         "enc.blk.{bid}.ffn_norm",
+    MODEL_TENSOR.ENC_FFN_GATE:         "enc.blk.{bid}.ffn_gate",
+    MODEL_TENSOR.ENC_FFN_DOWN:         "enc.blk.{bid}.ffn_down",
+    MODEL_TENSOR.ENC_FFN_UP:           "enc.blk.{bid}.ffn_up",
+    MODEL_TENSOR.ENC_OUTPUT_NORM:      "enc.output_norm",
 }

 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {

@@ -808,6 +881,53 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
+    MODEL_ARCH.BITNET: [
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_SUB_NORM,
+        MODEL_TENSOR.FFN_SUB_NORM,
+    ],
+    MODEL_ARCH.T5: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.DEC_ATTN_NORM,
+        MODEL_TENSOR.DEC_ATTN_Q,
+        MODEL_TENSOR.DEC_ATTN_K,
+        MODEL_TENSOR.DEC_ATTN_V,
+        MODEL_TENSOR.DEC_ATTN_OUT,
+        MODEL_TENSOR.DEC_ATTN_REL_B,
+        MODEL_TENSOR.DEC_CROSS_ATTN_NORM,
+        MODEL_TENSOR.DEC_CROSS_ATTN_Q,
+        MODEL_TENSOR.DEC_CROSS_ATTN_K,
+        MODEL_TENSOR.DEC_CROSS_ATTN_V,
+        MODEL_TENSOR.DEC_CROSS_ATTN_OUT,
+        MODEL_TENSOR.DEC_CROSS_ATTN_REL_B,
+        MODEL_TENSOR.DEC_FFN_NORM,
+        MODEL_TENSOR.DEC_FFN_GATE,
+        MODEL_TENSOR.DEC_FFN_DOWN,
+        MODEL_TENSOR.DEC_FFN_UP,
+        MODEL_TENSOR.DEC_OUTPUT_NORM,
+        MODEL_TENSOR.ENC_ATTN_NORM,
+        MODEL_TENSOR.ENC_ATTN_Q,
+        MODEL_TENSOR.ENC_ATTN_K,
+        MODEL_TENSOR.ENC_ATTN_V,
+        MODEL_TENSOR.ENC_ATTN_OUT,
+        MODEL_TENSOR.ENC_ATTN_REL_B,
+        MODEL_TENSOR.ENC_FFN_NORM,
+        MODEL_TENSOR.ENC_FFN_GATE,
+        MODEL_TENSOR.ENC_FFN_DOWN,
+        MODEL_TENSOR.ENC_FFN_UP,
+        MODEL_TENSOR.ENC_OUTPUT_NORM,
+    ],
     # TODO
 }
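A quick illustration (not part of the diff, assuming the gguf-py package from this repo is importable) of how the new encoder/decoder tensor names are templated with the block id:

```python
from gguf.constants import MODEL_TENSOR, TENSOR_NAMES

# The "{bid}" placeholder in TENSOR_NAMES is filled in per block.
print(TENSOR_NAMES[MODEL_TENSOR.DEC_CROSS_ATTN_Q].format(bid=3))  # dec.blk.3.cross_attn_q
print(TENSOR_NAMES[MODEL_TENSOR.ENC_FFN_UP].format(bid=0))        # enc.blk.0.ffn_up
```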
@ -7,6 +7,7 @@ import struct
|
||||||
import tempfile
|
import tempfile
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from enum import Enum, auto
|
from enum import Enum, auto
|
||||||
|
from pathlib import Path
|
||||||
from io import BufferedWriter
|
from io import BufferedWriter
|
||||||
from typing import IO, Any, Sequence, Mapping
|
from typing import IO, Any, Sequence, Mapping
|
||||||
from string import ascii_letters, digits
|
from string import ascii_letters, digits
|
||||||
|
@ -31,6 +32,9 @@ from .quants import quant_shape_from_byte_shape
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class TensorInfo:
|
class TensorInfo:
|
||||||
shape: Sequence[int]
|
shape: Sequence[int]
|
||||||
|
@ -55,11 +59,11 @@ class WriterState(Enum):
|
||||||
|
|
||||||
|
|
||||||
class GGUFWriter:
|
class GGUFWriter:
|
||||||
fout: BufferedWriter | None
|
fout: list[BufferedWriter] | None
|
||||||
path: os.PathLike[str] | str | None
|
path: Path | None
|
||||||
temp_file: tempfile.SpooledTemporaryFile[bytes] | None
|
temp_file: tempfile.SpooledTemporaryFile[bytes] | None
|
||||||
tensors: dict[str, TensorInfo]
|
tensors: list[dict[str, TensorInfo]]
|
||||||
kv_data: dict[str, GGUFValue]
|
kv_data: list[dict[str, GGUFValue]]
|
||||||
state: WriterState
|
state: WriterState
|
||||||
_simple_value_packing = {
|
_simple_value_packing = {
|
||||||
GGUFValueType.UINT8: "B",
|
GGUFValueType.UINT8: "B",
|
||||||
|
@ -76,26 +80,38 @@ class GGUFWriter:
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False,
|
self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE,
|
||||||
endianess: GGUFEndian = GGUFEndian.LITTLE,
|
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False
|
||||||
):
|
):
|
||||||
self.fout = None
|
self.fout = None
|
||||||
self.path = path
|
self.path = Path(path) if path else None
|
||||||
self.arch = arch
|
self.arch = arch
|
||||||
self.endianess = endianess
|
self.endianess = endianess
|
||||||
self.data_alignment = GGUF_DEFAULT_ALIGNMENT
|
self.data_alignment = GGUF_DEFAULT_ALIGNMENT
|
||||||
self.use_temp_file = use_temp_file
|
self.use_temp_file = use_temp_file
|
||||||
self.temp_file = None
|
self.temp_file = None
|
||||||
self.tensors = dict()
|
self.tensors = [{}]
|
||||||
self.kv_data = dict()
|
self.kv_data = [{}]
|
||||||
|
self.split_max_tensors = split_max_tensors
|
||||||
|
self.split_max_size = split_max_size
|
||||||
|
self.dry_run = dry_run
|
||||||
|
self.small_first_shard = small_first_shard
|
||||||
logger.info("gguf: This GGUF file is for {0} Endian only".format(
|
logger.info("gguf: This GGUF file is for {0} Endian only".format(
|
||||||
"Big" if self.endianess == GGUFEndian.BIG else "Little",
|
"Big" if self.endianess == GGUFEndian.BIG else "Little",
|
||||||
))
|
))
|
||||||
self.state = WriterState.NO_FILE
|
self.state = WriterState.NO_FILE
|
||||||
|
|
||||||
|
if self.small_first_shard:
|
||||||
|
self.tensors.append({})
|
||||||
|
|
||||||
self.add_architecture()
|
self.add_architecture()
|
||||||
|
|
||||||
def open_output_file(self, path: os.PathLike[str] | str | None = None) -> None:
|
def format_shard_names(self, path: Path) -> list[Path]:
|
||||||
|
if len(self.tensors) == 1:
|
||||||
|
return [path]
|
||||||
|
return [path.with_name(SHARD_NAME_FORMAT.format(path.stem, i + 1, len(self.tensors))) for i in range(len(self.tensors))]
|
||||||
|
|
||||||
|
def open_output_file(self, path: Path | None = None) -> None:
|
||||||
if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path):
|
if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path):
|
||||||
# allow calling this multiple times as long as the path is the same
|
# allow calling this multiple times as long as the path is the same
|
||||||
return
|
return
|
||||||
|
@ -106,22 +122,58 @@ class GGUFWriter:
|
||||||
self.path = path
|
self.path = path
|
||||||
|
|
||||||
if self.path is not None:
|
if self.path is not None:
|
||||||
if self.fout is not None:
|
filenames = self.print_plan()
|
||||||
self.fout.close()
|
self.fout = [open(filename, "wb") for filename in filenames]
|
||||||
self.fout = open(self.path, "wb")
|
|
||||||
self.state = WriterState.EMPTY
|
self.state = WriterState.EMPTY
|
||||||
|
|
||||||
def write_header_to_file(self, path: os.PathLike[str] | str | None = None) -> None:
|
def print_plan(self) -> list[Path]:
|
||||||
|
logger.info("Writing the following files:")
|
||||||
|
assert self.path is not None
|
||||||
|
filenames = self.format_shard_names(self.path)
|
||||||
|
assert len(filenames) == len(self.tensors)
|
||||||
|
for name, tensors in zip(filenames, self.tensors):
|
||||||
|
logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}")
|
||||||
|
|
||||||
|
if self.dry_run:
|
||||||
|
logger.info("Dry run, not writing files")
|
||||||
|
exit()
|
||||||
|
|
||||||
|
return filenames
|
||||||
|
|
||||||
+    def add_shard_kv_data(self) -> None:
+        if len(self.tensors) == 1:
+            return
+
+        total_tensors = sum(len(t) for t in self.tensors)
+        assert self.fout is not None
+        total_splits = len(self.fout)
+        self.kv_data.extend({} for _ in range(len(self.kv_data), total_splits))
+        for i, kv_data in enumerate(self.kv_data):
+            kv_data[Keys.Split.LLM_KV_SPLIT_NO] = GGUFValue(i, GGUFValueType.UINT16)
+            kv_data[Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(total_splits, GGUFValueType.UINT16)
+            kv_data[Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(total_tensors, GGUFValueType.INT32)
+
+    def write_header_to_file(self, path: Path | None = None) -> None:
+        if len(self.tensors) == 1 and (self.split_max_tensors != 0 or self.split_max_size != 0):
+            logger.warning("Model fails split requirements, not splitting")
+
        self.open_output_file(path)

        if self.state is not WriterState.EMPTY:
            raise ValueError(f'Expected output file to be empty, got {self.state}')

-        self._write_packed("<I", GGUF_MAGIC, skip_pack_prefix = True)
-        self._write_packed("I", GGUF_VERSION)
-        self._write_packed("Q", len(self.tensors))
-        self._write_packed("Q", len(self.kv_data))
-        self.flush()
+        assert self.fout is not None
+        assert len(self.fout) == len(self.tensors)
+        assert len(self.kv_data) == 1
+
+        self.add_shard_kv_data()
+
+        for fout, tensors, kv_data in zip(self.fout, self.tensors, self.kv_data):
+            fout.write(self._pack("<I", GGUF_MAGIC, skip_pack_prefix = True))
+            fout.write(self._pack("I", GGUF_VERSION))
+            fout.write(self._pack("Q", len(tensors)))
+            fout.write(self._pack("Q", len(kv_data)))
+            fout.flush()
        self.state = WriterState.HEADER
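For a concrete picture of what add_shard_kv_data() records, here is a standalone sketch for a three-shard plan; the literal key strings stand in for the Keys.Split.* constants and are assumptions:

tensors_per_shard = [0, 120, 103]       # e.g. small_first_shard=True keeps shard 0 metadata-only
total_tensors = sum(tensors_per_shard)  # 223
total_splits = len(tensors_per_shard)   # 3

for i in range(total_splits):
    shard_kv = {
        "split.no": i,                         # UINT16
        "split.count": total_splits,           # UINT16
        "split.tensors.count": total_tensors,  # INT32
    }
    print(i, shard_kv)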

    def write_kv_data_to_file(self) -> None:
@ -129,13 +181,15 @@ class GGUFWriter:
            raise ValueError(f'Expected output file to contain the header, got {self.state}')
        assert self.fout is not None

-        kv_data = bytearray()
+        for fout, kv_data in zip(self.fout, self.kv_data):
+            kv_bytes = bytearray()

-        for key, val in self.kv_data.items():
-            kv_data += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
-            kv_data += self._pack_val(val.value, val.type, add_vtype=True)
+            for key, val in kv_data.items():
+                kv_bytes += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
+                kv_bytes += self._pack_val(val.value, val.type, add_vtype=True)

-        self.fout.write(kv_data)
+            fout.write(kv_bytes)

        self.flush()
        self.state = WriterState.KV_DATA
@ -144,28 +198,29 @@ class GGUFWriter:
            raise ValueError(f'Expected output file to contain KV data, got {self.state}')
        assert self.fout is not None

+        for fout, tensors in zip(self.fout, self.tensors):
            ti_data = bytearray()
            offset_tensor = 0

-            for name, ti in self.tensors.items():
+            for name, ti in tensors.items():
                ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False)
                n_dims = len(ti.shape)
                ti_data += self._pack("I", n_dims)
-                for i in range(n_dims):
-                    ti_data += self._pack("Q", ti.shape[n_dims - 1 - i])
+                for j in range(n_dims):
+                    ti_data += self._pack("Q", ti.shape[n_dims - 1 - j])
                ti_data += self._pack("I", ti.dtype)
                ti_data += self._pack("Q", offset_tensor)
                offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment)

-            self.fout.write(ti_data)
-            self.flush()
+            fout.write(ti_data)
+            fout.flush()
        self.state = WriterState.TI_DATA
    def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None:
-        if key in self.kv_data:
+        if any(key in kv_data for kv_data in self.kv_data):
            raise ValueError(f'Duplicated key name {key!r}')

-        self.kv_data[key] = GGUFValue(value=val, type=vtype)
+        self.kv_data[0][key] = GGUFValue(value=val, type=vtype)

    def add_uint8(self, key: str, val: int) -> None:
        self.add_key_value(key,val, GGUFValueType.UINT8)
@ -206,9 +261,6 @@ class GGUFWriter:
        self.add_key_value(key, val, GGUFValueType.STRING)

    def add_array(self, key: str, val: Sequence[Any]) -> None:
-        if not isinstance(val, Sequence):
-            raise ValueError("Value must be a sequence for array type")
-
        self.add_key_value(key, val, GGUFValueType.ARRAY)
    @staticmethod
@ -222,7 +274,7 @@ class GGUFWriter:
        if self.state is not WriterState.NO_FILE:
            raise ValueError(f'Expected output file to be not yet opened, got {self.state}')

-        if name in self.tensors:
+        if any(name in tensors for tensors in self.tensors):
            raise ValueError(f'Duplicated tensor name {name!r}')

        if raw_dtype is None:
@ -247,7 +299,18 @@ class GGUFWriter:
            if tensor_dtype == np.uint8:
                tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)

-        self.tensors[name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes)
+        # make sure there is at least one tensor before splitting
+        if len(self.tensors[-1]) > 0:
+            if ( # split when over tensor limit
+                self.split_max_tensors != 0
+                and len(self.tensors[-1]) >= self.split_max_tensors
+            ) or ( # split when over size limit
+                self.split_max_size != 0
+                and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size
+            ):
+                self.tensors.append({})
+
+        self.tensors[-1][name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes)
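The split condition above can be restated as a small standalone predicate (a sketch, not the library code): a new shard is opened only when the current shard already holds at least one tensor and either limit would be exceeded.

def needs_new_shard(cur_count: int, cur_bytes: int, new_bytes: int,
                    split_max_tensors: int, split_max_size: int) -> bool:
    if cur_count == 0:
        return False  # never split an empty shard; every shard keeps at least one tensor
    over_tensor_limit = split_max_tensors != 0 and cur_count >= split_max_tensors
    over_size_limit   = split_max_size != 0 and cur_bytes + new_bytes > split_max_size
    return over_tensor_limit or over_size_limit

print(needs_new_shard(128, 10**9, 4096, 128, 0))                # True: tensor-count limit reached
print(needs_new_shard(10, 2 * 10**9, 5 * 10**8, 0, 2 * 10**9))  # True: size limit exceeded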
    def add_tensor(
        self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
@ -264,7 +327,7 @@ class GGUFWriter:
        self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype)

        if self.temp_file is None:
-            self.tensors[name].tensor = tensor
+            self.tensors[-1][name].tensor = tensor
            return

        tensor.tofile(self.temp_file)
@ -282,9 +345,24 @@ class GGUFWriter:

        if self.endianess == GGUFEndian.BIG:
            tensor.byteswap(inplace=True)
-        self.write_padding(self.fout, self.fout.tell())
-        tensor.tofile(self.fout)
-        self.write_padding(self.fout, tensor.nbytes)
+
+        file_id = -1
+        for i, tensors in enumerate(self.tensors):
+            if len(tensors) > 0:
+                file_id = i
+                break
+
+        fout = self.fout[file_id]
+
+        # pop the first tensor info
+        # TODO: cleaner way to get the first key
+        first_tensor_name = [name for name, _ in zip(self.tensors[file_id].keys(), range(1))][0]
+        ti = self.tensors[file_id].pop(first_tensor_name)
+        assert ti.nbytes == tensor.nbytes
+
+        self.write_padding(fout, fout.tell())
+        tensor.tofile(fout)
+        self.write_padding(fout, tensor.nbytes)

        self.state = WriterState.WEIGHTS
@ -293,31 +371,43 @@ class GGUFWriter:

        assert self.fout is not None

-        self.write_padding(self.fout, self.fout.tell())
+        for fout in self.fout:
+            self.write_padding(fout, fout.tell())

        if self.temp_file is None:
+            shard_bar = None
            bar = None

            if progress:
                from tqdm import tqdm

-                total_bytes = sum(t.nbytes for t in self.tensors.values())
+                total_bytes = sum(ti.nbytes for t in self.tensors for ti in t.values())

+                if len(self.fout) > 1:
+                    shard_bar = tqdm(desc=f"Shard (0/{len(self.fout)})", total=None, unit="byte", unit_scale=True)
                bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)

+            for i, (fout, tensors) in enumerate(zip(self.fout, self.tensors)):
+                if shard_bar is not None:
+                    shard_bar.set_description(f"Shard ({i + 1}/{len(self.fout)})")
+                    total = sum(ti.nbytes for ti in tensors.values())
+                    shard_bar.reset(total=(total if total > 0 else None))

                # relying on the fact that Python dicts preserve insertion order (since 3.7)
-                for ti in self.tensors.values():
+                for ti in tensors.values():
                    assert ti.tensor is not None  # can only iterate once over the tensors
                    assert ti.tensor.nbytes == ti.nbytes
-                    ti.tensor.tofile(self.fout)
+                    ti.tensor.tofile(fout)
+                    if shard_bar is not None:
+                        shard_bar.update(ti.nbytes)
                    if bar is not None:
                        bar.update(ti.nbytes)
-                    self.write_padding(self.fout, ti.nbytes)
+                    self.write_padding(fout, ti.nbytes)
                    ti.tensor = None
        else:
            self.temp_file.seek(0)

-            shutil.copyfileobj(self.temp_file, self.fout)
+            shutil.copyfileobj(self.temp_file, self.fout[0 if not self.small_first_shard else 1])
            self.flush()
            self.temp_file.close()
@ -325,11 +415,13 @@ class GGUFWriter:

    def flush(self) -> None:
        assert self.fout is not None
-        self.fout.flush()
+        for fout in self.fout:
+            fout.flush()

    def close(self) -> None:
        if self.fout is not None:
-            self.fout.close()
+            for fout in self.fout:
+                fout.close()
            self.fout = None

    def add_architecture(self) -> None:
@ -400,6 +492,9 @@ class GGUFWriter:
    def add_parallel_residual(self, use: bool) -> None:
        self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

+    def add_decoder_start_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id)
+
    def add_head_count(self, count: int) -> None:
        self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
@ -448,6 +543,9 @@ class GGUFWriter:
    def add_kv_lora_rank(self, length: int) -> None:
        self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)

+    def add_relative_attn_buckets_count(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value)
+
    def add_pooling_type(self, value: PoolingType) -> None:
        self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
@ -538,6 +636,12 @@ class GGUFWriter:
    def add_add_space_prefix(self, value: bool) -> None:
        self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)

+    def add_remove_extra_whitespaces(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.REMOVE_EXTRA_WS, value)
+
+    def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
+        self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)
+
    def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
        if not isinstance(value, str):
            template_default = None
@ -599,6 +703,9 @@ class GGUFWriter:
            kv_data += self._pack("Q", len(encoded_val))
            kv_data += encoded_val
        elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
+            if isinstance(val, bytes):
+                ltype = GGUFValueType.UINT8
+            else:
                ltype = GGUFValueType.get_type(val[0])
                if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
                    raise ValueError("All items in a GGUF array should be of the same type")
@ -611,6 +718,13 @@ class GGUFWriter:

        return kv_data

-    def _write_packed(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None:
-        assert self.fout is not None
-        self.fout.write(self._pack(fmt, value, skip_pack_prefix))
+    @staticmethod
+    def format_n_bytes_to_str(num: int) -> str:
+        if num == 0:
+            return "negligible - metadata only"
+        fnum = float(num)
+        for unit in ("", "K", "M", "G"):
+            if abs(fnum) < 1000.0:
+                return f"{fnum:3.1f}{unit}"
+            fnum /= 1000.0
+        return f"{fnum:.1f}T - over 1TB, split recommended"
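The new format_n_bytes_to_str() helper is small enough to check by hand; reproduced here as a standalone sketch with a couple of sample values:

def format_n_bytes_to_str(num: int) -> str:
    if num == 0:
        return "negligible - metadata only"
    fnum = float(num)
    for unit in ("", "K", "M", "G"):
        if abs(fnum) < 1000.0:
            return f"{fnum:3.1f}{unit}"
        fnum /= 1000.0
    return f"{fnum:.1f}T - over 1TB, split recommended"

print(format_n_bytes_to_str(0))              # negligible - metadata only
print(format_n_bytes_to_str(7_200_000_000))  # 7.2G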
@ -24,6 +24,7 @@ class TensorNameMap:
            "backbone.embedding",        # mamba
            "backbone.embeddings",       # mamba-hf
            "transformer.in_out_embed",  # Grok
+           "shared",                    # t5
        ),

        # Token type embeddings
@ -413,6 +414,128 @@ class TensorNameMap:
        MODEL_TENSOR.ATTN_KV_A_NORM: (
            "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
        ),
+
+       MODEL_TENSOR.ATTN_SUB_NORM: (
+           "model.layers.{bid}.self_attn.inner_attn_ln", # bitnet
+       ),
+       MODEL_TENSOR.FFN_SUB_NORM: (
+           "model.layers.{bid}.mlp.ffn_layernorm", # bitnet
+       ),
+       MODEL_TENSOR.DEC_ATTN_NORM: (
+           "decoder.block.{bid}.layer.0.layer_norm", # t5
+       ),
+       MODEL_TENSOR.DEC_ATTN_Q: (
+           "decoder.block.{bid}.layer.0.SelfAttention.q", # t5
+       ),
+       MODEL_TENSOR.DEC_ATTN_K: (
+           "decoder.block.{bid}.layer.0.SelfAttention.k", # t5
+       ),
+       MODEL_TENSOR.DEC_ATTN_V: (
+           "decoder.block.{bid}.layer.0.SelfAttention.v", # t5
+       ),
+       MODEL_TENSOR.DEC_ATTN_OUT: (
+           "decoder.block.{bid}.layer.0.SelfAttention.o", # t5
+       ),
+       MODEL_TENSOR.DEC_ATTN_REL_B: (
+           "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
+       ),
+       MODEL_TENSOR.DEC_CROSS_ATTN_NORM: (
+           "decoder.block.{bid}.layer.1.layer_norm", # t5
+       ),
+       MODEL_TENSOR.DEC_CROSS_ATTN_Q: (
+           "decoder.block.{bid}.layer.1.EncDecAttention.q", # t5
+       ),
+       MODEL_TENSOR.DEC_CROSS_ATTN_K: (
+           "decoder.block.{bid}.layer.1.EncDecAttention.k", # t5
+       ),
+       MODEL_TENSOR.DEC_CROSS_ATTN_V: (
+           "decoder.block.{bid}.layer.1.EncDecAttention.v", # t5
+       ),
+       MODEL_TENSOR.DEC_CROSS_ATTN_OUT: (
+           "decoder.block.{bid}.layer.1.EncDecAttention.o", # t5
+       ),
+       MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: (
+           "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5
+       ),
+       MODEL_TENSOR.DEC_FFN_NORM: (
+           "decoder.block.{bid}.layer.2.layer_norm", # t5
+       ),
+       MODEL_TENSOR.DEC_FFN_GATE: (
+           "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5
+       ),
+       MODEL_TENSOR.DEC_FFN_UP: (
+           "decoder.block.{bid}.layer.2.DenseReluDense.wi",   # t5
+           "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5
+       ),
+       MODEL_TENSOR.DEC_FFN_DOWN: (
+           "decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5
+       ),
+       MODEL_TENSOR.DEC_OUTPUT_NORM: (
+           "decoder.final_layer_norm", # t5
+       ),
+       MODEL_TENSOR.ENC_ATTN_NORM: (
+           "encoder.block.{bid}.layer.0.layer_norm", # t5
+       ),
+       MODEL_TENSOR.ENC_ATTN_Q: (
+           "encoder.block.{bid}.layer.0.SelfAttention.q", # t5
+       ),
+       MODEL_TENSOR.ENC_ATTN_K: (
+           "encoder.block.{bid}.layer.0.SelfAttention.k", # t5
+       ),
+       MODEL_TENSOR.ENC_ATTN_V: (
+           "encoder.block.{bid}.layer.0.SelfAttention.v", # t5
+       ),
+       MODEL_TENSOR.ENC_ATTN_OUT: (
+           "encoder.block.{bid}.layer.0.SelfAttention.o", # t5
+       ),
+       MODEL_TENSOR.ENC_ATTN_REL_B: (
+           "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
+       ),
+       MODEL_TENSOR.ENC_FFN_NORM: (
+           "encoder.block.{bid}.layer.1.layer_norm", # t5
+       ),
+       MODEL_TENSOR.ENC_FFN_GATE: (
+           "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5
+       ),
+       MODEL_TENSOR.ENC_FFN_UP: (
+           "encoder.block.{bid}.layer.1.DenseReluDense.wi",   # t5
+           "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5
+       ),
+       MODEL_TENSOR.ENC_FFN_DOWN: (
+           "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
+       ),
+       MODEL_TENSOR.ENC_OUTPUT_NORM: (
+           "encoder.final_layer_norm", # t5
+       ),
    }

    # architecture-specific block mappings
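The new entries are name templates keyed by MODEL_TENSOR; a "{bid}" placeholder is filled with the block index when checkpoint names are matched. A schematic illustration of the matching idea (not the TensorNameMap API itself):

dec_attn_q_templates = ("decoder.block.{bid}.layer.0.SelfAttention.q",)  # t5

def matches(name: str, templates: tuple, bid: int) -> bool:
    return any(name == t.format(bid=bid) for t in templates)

print(matches("decoder.block.7.layer.0.SelfAttention.q", dec_attn_q_templates, 7))  # True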
@ -208,7 +208,9 @@ def translate_tensor_name(name):
        'ssm_d': 'State space model skip connection',
        'ssm_dt': 'State space model time step',
        'ssm_out': 'State space model output projection',
-       'blk': 'Block'
+       'blk': 'Block',
+       'enc': 'Encoder',
+       'dec': 'Decoder',
    }

    expanded_words = []
@ -291,6 +293,10 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
            tensor_group_name = "base"
            if tensor_components[0] == 'blk':
                tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}"
+           elif tensor_components[0] in ['enc', 'dec'] and tensor_components[1] == 'blk':
+               tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}.{tensor_components[2]}"
+           elif tensor_components[0] in ['enc', 'dec']:
+               tensor_group_name = f"{tensor_components[0]}"

            # Check if new Tensor Group
            if tensor_group_name not in tensor_groups:
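A worked example of the grouping rule added above, as a standalone sketch:

def tensor_group_name(name: str) -> str:
    parts = name.split('.')
    group = "base"
    if parts[0] == 'blk':
        group = f"{parts[0]}.{parts[1]}"
    elif parts[0] in ['enc', 'dec'] and parts[1] == 'blk':
        group = f"{parts[0]}.{parts[1]}.{parts[2]}"
    elif parts[0] in ['enc', 'dec']:
        group = f"{parts[0]}"
    return group

print(tensor_group_name("enc.blk.0.attn_q.weight"))  # enc.blk.0
print(tensor_group_name("dec.output_norm.weight"))   # dec
print(tensor_group_name("blk.3.ffn_up.weight"))      # blk.3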
241 llama.cpp
@ -225,6 +225,7 @@ enum llm_arch {
    LLM_ARCH_OLMO,
    LLM_ARCH_ARCTIC,
    LLM_ARCH_DEEPSEEK2,
+   LLM_ARCH_BITNET,
    LLM_ARCH_UNKNOWN,
};
@ -263,6 +264,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_OLMO,      "olmo"      },
    { LLM_ARCH_ARCTIC,    "arctic"    },
    { LLM_ARCH_DEEPSEEK2, "deepseek2" },
+   { LLM_ARCH_BITNET,    "bitnet"    },
    { LLM_ARCH_UNKNOWN,   "(unknown)" },
};
@ -500,6 +502,8 @@ enum llm_tensor {
    LLM_TENSOR_ATTN_KV_B,
    LLM_TENSOR_ATTN_Q_A_NORM,
    LLM_TENSOR_ATTN_KV_A_NORM,
+   LLM_TENSOR_ATTN_SUB_NORM,
+   LLM_TENSOR_FFN_SUB_NORM,
};

static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@ -1113,6 +1117,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
        },
    },
+   {
+       LLM_ARCH_BITNET,
+       {
+           { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
+           { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
+           { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
+           { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
+           { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
+           { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+           { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
+           { LLM_TENSOR_ATTN_SUB_NORM, "blk.%d.attn_sub_norm" },
+           { LLM_TENSOR_FFN_GATE,      "blk.%d.ffn_gate" },
+           { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
+           { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
+           { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
+           { LLM_TENSOR_FFN_SUB_NORM,  "blk.%d.ffn_sub_norm" },
+       },
+   },
    {
        LLM_ARCH_UNKNOWN,
        {
@ -2118,6 +2140,8 @@ struct llama_layer {
    struct ggml_tensor * attn_out_norm_b;
    struct ggml_tensor * attn_q_a_norm;
    struct ggml_tensor * attn_kv_a_norm;
+   struct ggml_tensor * attn_sub_norm;
+   struct ggml_tensor * ffn_sub_norm;

    // attention
    struct ggml_tensor * wq;
@ -2185,6 +2209,15 @@ struct llama_layer {
    // long rope factors
    struct ggml_tensor * rope_long  = nullptr;
    struct ggml_tensor * rope_short = nullptr;

+   // bitnet scale
+   struct ggml_tensor * wq_scale;
+   struct ggml_tensor * wk_scale;
+   struct ggml_tensor * wv_scale;
+   struct ggml_tensor * wo_scale;
+   struct ggml_tensor * ffn_gate_scale;
+   struct ggml_tensor * ffn_up_scale;
+   struct ggml_tensor * ffn_down_scale;
};

struct llama_kv_cell {
@ -4710,6 +4743,15 @@ static void llm_load_hparams(
                default: model.type = e_model::MODEL_UNKNOWN;
            }
        } break;
+       case LLM_ARCH_BITNET:
+           {
+               ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+               switch (hparams.n_layer) {
+                   case 26: model.type = e_model::MODEL_3B; break;
+                   default: model.type = e_model::MODEL_UNKNOWN;
+               }
+           } break;
        default: (void)0;
    }
@ -6655,6 +6697,44 @@ static bool llm_load_tensors(
                    }
                }
            } break;
+       case LLM_ARCH_BITNET:
+           {
+               model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+               // output
+               {
+                   model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+               }
+
+               for (int i = 0; i < n_layer; ++i) {
+                   ggml_context * ctx_layer = ctx_for_layer(i);
+                   ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                   auto & layer = model.layers[i];
+
+                   layer.attn_norm     = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM,     "weight", i), {n_embd});
+                   layer.attn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd});
+
+                   layer.wq       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
+                   layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "scale",  i), {1});
+                   layer.wk       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
+                   layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "scale",  i), {1});
+                   layer.wv       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
+                   layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "scale",  i), {1});
+                   layer.wo       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                   layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale",  i), {1});
+
+                   layer.ffn_norm     = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM,     "weight", i), {n_embd});
+                   layer.ffn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff});
+
+                   layer.ffn_gate       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                   layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale",  i), {1});
+                   layer.ffn_down       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                   layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale",  i), {1});
+                   layer.ffn_up         = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
+                   layer.ffn_up_scale   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "scale",  i), {1});
+               }
+           } break;
        default:
            throw std::runtime_error("unknown architecture");
    }
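A numeric sketch (NumPy, illustrative only) of the BitNet-specific pattern visible in the tensors loaded above: each projection weight has a one-element scale companion applied after the matmul, and an extra RMS sub-norm runs before the output projection.

import numpy as np

def rms_norm(x, eps=1e-5):
    return x / np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps)

rng = np.random.default_rng(0)
x  = rng.standard_normal((4, 8)).astype(np.float32)  # (tokens, n_embd)
wq = rng.standard_normal((8, 8)).astype(np.float32)  # stands in for the (quantized) attn_q weight
wq_scale = np.float32(0.02)                          # the {1}-shaped attn_q scale tensor

q = (x @ wq) * wq_scale  # ggml_mul_mat followed by ggml_mul with the scale
h = rms_norm(q)          # stand-in for the attn_sub_norm step before the output projection
print(h.shape)           # (4, 8)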
@ -7295,7 +7375,10 @@ static struct ggml_tensor * llm_build_kqv(

        ggml_build_forward_expand(graph, cur);

+       if (wo) {
            cur = ggml_mul_mat(ctx, wo, cur);
+       }

        if (wo_b) {
            cb(cur, "kqv_wo", il);
        }
@ -11709,6 +11792,153 @@ struct llm_build_context {
        return gf;
    }

+   struct ggml_cgraph * build_bitnet() {
+       struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+       const int64_t n_embd_head = hparams.n_embd_head_v;
+       GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+       struct ggml_tensor * cur;
+       struct ggml_tensor * inpL;
+
+       inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+       // inp_pos - contains the positions
+       struct ggml_tensor * inp_pos = build_inp_pos();
+
+       // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+       struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+       for (int il = 0; il < n_layer; ++il) {
+           struct ggml_tensor * inpSA = inpL;
+
+           cur = llm_build_norm(ctx0, inpL, hparams,
+                   model.layers[il].attn_norm, NULL,
+                   LLM_NORM_RMS, cb, il);
+           cb(cur, "attn_norm", il);
+
+           // self-attention
+           {
+               // compute Q and K and RoPE them
+               struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+               Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
+               cb(Qcur, "Qcur", il);
+               if (model.layers[il].bq) {
+                   Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                   cb(Qcur, "Qcur", il);
+               }
+
+               // B1.K
+               struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+               Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
+               cb(Kcur, "Kcur", il);
+               if (model.layers[il].bk) {
+                   Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                   cb(Kcur, "Kcur", il);
+               }
+
+               // B1.V
+               struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+               Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
+               cb(Vcur, "Vcur", il);
+               if (model.layers[il].bv) {
+                   Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                   cb(Vcur, "Vcur", il);
+               }
+
+               Qcur = ggml_rope_ext(
+                   ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                   n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                   ext_factor, attn_factor, beta_fast, beta_slow
+               );
+               cb(Qcur, "Qcur", il);
+
+               Kcur = ggml_rope_ext(
+                   ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                   n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                   ext_factor, attn_factor, beta_fast, beta_slow
+               );
+               cb(Kcur, "Kcur", il);
+
+               cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                       nullptr, nullptr,
+                       Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+
+               cur = llm_build_norm(ctx0, cur, hparams,
+                       model.layers[il].attn_sub_norm, NULL,
+                       LLM_NORM_RMS, cb, il);
+               cb(cur, "attn_sub_norm", il);
+
+               cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+               cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
+               if (model.layers[il].bo) {
+                   cur = ggml_add(ctx0, cur, model.layers[il].bo);
+               }
+               cb(cur, "attn_o_out", il);
+           }
+
+           if (il == n_layer - 1) {
+               // skip computing output for unused tokens
+               struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+               cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+               inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+           }
+
+           struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+           cb(ffn_inp, "ffn_inp", il);
+
+           // feed-forward forward
+           if (model.layers[il].ffn_gate_inp == nullptr) {
+               cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                       model.layers[il].ffn_norm, NULL,
+                       LLM_NORM_RMS, cb, il);
+               cb(cur, "ffn_norm", il);
+
+               struct ggml_tensor *tmp = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
+               tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_up_scale);
+               cb(tmp, "ffn_up", il);
+
+               cur = ggml_mul_mat(ctx0, model.layers[il].ffn_gate, cur);
+               cur = ggml_mul(ctx0, cur, model.layers[il].ffn_gate_scale);
+               cb(cur, "ffn_gate", il);
+
+               cur = ggml_silu(ctx0, cur);
+               cb(cur, "ffn_silu", il);
+
+               cur = ggml_mul(ctx0, cur, tmp);
+               cb(cur, "ffn_gate_par", il);
+
+               cur = llm_build_norm(ctx0, cur, hparams,
+                       model.layers[il].ffn_sub_norm, NULL,
+                       LLM_NORM_RMS, cb, il);
+               cb(cur, "ffn_sub_norm", il);
+
+               cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
+               cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+               cb(cur, "ffn_down", il);
+           }
+           cur = ggml_add(ctx0, cur, ffn_inp);
+           cb(cur, "l_out", il);
+
+           // input for next layer
+           inpL = cur;
+       }
+
+       cur = inpL;
+
+       cur = llm_build_norm(ctx0, cur, hparams,
+               model.output_norm, NULL,
+               LLM_NORM_RMS, cb, -1);
+       cb(cur, "result_norm", -1);
+
+       // lm_head
+       cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+       cb(cur, "result_output", -1);
+
+       ggml_build_forward_expand(gf, cur);
+       return gf;
+   }
};
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@ -11932,6 +12162,10 @@ static struct ggml_cgraph * llama_build_graph(
            {
                result = llm.build_deepseek2();
            } break;
+       case LLM_ARCH_BITNET:
+           {
+               result = llm.build_bitnet();
+           } break;
        default:
            GGML_ASSERT(false);
    }
@ -12551,12 +12785,6 @@ static int llama_decode_internal(
            }
        }

-#ifdef GGML_PERF
-   // print timing information per ggml operation (for debugging purposes)
-   // requires GGML_PERF to be defined
-   ggml_graph_print(gf);
-#endif

    // plot the computation graph in dot format (for debugging purposes)
    //if (n_past%100 == 0) {
    //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
@ -16760,6 +16988,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
        case LLM_ARCH_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_STABLELM:
+       case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_QWEN2MOE:
2 llama.h
@ -786,7 +786,7 @@ extern "C" {
    // Get the number of threads used for prompt and batch processing (multiple token).
    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);

-   // Set whether the model is in embeddings model or not
+   // Set whether the model is in embeddings mode or not
    // If true, embeddings will be returned but logits will not
    LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
37 sgemm.cpp
@ -249,8 +249,7 @@ class tinyBLAS {
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

-   void matmul(int64_t m, int64_t n, int task) {
-       if (task == GGML_TASK_TYPE_COMPUTE)
+   void matmul(int64_t m, int64_t n) {
        mnpack(0, m, 0, n);
    }
@ -458,8 +457,7 @@ class tinyBLAS_Q0_ARM {
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

-   void matmul(int64_t m, int64_t n, int task) {
-       if (task == GGML_TASK_TYPE_COMPUTE)
+   void matmul(int64_t m, int64_t n) {
        mnpack(0, m, 0, n);
    }
@ -596,8 +594,7 @@ class tinyBLAS_Q0_AVX {
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

-   void matmul(int64_t m, int64_t n, int task) {
-       if (task == GGML_TASK_TYPE_COMPUTE)
+   void matmul(int64_t m, int64_t n) {
        mnpack(0, m, 0, n);
    }
@ -829,7 +826,7 @@ class tinyBLAS_Q0_AVX {
 * For example, for single-threaded single-precision GEMM you can say
 *
 *     llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
-*                     0, 1, GGML_TASK_TYPE_COMPUTE,
+*                     0, 1,
 *                     GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
 *
 * @param m is rows in `A` and `C`
@ -843,14 +840,13 @@ class tinyBLAS_Q0_AVX {
 * @param ldc is row stride of `C`
 * @param ith is thread id (must be less than `nth`)
 * @param nth is number of threads (must be greater than zero)
-* @param task is GGML task type
 * @param Atype is GGML data type of `A`
 * @param Btype is GGML data type of `B`
 * @param Ctype is GGML data type of `C`
 * @return true if this function was able to service the matmul request
 */
bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
-                    int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) {
+                    int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {

    assert(m >= 0);
    assert(n >= 0);
@ -877,7 +873,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
-       tb.matmul(m, n, task);
+       tb.matmul(m, n);
@ -887,7 +883,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
-       tb.matmul(m, n, task);
+       tb.matmul(m, n);
@ -899,7 +895,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
-       tb.matmul(m, n, task);
+       tb.matmul(m, n);
@ -917,7 +913,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
-       tb.matmul(m, n, task);
+       tb.matmul(m, n);
@ -929,7 +925,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
-       tb.matmul(m, n, task);
+       tb.matmul(m, n);
@ -943,7 +939,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
-       tb.matmul(m, n, task);
+       tb.matmul(m, n);
@ -955,7 +951,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
-       tb.matmul(m, n, task);
+       tb.matmul(m, n);
@ -971,7 +967,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
-       tb.matmul(m, n, task);
+       tb.matmul(m, n);
@ -979,7 +975,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
-       tb.matmul(m, n, task);
+       tb.matmul(m, n);
@ -995,7 +991,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
-       tb.matmul(m, n, task);
+       tb.matmul(m, n);
@ -1003,7 +999,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
-       tb.matmul(m, n, task);
+       tb.matmul(m, n);
@ -1025,7 +1021,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
    (void)ldc;
    (void)ith;
    (void)nth;
-   (void)task;
    (void)Atype;
    (void)Btype;
    (void)Ctype;

2 sgemm.h
@ -7,7 +7,7 @@ extern "C" {

bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
                     const void *, int64_t, void *, int64_t, int, int,
-                    int, int, int, int);
+                    int, int, int);

#ifdef __cplusplus
}
@ -785,6 +785,10 @@ struct test_cpy : public test_case {
        return VARS_TO_STR3(type_src, type_dst, ne);
    }

+   double max_nmse_err() override {
+       return 1e-6;
+   }
+
    size_t op_size(ggml_tensor * t) override {
        return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
    }
@ -13,7 +13,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32;
shared FLOAT_TYPE tmp[BLOCK_SIZE];

void main() {
-   const uint row = gl_WorkGroupID.x;
+   const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
    const uint tid = gl_LocalInvocationID.x;

    uint a_offset, b_offset, d_offset;

@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE tmp[32];

void main() {
-   const uint row = gl_WorkGroupID.x;
+   const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

    uint a_offset, b_offset, d_offset;
    get_offsets(a_offset, b_offset, d_offset);

@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
-   const uint row = gl_WorkGroupID.x;
+   const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
-   const uint row = gl_WorkGroupID.x;
+   const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
-   const uint row = gl_WorkGroupID.x;
+   const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
-   const uint row = gl_WorkGroupID.x;
+   const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
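The shader change above derives the row index from a two-dimensional dispatch instead of the x dimension alone; the same arithmetic in plain Python, purely as an illustration:

def row_index(workgroup_x: int, workgroup_z: int, num_workgroups_x: int) -> int:
    return workgroup_x + num_workgroups_x * workgroup_z

print(row_index(5, 0, 1024))  # 5
print(row_index(5, 2, 1024))  # 2053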