feat: first things to do
parent 400d5d722d
commit 86a5d96fc6
6 changed files with 1879 additions and 1012 deletions
@@ -59,9 +59,9 @@ option(LLAMA_GPROF "llama: enable gprof"
 option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF)
 
 # sanitizers
-option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
-option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
-option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
+option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" ON)
+option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" ON)
+option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" ON)
 
 # instruction set specific
 if (LLAMA_NATIVE)
@@ -126,7 +126,7 @@ option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM"
 set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeline parallelism")
 
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES "llama: build examples" ON)
 option(LLAMA_BUILD_SERVER "llama: build server example" ON)
 
 # add perf arguments
@@ -53,7 +53,8 @@ class Model(ABC):
         self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
         self.part_names = self._get_part_names()
         self.hparams = Model.load_hparams(self.dir_model)
-        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
+        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess,
+                                           use_temp_file=False)
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
 
     @property
@@ -80,7 +81,8 @@ class Model(ABC):
                 from safetensors import safe_open
                 ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
             else:
-                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
+                ctx = contextlib.nullcontext(
+                    torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
 
             with ctx as model_part:
                 for name in model_part.keys():
@@ -117,7 +119,8 @@ class Model(ABC):
         if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
             self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
             print(f"gguf: rms norm epsilon = {f_rms_eps}")
-        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
+        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"],
+                                           optional=True)) is not None:
             self.gguf_writer.add_layer_norm_eps(f_norm_eps)
             print(f"gguf: layer norm epsilon = {f_norm_eps}")
         if (n_experts := self.hparams.get("num_local_experts")) is not None:
@@ -205,6 +208,7 @@ class Model(ABC):
             for name in names:
                 cls._model_classes[name] = modelcls
             return modelcls
+
         return func
 
     @classmethod
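The hunk above sits inside Model.register, the decorator factory that fills Model._model_classes so that from_model_architecture (used later in main(), see the converter hunks below) can map an architecture string from config.json to a converter class. A minimal, self-contained sketch of that pattern, with simplified names rather than the converter's exact code:

class Model:
    _model_classes: dict[str, type] = {}

    @classmethod
    def register(cls, *names):
        assert names

        def func(modelcls):
            for name in names:
                cls._model_classes[name] = modelcls
            return modelcls
        return func

    @classmethod
    def from_model_architecture(cls, arch):
        try:
            return cls._model_classes[arch]
        except KeyError:
            raise NotImplementedError(f"Architecture {arch!r} not supported") from None


# Usage mirrors the registration added further down in this diff:
@Model.register("JinaBertModel")
class JinaBertModel(Model):
    pass


print(Model.from_model_architecture("JinaBertModel"))  # <class '__main__.JinaBertModel'>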
@@ -286,7 +290,7 @@ class Model(ABC):
 
         # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
         added_vocab = tokenizer.special_tokens
-        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
 
         for i in range(vocab_size):
             if i not in reverse_vocab:
@@ -771,8 +775,8 @@ class BaichuanModel(Model):
 
         return (
             weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
             .swapaxes(1, 2)
             .reshape(weights.shape)
         )
 
     def _reverse_hf_permute_part(
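The reshape/swapaxes/reshape chain above (it reappears unchanged for XverseModel and MiniCPMModel below) regroups the rows of a Q/K projection per attention head, reversing the row permutation applied on the Hugging Face side for rotary embeddings, as the helper's name _reverse_hf_permute suggests. A small NumPy sketch with made-up dimensions, purely to illustrate the shape bookkeeping:

import numpy as np

# Made-up sizes; real checkpoints have much larger matrices.
n_head, head_dim, n_embd = 2, 4, 8
weights = np.arange(n_head * head_dim * n_embd, dtype=np.float32).reshape(n_head * head_dim, n_embd)

# Same expression as _reverse_hf_permute: split the rows into (head, half, rest),
# swap the two inner axes, then flatten back to the original 2-D shape.
reverted = (
    weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
    .swapaxes(1, 2)
    .reshape(weights.shape)
)
print(weights.shape, reverted.shape)  # (8, 8) (8, 8): same shape, rows reordered within each head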
@@ -923,8 +927,8 @@ class XverseModel(Model):
 
         return (
             weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
             .swapaxes(1, 2)
             .reshape(weights.shape)
         )
 
 
@@ -1201,9 +1205,11 @@ class StableLMModel(Model):
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
         rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
-        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
+        self.gguf_writer.add_rope_dimension_count(
+            int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
         self.gguf_writer.add_head_count(hparams["num_attention_heads"])
-        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
+        self.gguf_writer.add_parallel_residual(
+            hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
         self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
 
 
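The reflowed add_rope_dimension_count call above encodes how many dimensions of each attention head receive rotary embeddings: the partial rotary factor times the per-head size. A tiny worked example with invented hyperparameters (not taken from any particular StableLM config):

# Invented values, only to illustrate the arithmetic in the call above.
rotary_factor = 0.25                           # partial_rotary_factor / rope_pct
hidden_size = 2048
num_attention_heads = 32

head_dim = hidden_size // num_attention_heads  # 64
rope_dims = int(rotary_factor * head_dim)      # 16 rotary dimensions per head
print(head_dim, rope_dims)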
@@ -1213,7 +1219,7 @@ class LlamaModel(Model):
 
     def set_vocab(self):
         try:
-            self. _set_vocab_sentencepiece()
+            self._set_vocab_sentencepiece()
         except FileNotFoundError:
             self._set_vocab_llama_hf()
 
@@ -1450,8 +1456,8 @@ class MiniCPMModel(Model):
 
         return (
             weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
             .swapaxes(1, 2)
             .reshape(weights.shape)
         )
 
     def write_tensors(self):
@@ -1612,7 +1618,8 @@ class GPT2Model(Model):
 
         for name, data_torch in self.get_tensors():
             # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")):
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq",
+                              ".attn.bias", ".attn.masked_bias")):
                 continue
 
             if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):
@@ -1995,7 +2002,8 @@ in chat mode so that the conversation can end normally.")
                 bid = re.findall(qkv_pattern, name)[0]
                 qkv = data_torch
                 qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
-                q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
+                q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[...,
+                                                                                        q_per_kv + 1: q_per_kv + 2, :]
                 # The model weights of q and k equire additional reshape.
                 q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
                 k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
@@ -2061,6 +2069,7 @@ class BertModel(Model):
             if tok.startswith(b"##"):
                 return tok[2:]
             return b"\xe2\x96\x81" + tok
+
         tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
 
         # set up bos and eos tokens (cls and sep)
@@ -2153,6 +2162,38 @@ class NomicBertModel(BertModel):
             yield name, data
 
 
+@Model.register("JinaBertModel")
+class JinaBertModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.JINA_BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        print(f'hparams {self.hparams}')
+
+        assert self.hparams["position_embedding_type"] == "alibi"
+
+    # def __init__(self, *args, **kwargs):
+    #     super().__init__(*args, **kwargs)
+    #
+    #     assert self.hparams["position_embedding_type"] == "alibi"
+    #
+    #     # GeGLU activation
+    #     assert self.hparams["feed_forward_type"] == "geglu"
+    #
+    # def get_tensors(self):
+    #     assert self.vocab_size is not None
+    #     for name, data in super().get_tensors():
+    #         print(f'get_tensors: {name} {data.shape}')
+    #         # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
+    #         if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
+    #             rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
+    #             assert data.shape == (rounded_vocab_size, self.hparams["hidden_size"])
+    #             data = data[:self.vocab_size, :]
+    #         yield name, data
+
+
 @Model.register("GemmaForCausalLM")
 class GemmaModel(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA
@@ -2170,7 +2211,8 @@ class GemmaModel(Model):
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
         self.gguf_writer.add_head_count(hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(
+            self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_key_length(hparams["head_dim"])
         self.gguf_writer.add_value_length(hparams["head_dim"])
@@ -2255,7 +2297,7 @@ class MambaModel(Model):
 
     def set_gguf_parameters(self):
         d_model = self.find_hparam(["hidden_size", "d_model"])
         d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
         d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
         d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
         # ceiling division
@@ -2268,10 +2310,10 @@ class MambaModel(Model):
         assert d_inner == 2 * d_model
 
         self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
+        self.gguf_writer.add_context_length(2 ** 20) # arbitrary value; for those who use the default
         self.gguf_writer.add_embedding_length(d_model)
         self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
         self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
         self.gguf_writer.add_block_count(self.hparams["n_layer"])
         self.gguf_writer.add_ssm_conv_kernel(d_conv)
         self.gguf_writer.add_ssm_inner_size(d_inner)
@@ -2286,7 +2328,7 @@ class MambaModel(Model):
 
         tok_embd = None
         tok_embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD] + ".weight"
         output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight"
 
         for name, data_torch in self.get_tensors():
             old_dtype = data_torch.dtype
|
||||||
data = data.astype(np.float32)
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
# if f16 desired, convert big float32 2-dim weight tensors to float16
|
# if f16 desired, convert big float32 2-dim weight tensors to float16
|
||||||
if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
|
if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith(
|
||||||
|
(".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
|
||||||
data = data.astype(np.float16)
|
data = data.astype(np.float16)
|
||||||
|
|
||||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||||
|
@@ -2420,6 +2463,7 @@ def main() -> None:
     hparams = Model.load_hparams(dir_model)
 
     with torch.inference_mode():
+        print(hparams["architectures"])
         model_class = Model.from_model_architecture(hparams["architectures"][0])
         model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
 
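The added print(hparams["architectures"]) surfaces the architecture list read from the model's config.json; its first entry is what from_model_architecture uses to pick the converter class on the next line. A hedged sketch of what that value would look like for a Jina checkpoint, assuming the config declares the same architecture name registered earlier in this diff:

import json

# Assumed, minimal config.json content; only the "architectures" key matters here,
# and "JinaBertModel" is taken from the @Model.register(...) call added above.
config_text = '{"architectures": ["JinaBertModel"], "position_embedding_type": "alibi"}'
hparams = json.loads(config_text)

print(hparams["architectures"])     # ['JinaBertModel'] -- what the new print would show
arch = hparams["architectures"][0]  # the string handed to Model.from_model_architecture()
print(arch)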
@@ -111,6 +111,7 @@ class MODEL_ARCH(IntEnum):
     REFACT = auto()
     BERT = auto()
     NOMIC_BERT = auto()
+    JINA_BERT = auto()
     BLOOM = auto()
     STABLELM = auto()
     QWEN = auto()
@@ -180,6 +181,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.REFACT: "refact",
     MODEL_ARCH.BERT: "bert",
     MODEL_ARCH.NOMIC_BERT: "nomic-bert",
+    MODEL_ARCH.JINA_BERT: "jina-bert",
     MODEL_ARCH.BLOOM: "bloom",
     MODEL_ARCH.STABLELM: "stablelm",
     MODEL_ARCH.QWEN: "qwen",
@@ -357,6 +359,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.LAYER_OUT_NORM,
     ],
+    MODEL_ARCH.JINA_BERT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.TOKEN_TYPES,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.LAYER_OUT_NORM,
+    ],
     MODEL_ARCH.MPT: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
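With the gguf-py additions above (the enum member, its name string, and its tensor list), the new architecture becomes reachable through the package constants. A quick sanity-check sketch, assuming a gguf-py checkout that includes this patch and that MODEL_TENSORS is re-exported at package level like the other constants used elsewhere in this diff:

import gguf

arch = gguf.MODEL_ARCH.JINA_BERT
print(gguf.MODEL_ARCH_NAMES[arch])                                  # "jina-bert"
print(gguf.MODEL_TENSOR.ATTN_OUT_NORM in gguf.MODEL_TENSORS[arch])  # True, per the list added above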
@@ -217,6 +217,9 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.up_proj", # llama-hf refact
             "layers.{bid}.feed_forward.w3", # llama-pth
             "encoder.layer.{bid}.intermediate.dense", # bert
+            "encoder.layer.{bid}.mlp.gated_layers", # jina-bert
+            "encoder.layer.{bid}.mlp.layernorm", # jina-bert
+            "encoder.layer.{bid}.mlp.wo", # jina-bert
             "transformer.h.{bid}.mlp.fc_in", # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
             "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon
llama.cpp
@@ -205,6 +205,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_JINA_BERT,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
@@ -237,6 +238,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT, "refact" },
     { LLM_ARCH_BERT, "bert" },
     { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+    { LLM_ARCH_JINA_BERT, "jina-bert" },
     { LLM_ARCH_BLOOM, "bloom" },
     { LLM_ARCH_STABLELM, "stablelm" },
     { LLM_ARCH_QWEN, "qwen" },
@@ -665,6 +667,22 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
         { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_JINA_BERT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_BLOOM,
         {
@@ -3770,6 +3788,18 @@ static void llm_load_hparams(
                         model.type = e_model::MODEL_335M; break; // bge-large
                 }
             } break;
+        case LLM_ARCH_JINA_BERT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+                switch (hparams.n_layer) {
+                    case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
+                    case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
+                }
+            } break;
         case LLM_ARCH_NOMIC_BERT:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4488,6 +4518,7 @@ static bool llm_load_tensors(
     model.layers.resize(n_layer);
 
     const auto tn = LLM_TN(model.arch);
+    //std::printf("JOAN HERE ARCH %i", model.arch);
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
@@ -4782,6 +4813,7 @@ static bool llm_load_tensors(
                     }
                 } break;
             case LLM_ARCH_BERT:
+            case LLM_ARCH_JINA_BERT:
             case LLM_ARCH_NOMIC_BERT:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -4799,7 +4831,7 @@ static bool llm_load_tensors(
 
                         auto & layer = model.layers[i];
 
-                        if (model.arch == LLM_ARCH_BERT) {
+                        if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT) {
                             layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                             layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
 
@@ -4820,7 +4852,7 @@ static bool llm_load_tensors(
                         layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
 
-                        if (model.arch == LLM_ARCH_BERT) {
+                        if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT) {
                             layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
                             layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
 
@@ -14558,6 +14590,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
         case LLM_ARCH_MAMBA:
+        case LLM_ARCH_JINA_BERT:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values