xcomposer2 support
- loading of all lora tensors in correct shape [done] - make llm inference possible [done] - make clip conversion possible [done] - dynamic shape for lora tensors [] - add image token mask [] - add conditional "lora" on tensors during inference
This commit is contained in:
parent
01684139c3
commit
5188ea09d6
5 changed files with 512 additions and 37 deletions
|
@ -203,6 +203,8 @@ class Model:
|
|||
return CodeShellModel
|
||||
if model_architecture == "OrionForCausalLM":
|
||||
return OrionModel
|
||||
if model_architecture == "InternLM2ForCausalLM":
|
||||
return InternLM2Model
|
||||
return Model
|
||||
|
||||
def _is_model_safetensors(self) -> bool:
|
||||
|
@ -254,6 +256,8 @@ class Model:
|
|||
return gguf.MODEL_ARCH.CODESHELL
|
||||
if arch == "OrionForCausalLM":
|
||||
return gguf.MODEL_ARCH.ORION
|
||||
if arch == "InternLM2ForCausalLM":
|
||||
return gguf.MODEL_ARCH.INTERNLM2
|
||||
|
||||
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
||||
|
||||
|
@ -1344,6 +1348,147 @@ class CodeShellModel(Model):
|
|||
self.gguf_writer.add_tensor("output.weight", data)
|
||||
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
class InternLM2Model(Model):
|
||||
def set_vocab(self):
|
||||
# (TODO): Is there a better way?
|
||||
# Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character
|
||||
# \x00 specially and convert it into an emoji character to prevent it from being mistakenly
|
||||
# recognized as an empty string in C++.
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
tokenizer_path = self.dir_model / 'tokenizer.model'
|
||||
|
||||
tokens: list[bytes] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
if not tokenizer_path.is_file():
|
||||
print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
tokenizer = SentencePieceProcessor(str(tokenizer_path))
|
||||
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
||||
|
||||
for token_id in range(vocab_size):
|
||||
piece = tokenizer.id_to_piece(token_id)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.get_score(token_id)
|
||||
if text == b"\x00":
|
||||
# (TODO): fixme
|
||||
# Hack here and replace the \x00 characters.
|
||||
print(f"InternLM2 convert token '{text}' to '🐉'!")
|
||||
text = "🐉"
|
||||
|
||||
toktype = SentencePieceTokenTypes.NORMAL
|
||||
if tokenizer.is_unknown(token_id):
|
||||
toktype = SentencePieceTokenTypes.UNKNOWN
|
||||
elif tokenizer.is_control(token_id):
|
||||
toktype = SentencePieceTokenTypes.CONTROL
|
||||
elif tokenizer.is_unused(token_id):
|
||||
toktype = SentencePieceTokenTypes.UNUSED
|
||||
elif tokenizer.is_byte(token_id):
|
||||
toktype = SentencePieceTokenTypes.BYTE
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
added_tokens_file = self.dir_model / 'added_tokens.json'
|
||||
if added_tokens_file.is_file():
|
||||
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
||||
added_tokens_json = json.load(f)
|
||||
|
||||
for key in added_tokens_json:
|
||||
tokens.append(key.encode("utf-8"))
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name("InternLM2")
|
||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
||||
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
|
||||
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
||||
|
||||
def post_write_tensors(self, tensor_map, name, data_torch):
|
||||
old_dtype = data_torch.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||
data_torch = data_torch.to(torch.float32)
|
||||
|
||||
data = data_torch.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if self.ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
def write_tensors(self):
|
||||
from einops import rearrange
|
||||
|
||||
num_heads = self.hparams.get("num_attention_heads")
|
||||
num_kv_heads = self.hparams.get("num_key_value_heads")
|
||||
hidden_size = self.hparams.get("hidden_size")
|
||||
q_per_kv = num_heads // num_kv_heads
|
||||
head_dim = hidden_size // num_heads
|
||||
num_groups = num_heads // q_per_kv
|
||||
|
||||
block_count = self.hparams["num_hidden_layers"]
|
||||
model_kv = dict(self.get_tensors())
|
||||
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||
qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
|
||||
for name, data_torch in model_kv.items():
|
||||
# we don't need these
|
||||
if name.endswith(".rotary_emb.inv_freq"):
|
||||
continue
|
||||
plora_tensor = True if "Plora" in name else False
|
||||
if re.match(qkv_pattern, name) and not plora_tensor:
|
||||
bid = re.findall(qkv_pattern, name)[0]
|
||||
qkv = data_torch
|
||||
qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv+2, i=head_dim)
|
||||
q, k, v = qkv[...,:q_per_kv,:], qkv[...,q_per_kv:q_per_kv+1,:], qkv[...,q_per_kv+1:q_per_kv+2,:]
|
||||
q = rearrange(q, " o g n i -> o (g n i)").T
|
||||
k = rearrange(k, " o g n i -> o (g n i)").T
|
||||
v = rearrange(v, " o g n i -> o (g n i)").T
|
||||
self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q)
|
||||
self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k)
|
||||
self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v)
|
||||
else:
|
||||
self.post_write_tensors(tensor_map, name, data_torch)
|
||||
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
|
|
@ -41,6 +41,9 @@ def get_tensor_name(name: str) -> str:
|
|||
|
||||
if "mm_projector" in name:
|
||||
return name.replace("model.mm_projector", "mm")
|
||||
|
||||
if "vision_proj" in name:
|
||||
return name.replace("vision_proj", "mm")
|
||||
|
||||
return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
|
||||
|
||||
|
@ -80,12 +83,13 @@ ap.add_argument("--vision-only", action="store_true", required=False,
|
|||
help="Save a vision-only model. It can't be used to encode texts")
|
||||
ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
|
||||
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
|
||||
ap.add_argument("--clip_model_is_openclip", action="store_true", required=False,
|
||||
help="The clip model is from openclip (for ViT-SO400M type))")
|
||||
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
|
||||
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
|
||||
ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
|
||||
ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
|
||||
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
|
||||
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
|
||||
# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
|
||||
default_image_mean = [0.48145466, 0.4578275, 0.40821073]
|
||||
default_image_std = [0.26862954, 0.26130258, 0.27577711]
|
||||
ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
|
||||
|
@ -105,7 +109,7 @@ if args.use_f32:
|
|||
# output in the same directory as the model if output_dir is None
|
||||
dir_model = args.model_dir
|
||||
|
||||
if args.clip_model_is_vision:
|
||||
if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
|
||||
vocab = None
|
||||
tokens = None
|
||||
else:
|
||||
|
@ -133,7 +137,7 @@ ftype = 1
|
|||
if args.use_f32:
|
||||
ftype = 0
|
||||
|
||||
if args.clip_model_is_vision:
|
||||
if args.clip_model_is_vision or args.clip_model_is_openclip:
|
||||
model = CLIPVisionModel.from_pretrained(dir_model)
|
||||
processor = None
|
||||
else:
|
||||
|
|
|
@ -102,6 +102,7 @@ class MODEL_ARCH(IntEnum):
|
|||
PLAMO = auto()
|
||||
CODESHELL = auto()
|
||||
ORION = auto()
|
||||
INTERNLM2 = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
|
@ -132,6 +133,21 @@ class MODEL_TENSOR(IntEnum):
|
|||
ATTN_Q_NORM = auto()
|
||||
ATTN_K_NORM = auto()
|
||||
|
||||
ATTN_QKV_LORA_A = auto()
|
||||
ATTN_QKV_LORA_B = auto()
|
||||
ATTN_OUT_LORA_A = auto()
|
||||
ATTN_OUT_LORA_B = auto()
|
||||
FFN_UP_LORA_A = auto()
|
||||
FFN_UP_LORA_B = auto()
|
||||
FFN_GATE_LORA_A = auto()
|
||||
FFN_GATE_LORA_B = auto()
|
||||
FFN_DOWN_LORA_A = auto()
|
||||
FFN_DOWN_LORA_B = auto()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.LLAMA: "llama",
|
||||
|
@ -153,6 +169,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|||
MODEL_ARCH.PLAMO: "plamo",
|
||||
MODEL_ARCH.CODESHELL: "codeshell",
|
||||
MODEL_ARCH.ORION: "orion",
|
||||
MODEL_ARCH.INTERNLM2: "internlm2",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
|
@ -182,6 +199,18 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}",
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
|
||||
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
|
||||
|
||||
MODEL_TENSOR.ATTN_QKV_LORA_A : "blk.{bid}.attn_qkv_lora_a",
|
||||
MODEL_TENSOR.ATTN_QKV_LORA_B : "blk.{bid}.attn_qkv_lora_b",
|
||||
MODEL_TENSOR.ATTN_OUT_LORA_A : "blk.{bid}.attn_out_lora_a",
|
||||
MODEL_TENSOR.ATTN_OUT_LORA_B : "blk.{bid}.attn_out_lora_b",
|
||||
MODEL_TENSOR.FFN_UP_LORA_A : "blk.{bid}.ffn_up_lora_a",
|
||||
MODEL_TENSOR.FFN_UP_LORA_B : "blk.{bid}.ffn_up_lora_b",
|
||||
MODEL_TENSOR.FFN_GATE_LORA_A : "blk.{bid}.ffn_gate_lora_a",
|
||||
MODEL_TENSOR.FFN_GATE_LORA_B : "blk.{bid}.ffn_gate_lora_b",
|
||||
MODEL_TENSOR.FFN_DOWN_LORA_A : "blk.{bid}.ffn_down_lora_a",
|
||||
MODEL_TENSOR.FFN_DOWN_LORA_B : "blk.{bid}.ffn_down_lora_b",
|
||||
|
||||
}
|
||||
|
||||
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
|
@ -446,6 +475,32 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.INTERNLM2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
|
||||
MODEL_TENSOR.ATTN_QKV_LORA_A,
|
||||
MODEL_TENSOR.ATTN_QKV_LORA_B,
|
||||
MODEL_TENSOR.ATTN_OUT_LORA_A,
|
||||
MODEL_TENSOR.ATTN_OUT_LORA_B,
|
||||
MODEL_TENSOR.FFN_UP_LORA_A,
|
||||
MODEL_TENSOR.FFN_UP_LORA_B,
|
||||
MODEL_TENSOR.FFN_GATE_LORA_A,
|
||||
MODEL_TENSOR.FFN_GATE_LORA_B,
|
||||
MODEL_TENSOR.FFN_DOWN_LORA_A,
|
||||
MODEL_TENSOR.FFN_DOWN_LORA_B,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@ class TensorNameMap:
|
|||
"language_model.embedding.word_embeddings", # persimmon
|
||||
"wte", # gpt2
|
||||
"transformer.embd.wte", # phi2
|
||||
"model.tok_embeddings", # internlm2
|
||||
),
|
||||
|
||||
# Token type embeddings
|
||||
|
@ -42,7 +43,7 @@ class TensorNameMap:
|
|||
MODEL_TENSOR.OUTPUT: (
|
||||
"embed_out", # gptneox
|
||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen
|
||||
"output", # llama-pth bloom
|
||||
"output", # llama-pth bloom internlm2
|
||||
"word_embeddings_for_head", # persimmon
|
||||
"lm_head.linear", # phi2
|
||||
),
|
||||
|
@ -51,7 +52,7 @@ class TensorNameMap:
|
|||
MODEL_TENSOR.OUTPUT_NORM: (
|
||||
"gpt_neox.final_layer_norm", # gptneox
|
||||
"transformer.ln_f", # gpt2 gpt-j falcon
|
||||
"model.norm", # llama-hf baichuan
|
||||
"model.norm", # llama-hf baichuan internlm2
|
||||
"norm", # llama-pth
|
||||
"embeddings.LayerNorm", # bert
|
||||
"transformer.norm_f", # mpt
|
||||
|
@ -84,6 +85,7 @@ class TensorNameMap:
|
|||
"h.{bid}.ln_1", # gpt2
|
||||
"transformer.h.{bid}.ln", # phi2
|
||||
"model.layers.layers.{bid}.norm", # plamo
|
||||
"model.layers.{bid}.attention_norm", # internlm2
|
||||
),
|
||||
|
||||
# Attention norm 2
|
||||
|
@ -111,6 +113,7 @@ class TensorNameMap:
|
|||
"encoder.layer.{bid}.attention.self.query", # bert
|
||||
"transformer.h.{bid}.attn.q_proj", # gpt-j
|
||||
"model.layers.layers.{bid}.self_attn.q_proj", # plamo
|
||||
"model.layers.{bid}.attention.wq" # internlm2
|
||||
),
|
||||
|
||||
# Attention key
|
||||
|
@ -120,6 +123,7 @@ class TensorNameMap:
|
|||
"encoder.layer.{bid}.attention.self.key", # bert
|
||||
"transformer.h.{bid}.attn.k_proj", # gpt-j
|
||||
"model.layers.layers.{bid}.self_attn.k_proj", # plamo
|
||||
"model.layers.{bid}.attention.wk" # internlm2
|
||||
),
|
||||
|
||||
# Attention value
|
||||
|
@ -129,6 +133,13 @@ class TensorNameMap:
|
|||
"encoder.layer.{bid}.attention.self.value", # bert
|
||||
"transformer.h.{bid}.attn.v_proj", # gpt-j
|
||||
"model.layers.layers.{bid}.self_attn.v_proj", # plamo
|
||||
"model.layers.{bid}.attention.wv" # internlm2
|
||||
),
|
||||
MODEL_TENSOR.ATTN_QKV_LORA_A: (
|
||||
"model.layers.{bid}.attention.wqkv.Plora_A", # internlm2
|
||||
),
|
||||
MODEL_TENSOR.ATTN_QKV_LORA_B: (
|
||||
"model.layers.{bid}.attention.wqkv.Plora_B", # internlm2
|
||||
),
|
||||
|
||||
# Attention output
|
||||
|
@ -147,6 +158,13 @@ class TensorNameMap:
|
|||
"h.{bid}.attn.c_proj", # gpt2
|
||||
"transformer.h.{bid}.mixer.out_proj", # phi2
|
||||
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
|
||||
"model.layers.{bid}.attention.wo", # internlm2
|
||||
),
|
||||
MODEL_TENSOR.ATTN_OUT_LORA_A: (
|
||||
"model.layers.{bid}.attention.wo.Plora_A", # internlm2
|
||||
),
|
||||
MODEL_TENSOR.ATTN_OUT_LORA_B: (
|
||||
"model.layers.{bid}.attention.wo.Plora_B", # internlm2
|
||||
),
|
||||
|
||||
# Rotary embeddings
|
||||
|
@ -169,6 +187,7 @@ class TensorNameMap:
|
|||
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
|
||||
"model.layers.{bid}.ln2", # yi
|
||||
"h.{bid}.ln_2", # gpt2
|
||||
"model.layers.{bid}.ffn_norm", # internlm2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP: (
|
||||
|
@ -194,6 +213,13 @@ class TensorNameMap:
|
|||
"transformer.h.{bid}.mlp.fc1", # phi2
|
||||
"model.layers.{bid}.mlp.fc1", # phi2
|
||||
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
||||
"model.layers.{bid}.feed_forward.w3", # internlm2
|
||||
),
|
||||
MODEL_TENSOR.FFN_UP_LORA_A: (
|
||||
"model.layers.{bid}.feed_forward.w3.Plora_A", # internlm2
|
||||
),
|
||||
MODEL_TENSOR.FFN_UP_LORA_B: (
|
||||
"model.layers.{bid}.feed_forward.w3.Plora_B", # internlm2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_EXP: (
|
||||
|
@ -212,6 +238,13 @@ class TensorNameMap:
|
|||
"layers.{bid}.feed_forward.w1", # llama-pth
|
||||
"transformer.h.{bid}.mlp.w2", # qwen
|
||||
"model.layers.layers.{bid}.mlp.gate_proj", # plamo
|
||||
"model.layers.{bid}.feed_forward.w1", # internlm2
|
||||
),
|
||||
MODEL_TENSOR.FFN_GATE_LORA_A: (
|
||||
"model.layers.{bid}.feed_forward.w1.Plora_A", # internlm2
|
||||
),
|
||||
MODEL_TENSOR.FFN_GATE_LORA_B: (
|
||||
"model.layers.{bid}.feed_forward.w1.Plora_B", # internlm2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_EXP: (
|
||||
|
@ -236,6 +269,13 @@ class TensorNameMap:
|
|||
"transformer.h.{bid}.mlp.fc2", # phi2
|
||||
"model.layers.{bid}.mlp.fc2", # phi2
|
||||
"model.layers.layers.{bid}.mlp.down_proj", # plamo
|
||||
"model.layers.{bid}.feed_forward.w2", # internlm2
|
||||
),
|
||||
MODEL_TENSOR.FFN_DOWN_LORA_A: (
|
||||
"model.layers.{bid}.feed_forward.w2.Plora_A", # internlm2
|
||||
),
|
||||
MODEL_TENSOR.FFN_DOWN_LORA_B: (
|
||||
"model.layers.{bid}.feed_forward.w2.Plora_B", # internlm2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||
|
|
293
llama.cpp
293
llama.cpp
|
@ -204,6 +204,7 @@ enum llm_arch {
|
|||
LLM_ARCH_PLAMO,
|
||||
LLM_ARCH_CODESHELL,
|
||||
LLM_ARCH_ORION,
|
||||
LLM_ARCH_INTERNLM2,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
|
@ -226,6 +227,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
|||
{ LLM_ARCH_PLAMO, "plamo" },
|
||||
{ LLM_ARCH_CODESHELL, "codeshell" },
|
||||
{ LLM_ARCH_ORION, "orion" },
|
||||
{ LLM_ARCH_INTERNLM2, "internlm2" },
|
||||
};
|
||||
|
||||
enum llm_kv {
|
||||
|
@ -372,6 +374,18 @@ enum llm_tensor {
|
|||
LLM_TENSOR_FFN_UP_EXP,
|
||||
LLM_TENSOR_ATTN_Q_NORM,
|
||||
LLM_TENSOR_ATTN_K_NORM,
|
||||
|
||||
// dynamic/partial LORA tensors
|
||||
LLM_TENSOR_ATTN_QKV_LORA_A,
|
||||
LLM_TENSOR_ATTN_QKV_LORA_B,
|
||||
LLM_TENSOR_ATTN_OUT_LORA_A,
|
||||
LLM_TENSOR_ATTN_OUT_LORA_B,
|
||||
LLM_TENSOR_FFN_UP_LORA_A,
|
||||
LLM_TENSOR_FFN_UP_LORA_B,
|
||||
LLM_TENSOR_FFN_GATE_LORA_A,
|
||||
LLM_TENSOR_FFN_GATE_LORA_B,
|
||||
LLM_TENSOR_FFN_DOWN_LORA_A,
|
||||
LLM_TENSOR_FFN_DOWN_LORA_B,
|
||||
};
|
||||
|
||||
static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
||||
|
@ -664,12 +678,37 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_INTERNLM2,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
// dynamic/partial LORA tensors
|
||||
{ LLM_TENSOR_ATTN_QKV_LORA_A, "blk.%d.attn_qkv_lora_a" },
|
||||
{ LLM_TENSOR_ATTN_QKV_LORA_B, "blk.%d.attn_qkv_lora_b" },
|
||||
{ LLM_TENSOR_ATTN_OUT_LORA_A, "blk.%d.attn_out_lora_a" },
|
||||
{ LLM_TENSOR_ATTN_OUT_LORA_B, "blk.%d.attn_out_lora_b" },
|
||||
{ LLM_TENSOR_FFN_UP_LORA_A, "blk.%d.ffn_up_lora_a" },
|
||||
{ LLM_TENSOR_FFN_UP_LORA_B, "blk.%d.ffn_up_lora_b" },
|
||||
{ LLM_TENSOR_FFN_GATE_LORA_A, "blk.%d.ffn_gate_lora_a" },
|
||||
{ LLM_TENSOR_FFN_GATE_LORA_B, "blk.%d.ffn_gate_lora_b" },
|
||||
{ LLM_TENSOR_FFN_DOWN_LORA_A, "blk.%d.ffn_down_lora_a" },
|
||||
{ LLM_TENSOR_FFN_DOWN_LORA_B, "blk.%d.ffn_down_lora_b" },
|
||||
},
|
||||
},
|
||||
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
|
@ -1377,6 +1416,7 @@ enum e_model {
|
|||
MODEL_13B,
|
||||
MODEL_14B,
|
||||
MODEL_15B,
|
||||
MODEL_20B,
|
||||
MODEL_30B,
|
||||
MODEL_34B,
|
||||
MODEL_40B,
|
||||
|
@ -1529,6 +1569,19 @@ struct llama_layer {
|
|||
struct ggml_tensor * ffn_down_b; // b2
|
||||
struct ggml_tensor * ffn_up_b; // b3
|
||||
struct ggml_tensor * ffn_act;
|
||||
|
||||
// intern_ml ploras:
|
||||
struct ggml_tensor * attn_qkv_lora_a; // qkv
|
||||
struct ggml_tensor * attn_qkv_lora_b;
|
||||
struct ggml_tensor * attn_out_lora_a; // wo
|
||||
struct ggml_tensor * attn_out_lora_b;
|
||||
struct ggml_tensor * ffn_up_lora_a; // w3
|
||||
struct ggml_tensor * ffn_up_lora_b;
|
||||
struct ggml_tensor * ffn_gate_lora_a; // w1
|
||||
struct ggml_tensor * ffn_gate_lora_b;
|
||||
struct ggml_tensor * ffn_down_lora_a; // w2
|
||||
struct ggml_tensor * ffn_down_lora_b;
|
||||
|
||||
};
|
||||
|
||||
struct llama_kv_cell {
|
||||
|
@ -2367,7 +2420,6 @@ struct llama_model_loader {
|
|||
case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
|
||||
case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
|
||||
case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
|
||||
case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
|
||||
default:
|
||||
{
|
||||
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
||||
|
@ -2716,7 +2768,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|||
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XSS - 3.0625 bpw";
|
||||
|
||||
default: return "unknown, may not work";
|
||||
}
|
||||
|
@ -2731,6 +2782,7 @@ static const char * llama_model_type_name(e_model type) {
|
|||
case MODEL_13B: return "13B";
|
||||
case MODEL_14B: return "14B";
|
||||
case MODEL_15B: return "15B";
|
||||
case MODEL_20B: return "20B";
|
||||
case MODEL_30B: return "30B";
|
||||
case MODEL_34B: return "34B";
|
||||
case MODEL_40B: return "40B";
|
||||
|
@ -2743,6 +2795,14 @@ static const char * llama_model_type_name(e_model type) {
|
|||
default: return "?B";
|
||||
}
|
||||
}
|
||||
static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
|
||||
switch (type) {
|
||||
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
|
||||
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
|
||||
model.arch = ml.get_arch();
|
||||
|
@ -3006,6 +3066,15 @@ static void llm_load_hparams(
|
|||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_INTERNLM2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 32: model.type = e_model::MODEL_7B; break;
|
||||
case 48: model.type = e_model::MODEL_20B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
default: (void)0;
|
||||
}
|
||||
|
||||
|
@ -3269,7 +3338,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|||
// hparams
|
||||
LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
|
||||
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
|
||||
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
|
||||
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
|
||||
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
||||
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
|
||||
LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
|
||||
|
@ -4018,8 +4087,54 @@ static bool llm_load_tensors(
|
|||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_INTERNLM2:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
|
||||
// output
|
||||
{
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
ggml_context * ctx_layer = ctx_for_layer(i);
|
||||
ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
// layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
||||
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
||||
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
||||
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
||||
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
|
||||
// plora support - if available
|
||||
|
||||
if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_QKV_LORA_A, "weight", i).c_str()) >= 0) {
|
||||
LLAMA_LOG_INFO("Found LORA tensors for layer %d\n", i);
|
||||
// this is not correct {n_embd, hparams.n_head * hparams.n_head_kv}
|
||||
layer.attn_qkv_lora_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV_LORA_A, "weight", i), {n_embd, hparams.n_head * hparams.n_head_kv});
|
||||
layer.attn_qkv_lora_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV_LORA_B, "weight", i), {hparams.n_head * hparams.n_head_kv, n_embd + 2*n_embd_gqa});
|
||||
|
||||
|
||||
layer.attn_out_lora_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT_LORA_A, "weight", i), {n_embd, 256}); // wo
|
||||
layer.attn_out_lora_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT_LORA_B, "weight", i), {256, n_embd});
|
||||
layer.ffn_up_lora_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_LORA_A, "weight", i), {n_embd, 256}); // w3
|
||||
layer.ffn_up_lora_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_LORA_B, "weight", i), {256, n_ff});
|
||||
layer.ffn_gate_lora_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_LORA_A, "weight", i), {n_embd, 256}); // w1
|
||||
layer.ffn_gate_lora_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_LORA_B, "weight", i), {256, n_ff});
|
||||
layer.ffn_down_lora_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_LORA_A, "weight", i), {n_ff, 256}); // w2
|
||||
layer.ffn_down_lora_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_LORA_B, "weight", i), {256, n_embd});
|
||||
}
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
|
@ -6589,6 +6704,126 @@ struct llm_build_context {
|
|||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_internlm2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph(
|
||||
|
@ -6746,6 +6981,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|||
case LLM_ARCH_ORION:
|
||||
{
|
||||
result = llm.build_orion();
|
||||
} break;
|
||||
case LLM_ARCH_INTERNLM2:
|
||||
{
|
||||
result = llm.build_internlm2();
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
|
@ -6772,6 +7011,7 @@ static int llama_decode_internal(
|
|||
|
||||
if (n_tokens == 0) {
|
||||
LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
|
||||
fprintf(stderr, "%s: n_tokens == 0\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
@ -6840,7 +7080,7 @@ static int llama_decode_internal(
|
|||
}
|
||||
|
||||
if (!llama_kv_cache_find_slot(kv_self, batch)) {
|
||||
return 1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
||||
|
@ -6878,6 +7118,11 @@ static int llama_decode_internal(
|
|||
n_threads = std::min(4, n_threads);
|
||||
}
|
||||
|
||||
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
|
||||
if ((ggml_cpu_has_cublas() || ggml_cpu_has_vulkan()) && fully_offloaded) {
|
||||
n_threads = 1;
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
const int64_t n_layer = hparams.n_layer;
|
||||
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
||||
|
@ -7653,7 +7898,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|||
}
|
||||
}
|
||||
|
||||
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
|
||||
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special, bool add_space_prefix) {
|
||||
std::vector<llama_vocab::id> output;
|
||||
|
||||
// OG tokenizer behavior:
|
||||
|
@ -7689,7 +7934,9 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|||
//
|
||||
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||
if (&fragment == &fragment_buffer.front()) {
|
||||
raw_text = " " + raw_text; // prefix with space if the first token is not special
|
||||
if (add_space_prefix){
|
||||
raw_text = " " + raw_text; // prefix with space if the first token is not special
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef PRETOKENIZERDEBUG
|
||||
|
@ -8703,7 +8950,6 @@ void llama_sample_classifier_free_guidance(
|
|||
|
||||
llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
|
||||
GGML_ASSERT(ctx);
|
||||
|
||||
auto N = float(llama_n_vocab(llama_get_model(ctx)));
|
||||
int64_t t_start_sample_us;
|
||||
t_start_sample_us = ggml_time_us();
|
||||
|
@ -9234,13 +9480,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|||
else if (new_type != GGML_TYPE_Q8_0) {
|
||||
new_type = GGML_TYPE_Q6_K;
|
||||
}
|
||||
} else if (name == "token_embd.weight") {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
||||
new_type = GGML_TYPE_Q2_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
||||
if (name.find("attn_v.weight") != std::string::npos) {
|
||||
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
|
||||
|
@ -9251,6 +9490,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|||
if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
|
||||
++qs.i_ffn_down;
|
||||
}
|
||||
else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
|
||||
} else if (name.find("attn_v.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
|
||||
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
|
||||
|
@ -9258,9 +9498,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_gqa() >= 4) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
||||
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
||||
}
|
||||
|
@ -9298,9 +9535,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
|
||||
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
//else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
// if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
|
||||
//}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
||||
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
|
||||
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
|
||||
|
@ -9332,14 +9566,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|||
} else if (name.find("attn_output.weight") != std::string::npos) {
|
||||
if (arch != LLM_ARCH_FALCON) {
|
||||
if (qs.model.hparams.n_expert == 8) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
|
||||
new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
} else {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
|
@ -9382,8 +9615,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|||
bool convert_incompatible_tensor = false;
|
||||
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
||||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
|
||||
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
|
||||
new_type == GGML_TYPE_IQ3_XXS) {
|
||||
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) {
|
||||
int nx = tensor->ne[0];
|
||||
int ny = tensor->ne[1];
|
||||
if (nx % QK_K != 0) {
|
||||
|
@ -9397,7 +9629,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|||
switch (new_type) {
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
|
||||
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
|
||||
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
|
||||
|
@ -9439,7 +9670,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
|
||||
|
||||
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
||||
}
|
||||
|
@ -9545,7 +9775,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
struct ggml_tensor * tensor = ml.get_tensor_meta(i);
|
||||
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
|
||||
|
||||
if (!ml.use_mmap) {
|
||||
if (read_data.size() < ggml_nbytes(tensor)) {
|
||||
read_data.resize(ggml_nbytes(tensor));
|
||||
|
@ -11266,7 +11496,8 @@ int32_t llama_tokenize(
|
|||
int32_t n_max_tokens,
|
||||
bool add_bos,
|
||||
bool special) {
|
||||
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
|
||||
bool add_space_prefix = model->arch == LLM_ARCH_INTERNLM2;
|
||||
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special, add_space_prefix);
|
||||
|
||||
if (n_max_tokens < (int) res.size()) {
|
||||
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
|
||||
|
@ -11281,7 +11512,7 @@ int32_t llama_tokenize(
|
|||
}
|
||||
|
||||
static std::string llama_decode_text(const std::string & text) {
|
||||
std::string decoded_text;
|
||||
std::string decoded_text;
|
||||
auto unicode_sequences = codepoints_from_utf8(text);
|
||||
for (auto& unicode_sequence : unicode_sequences) {
|
||||
decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
|
||||
|
@ -11292,7 +11523,7 @@ static std::string llama_decode_text(const std::string & text) {
|
|||
|
||||
// does not write null-terminator to buf
|
||||
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
|
||||
if (0 <= token && token < llama_n_vocab(model)) {
|
||||
if (0 <= token && token < llama_n_vocab(model)) {
|
||||
switch (llama_vocab_get_type(model->vocab)) {
|
||||
case LLAMA_VOCAB_TYPE_SPM: {
|
||||
// NOTE: we accept all unsupported token types,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue