Merge branch 'gguf' of https://github.com/goerch/llama.cpp into gguf

This commit is contained in:
goerch 2023-08-21 18:48:23 +02:00
commit a856685648
17 changed files with 765 additions and 309 deletions

View file

@ -9,13 +9,13 @@
Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
**Hot topics:**
### 🚧 Incoming breaking change + refactoring:
- Simple web chat example: https://github.com/ggerganov/llama.cpp/pull/1998
- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
- New roadmap: https://github.com/users/ggerganov/projects/7
- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
See PR https://github.com/ggerganov/llama.cpp/pull/2398 for more info.
To devs: avoid making big changes to `llama.h` / `llama.cpp` until merged
----
<details>
<summary>Table of Contents</summary>
@ -99,6 +99,7 @@ as the main playground for developing new features for the [ggml](https://github
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
**UI:**

View file

@ -118,6 +118,11 @@ gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
if "type" in hparams["rope_scaling"]:
if hparams["rope_scaling"]["type"] == "linear":
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
# TOKENIZATION
@ -147,9 +152,7 @@ if Path(dir_model + "/tokenizer.model").is_file():
if tokenizer.is_control(i):
toktype = 3
# TODO: How to determinate if a token is user defined?
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = 4
# toktype = 4 is user-defined = tokens from added_tokens.json
if tokenizer.is_unused(i):
toktype = 5
@ -160,6 +163,17 @@ if Path(dir_model + "/tokenizer.model").is_file():
scores.append(score)
toktypes.append(toktype)
if Path(dir_model + "/added_tokens.json").is_file():
with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
addtokens_json = json.load(f)
print("gguf: get added tokens")
for key in addtokens_json:
tokens.append( key.encode("utf-8") )
scores.append(-1000.0)
toktypes.append(4) # user-defined token type
gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)

View file

@ -0,0 +1,334 @@
import sys, struct, math, argparse
from pathlib import Path
import numpy as np
import gguf
# Note: Does not support GGML_QKK_64
QK_K = 256
# Items here are (block size, type size)
GGML_QUANT_SIZES = {
gguf.GGMLQuantizationType.F32 : (1, 4),
gguf.GGMLQuantizationType.F16 : (1, 2),
gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
}
class Hyperparameters:
def __init__(self):
self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
self.n_ff = 0
def set_n_ff(self, model):
ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
ff_tensor = model.tensors[ff_tensor_idx]
self.n_ff = ff_tensor.dims[1]
def load(self, data, offset):
(
self.n_vocab,
self.n_embd,
self.n_mult,
self.n_head,
self.n_layer,
self.n_rot,
self.ftype,
) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
return 4 * 7
def __str__(self):
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
class Vocab:
def __init__(self):
self.items = []
def load(self, data, offset, n_vocab):
orig_offset = offset
for _ in range(n_vocab):
itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
assert itemlen < 4096, 'Absurd vocab item length'
offset += 4
vocab = bytes(data[offset:offset + itemlen])
offset += itemlen
score = struct.unpack('<f', data[offset:offset + 4])[0]
offset += 4
self.items.append((vocab, score))
return offset - orig_offset
class Tensor:
def __init__(self):
self.name = None
self.dims = ()
self.dtype = None
self.start_offset = 0
self.len_bytes = 0
def load(self, data, offset):
orig_offset = offset
(n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
assert name_len < 4096, 'Absurd tensor name length'
quant = GGML_QUANT_SIZES.get(dtype)
assert quant is not None, 'Unknown tensor type'
(blksize, tysize) = quant
offset += 12
self.dtype= dtype
self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
offset += 4 * n_dims
self.name = bytes(data[offset:offset + name_len])
offset += name_len
pad = ((offset + 31) & ~31) - offset
offset += pad
n_elems = np.prod(self.dims)
n_bytes = (n_elems * tysize) // blksize
self.start_offset = offset
self.len_bytes = n_bytes
offset += n_bytes
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
return offset - orig_offset
class GGMLV3Model:
def __init__(self):
self.hyperparameters = None
self.vocab = None
self.tensor_map = {}
self.tensors = []
def validate_header(self, data, offset):
if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
raise ValueError('Only GGJTv3 supported')
return 8
def load(self, data, offset):
offset += self.validate_header(data, offset)
hp = Hyperparameters()
offset += hp.load(data, offset)
vocab = Vocab()
offset += vocab.load(data, offset, hp.n_vocab)
tensors = []
tensor_map = {}
while offset < len(data):
tensor = Tensor()
offset += tensor.load(data, offset)
tensor_map[tensor.name] = len(tensors)
tensors.append(tensor)
self.hyperparameters = hp
self.vocab = vocab
self.tensors = tensors
self.tensor_map = tensor_map
hp.set_n_ff(self)
return offset
class GGMLToGGUF:
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None):
hp = ggml_model.hyperparameters
self.model = ggml_model
self.data = data
self.cfg = cfg
self.params_override = params_override
self.vocab_override = vocab_override
if params_override is not None:
n_kv_head = params_override.n_head_kv
else:
if cfg.gqa == 1:
n_kv_head = hp.n_head
else:
gqa = float(cfg.gqa)
n_kv_head = None
for x in range(1, 256):
if float(hp.n_head) / float(x) == gqa:
n_kv_head = x
assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
self.n_kv_head = n_kv_head
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
def save(self):
print('* Preparing to save GGUF file')
gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
self.add_params(gguf_writer)
self.add_vocab(gguf_writer)
self.add_tensors(gguf_writer)
print(" gguf: write header")
gguf_writer.write_header_to_file()
print(" gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print(" gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
def add_params(self, gguf_writer):
hp = self.model.hyperparameters
cfg = self.cfg
desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
try:
# Filenames aren't necessarily valid UTF8.
name = cfg.name if cfg.name is not None else cfg.input.name
except UnicodeDecodeError:
name = None
print('* Adding model parameters and KV items')
if name is not None:
gguf_writer.add_name(name)
gguf_writer.add_description(desc)
if self.params_override is not None:
po = self.params_override
assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
gguf_writer.add_context_length (po.n_ctx)
gguf_writer.add_embedding_length (po.n_embd)
gguf_writer.add_block_count (po.n_layer)
gguf_writer.add_feed_forward_length (po.n_ff)
gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
gguf_writer.add_head_count (po.n_head)
gguf_writer.add_head_count_kv (po.n_head_kv)
gguf_writer.add_layer_norm_rms_eps (po.f_norm_eps)
return
gguf_writer.add_context_length(cfg.context_length)
gguf_writer.add_embedding_length(hp.n_embd)
gguf_writer.add_block_count(hp.n_layer)
gguf_writer.add_feed_forward_length(hp.n_ff)
gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
gguf_writer.add_head_count(hp.n_head)
gguf_writer.add_head_count_kv(self.n_kv_head)
gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
def add_vocab(self, gguf_writer):
hp = self.model.hyperparameters
gguf_writer.add_tokenizer_model('llama')
tokens = []
scores = []
toktypes = []
if self.vocab_override is not None:
vo = self.vocab_override
print('* Adding vocab item(s)')
for (idx, vitem) in enumerate(vo.all_tokens()):
if len(vitem) == 3:
tokens.append(vitem[0])
scores.append(vitem[1])
toktypes.append(vitem[2])
else:
# Maybe try to guess the token type here?
tokens.append(vitem[0])
scores.append(vitem[1])
assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
if len(toktypes) > 0:
gguf_writer.add_token_types(toktypes)
return
print(f'* Adding {hp.n_vocab} vocab item(s)')
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
tt = 1 # Normal
if len(vbytes) == 0:
tt = 3 # Control
elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
hv = hex(vbytes[0])[2:].upper()
vbytes = bytes(f'<0x{hv}>', encoding = 'UTF-8')
tt = 6 # Byte
else:
vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
toktypes.append(tt)
tokens.append(vbytes)
scores.append(vscore)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
def add_tensors(self, gguf_writer):
nm = self.name_map
data = self.data
print(f'* Adding {len(self.model.tensors)} tensor(s)')
for tensor in self.model.tensors:
name = str(tensor.name, 'UTF-8')
if name.endswith('.weight'):
name = name[:-7]
suffix = '.weight'
elif name.endswith('.bias'):
name = name[:-5]
suffix = '.bias'
mapped_name = nm.get(name)
assert mapped_name is not None, f'Bad name {name}'
mapped_name += suffix
tempdims = list(tensor.dims[:])
if len(tempdims) > 1:
temp = tempdims[1]
tempdims[1] = tempdims[0]
tempdims[0] = temp
# print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
def handle_metadata(cfg, hp):
import convert
assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
hf_config_path = cfg.model_metadata_dir / "config.json"
orig_config_path = cfg.model_metadata_dir / "params.json"
# We pass a fake model here. "original" mode will check the shapes of some
# tensors if information is missing in the .json file: other than that, the
# model data isn't used so this should be safe (at least for now).
fakemodel = {
'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
}
fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
if hf_config_path.exists():
params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
elif orig_config_path.exists():
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
else:
raise ValueError('Unable to load metadata')
vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
convert.check_vocab_size(params, vocab)
return (params, vocab)
def handle_args():
parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
parser.add_argument('--input', '-i', type = Path, help = 'Input GGMLv3 filename')
parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename')
parser.add_argument('--name', help = 'Set model name')
parser.add_argument('--desc', help = 'Set model description')
parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
return parser.parse_args()
def main():
cfg = handle_args()
print(f'* Using config: {cfg}')
print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
data = np.memmap(cfg.input, mode = 'r')
model = GGMLV3Model()
print('* Scanning GGML input file')
offset = model.load(data, 0)
print(f'* GGML model hyperparameters: {model.hyperparameters}')
vocab_override = None
params_override = None
if cfg.model_metadata_dir is not None:
(params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters)
print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
print(f'* Overriding params: {params_override}')
print(f'* Overriding vocab: {vocab_override}')
else:
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override)
converter.save()
print(f'* Successful completion. Output saved to: {cfg.output}')
main()

View file

@ -126,6 +126,11 @@ gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
if "type" in hparams["rope_scaling"]:
if hparams["rope_scaling"]["type"] == "linear":
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
# TOKENIZATION
@ -155,9 +160,7 @@ if Path(dir_model + "/tokenizer.model").is_file():
if tokenizer.is_control(i):
toktype = 3
# TODO: How to determinate if a token is user defined?
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = 4
# toktype = 4 is user-defined = tokens from added_tokens.json
if tokenizer.is_unused(i):
toktype = 5
@ -168,6 +171,18 @@ if Path(dir_model + "/tokenizer.model").is_file():
scores.append(score)
toktypes.append(toktype)
if Path(dir_model + "/added_tokens.json").is_file():
with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
addtokens_json = json.load(f)
print("gguf: get added tokens")
for key in addtokens_json:
tokens.append( key.encode("utf-8") )
scores.append(-1000.0)
toktypes.append(4) # user-defined token type
gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
@ -264,7 +279,9 @@ for part_name in part_names:
data = data.squeeze().numpy()
# reverse permute these
if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
if name.endswith(".q_proj.weight"):
data = reverse_hf_permute(data, head_count)
if name.endswith(".k_proj.weight"):
data = reverse_hf_permute(data, head_count, head_count_kv)
# map tensor names

26
convert.py Executable file → Normal file
View file

@ -241,11 +241,13 @@ class BpeVocab:
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
else:
added_tokens = {}
vocab_size: int = len(self.bpe_tokenizer)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_list = [text for (text, idx) in items]
self.vocab_size_base: int = vocab_size
@ -261,12 +263,12 @@ class BpeVocab:
for i, item in enumerate(tokenizer):
text: bytes = item.encode("utf-8")
score: float = -i
yield text, score, 4
yield text, score, gguf.TokenType.USER_DEFINED
def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
for text in self.added_tokens_list:
score = -1000.0
yield text.encode("utf-8"), score, 4
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
yield from self.bpe_tokens()
@ -304,27 +306,27 @@ class SentencePieceVocab:
text: bytes = piece.encode("utf-8")
score: float = tokenizer.get_score(i)
toktype = 1 # defualt to normal token type
toktype = gguf.TokenType.NORMAL
if tokenizer.is_unknown(i):
toktype = 2
toktype = gguf.TokenType.UNKNOWN
if tokenizer.is_control(i):
toktype = 3
toktype = gguf.TokenType.CONTROL
# NOTE: I think added_tokens are user defined.
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = 4
# if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
if tokenizer.is_unused(i):
toktype = 5
toktype = gguf.TokenType.UNUSED
if tokenizer.is_byte(i):
toktype = 6
toktype = gguf.TokenType.BYTE
yield text, score, toktype
def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
for text in self.added_tokens_list:
score = -1000.0
yield text.encode("utf-8"), score, 4
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
yield from self.sentencepiece_tokens()
@ -342,6 +344,7 @@ Vocab = Union[BpeVocab, SentencePieceVocab]
#
def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
#print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
if n_head_kv is not None and n_head != n_head_kv:
n_head //= n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
@ -724,6 +727,7 @@ class OutputFile:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
def add_meta_arch(self, params: Params) -> None:
self.gguf.add_name ("llama")
self.gguf.add_context_length (params.n_ctx)
self.gguf.add_embedding_length (params.n_embd)
self.gguf.add_block_count (params.n_layer)
@ -836,12 +840,12 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
for i in itertools.count():
if f"model.layers.{i}.self_attn.q_proj.weight" in model:
print(f"Permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head_kv)
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
#tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
print(f"Unpacking and permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head_kv)
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
else:

View file

@ -139,14 +139,16 @@ void print_sample_weights(TransformerWeights *w){
struct llama_vocab {
using id = int32_t;
using token = std::string;
using ttype = llama_token_type;
struct token_score {
token tok;
struct token_data {
token text;
float score;
ttype type;
};
std::unordered_map<token, id> token_to_id;
std::vector<token_score> id_to_token;
std::vector<token_data> id_to_token;
};
struct my_llama_hparams {
@ -516,36 +518,30 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
std::vector<const char *> strings;
std::vector<float> scores;
int n_vocab = llama_n_vocab(lctx);
strings.resize(n_vocab, NULL);
scores.resize(n_vocab, 0);
n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
const int n_vocab = llama_n_vocab(lctx);
vocab->id_to_token.resize(n_vocab);
for (int i=0; i<n_vocab; ++i) {
std::string tok = std::string(strings[i]);
float score = scores[i];
vocab->id_to_token[i].tok = tok;
vocab->id_to_token[i].score = score;
vocab->token_to_id.emplace(tok, i);
vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
}
llama_free(lctx);
llama_free_model(lmodel);
} else { // assume llama2.c vocabulary
printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
llama_file file(filename, "rb");
uint32_t n_vocab = config->vocab_size;
const int n_vocab = config->vocab_size;
/* uint32_t max_token_length = */ file.read_u32(); // unused
vocab->id_to_token.resize(n_vocab);
for (uint32_t i=0; i<n_vocab; ++i) {
for (int i=0; i<n_vocab; ++i) {
float_t score = file.read_f32();
uint32_t len = file.read_u32();
std::string tok = file.read_string(len);
vocab->id_to_token[i].tok = tok;
std::string text = file.read_string(len);
vocab->id_to_token[i].text = text;
vocab->id_to_token[i].score = score;
vocab->token_to_id.emplace(tok, i);
vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
vocab->token_to_id.emplace(text, i);
}
}
}
@ -611,10 +607,10 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
// // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
// uint32_t n_vocab = model->hparams.n_vocab;
// for (uint32_t i = 0; i < n_vocab; i++) {
// const auto & token_score = vocab->id_to_token.at(i);
// file.write_u32((uint32_t) token_score.tok.size());
// file.write_raw(token_score.tok.data(), token_score.tok.size());
// file.write_raw(&token_score.score, sizeof(token_score.score));
// const auto & token_data = vocab->id_to_token.at(i);
// file.write_u32((uint32_t) token_data.tok.size());
// file.write_raw(token_data.tok.data(), token_data.tok.size());
// file.write_raw(&token_data.score, sizeof(token_data.score));
// }
//
// // stuff AK weights into GG weights one by one.

View file

@ -5,6 +5,7 @@
#include <cmath>
#include <ctime>
#include <sstream>
#include <cstring>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@ -121,6 +122,27 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
printf("\n");
}
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
int n_vocab, int n_thread) {
std::vector<float> result;
result.reserve(tokens.size() * n_vocab);
size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
size_t n_tokens = tokens.size() - i_chunk * n_batch;
n_tokens = std::min(n_tokens, size_t(n_batch));
if (llama_eval(ctx, tokens.data() + i_chunk * n_batch, n_tokens, n_past, n_thread)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return {};
}
const auto logits = llama_get_logits(ctx);
result.insert(result.end(), logits, logits + n_tokens * n_vocab);
n_past += n_tokens;
}
return result;
}
void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// Calculates hellaswag score (acc_norm) from prompt
//
@ -209,17 +231,19 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
double acc = 0.0f;
const int n_vocab = llama_n_vocab(ctx);
std::vector<float> tok_logits(n_vocab);
for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
// Tokenize the context to count tokens
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
size_t context_size = context_embd.size();
for (size_t ending_idx=0;ending_idx<4;ending_idx++) {
// Tokenize the query
std::vector<int> query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[ending_idx], prepend_bos);
size_t query_size = query_embd.size();
// Do the 1st ending
// In this case we include the context when evaluating
auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], prepend_bos);
auto query_size = query_embd.size();
//printf("First query: %d\n",(int)query_size);
// Stop if query wont fit the ctx window
if (query_size > (size_t)params.n_ctx) {
@ -232,25 +256,66 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
query_embd.resize(32);
}
// Evaluate the query
if (llama_eval(ctx, query_embd.data(), query_embd.size(), 0, params.n_threads)) {
auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads);
if (logits.empty()) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
const auto query_logits = llama_get_logits(ctx);
std::vector<float> logits;
logits.insert(logits.end(), query_logits, query_logits + query_size * n_vocab);
std::memcpy(tok_logits.data(), logits.data() + (context_size-1)*n_vocab, n_vocab*sizeof(float));
const auto first_probs = softmax(tok_logits);
hs_data[task_idx].ending_logprob_count[ending_idx] = 0;
hs_data[task_idx].ending_logprob[ending_idx] = 0.0f;
hs_data[task_idx].ending_logprob_count[0] = 1;
hs_data[task_idx].ending_logprob[0] = std::log(first_probs[query_embd[context_size]]);
// Calculate the logprobs over the ending
for (size_t j = context_size-1; j < query_size - 1; j++) {
// Calculate probability of next token, given the previous ones.
const std::vector<float> tok_logits(
logits.begin() + (j + 0) * n_vocab,
logits.begin() + (j + 1) * n_vocab);
for (size_t j = context_size; j < query_size - 1; j++) {
std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
const float prob = softmax(tok_logits)[query_embd[j + 1]];
hs_data[task_idx].ending_logprob[0] += std::log(prob);
hs_data[task_idx].ending_logprob_count[0]++;
}
// Calculate the mean token logprob for acc_norm
hs_data[task_idx].ending_logprob[0] /= hs_data[task_idx].ending_logprob_count[0];
// Do the remaining endings
// For these, we use the bare ending with n_past = context_size
//
for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
// Tokenize the query
query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
query_size = query_embd.size();
// Stop if query wont fit the ctx window
if (context_size + query_size > (size_t)params.n_ctx) {
fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
return;
}
// Speedup small evaluations by evaluating atleast 32 tokens
// No, resizing to 32 is actually slightly slower (at least on CUDA)
//if (query_size < 32) {
// query_embd.resize(32);
//}
// Evaluate the query
logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab, params.n_threads);
if (logits.empty()) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
hs_data[task_idx].ending_logprob_count[ending_idx] = 1;
hs_data[task_idx].ending_logprob[ending_idx] = std::log(first_probs[query_embd[0]]);
// Calculate the logprobs over the ending
for (size_t j = 0; j < query_size - 1; j++) {
std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
const float prob = softmax(tok_logits)[query_embd[j + 1]];
@ -267,9 +332,9 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
// Find the ending with maximum logprob
size_t ending_logprob_max_idx = -1;
double ending_logprob_max_val = -INFINITY;
for (size_t j=0; j < 4; j++) {
size_t ending_logprob_max_idx = 0;
double ending_logprob_max_val = hs_data[task_idx].ending_logprob[0];
for (size_t j = 1; j < 4; j++) {
if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
ending_logprob_max_idx = j;
ending_logprob_max_val = hs_data[task_idx].ending_logprob[j];

View file

@ -11,8 +11,10 @@ echo >> $PUBLIC/index.js # add newline
FILES=$(ls $PUBLIC)
cd $PUBLIC
for FILE in $FILES; do
func=$(echo $FILE | tr '.' '_')
echo "generate $FILE.hpp ($func)"
xxd -n $func -i $PUBLIC/$FILE > $DIR/$FILE.hpp
echo "generate $FILE.hpp"
# use simple flag for old version of xxd
xxd -i $FILE > $DIR/$FILE.hpp
done

View file

@ -144,12 +144,12 @@
import { SchemaConverter } from '/json-schema-to-grammar.mjs';
const session = signal({
prompt: "This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.",
prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
template: "{{prompt}}\n\n{{history}}\n{{char}}:",
historyTemplate: "{{name}}: {{message}}",
transcript: [],
type: "chat",
char: "llama",
char: "Llama",
user: "User",
})

View file

@ -170,14 +170,16 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc
struct llama_vocab {
using id = int32_t;
using token = std::string;
using ttype = llama_token_type;
struct token_score {
token tok;
struct token_data {
token text;
float score;
ttype type;
};
std::unordered_map<token, id> token_to_id;
std::vector<token_score> id_to_token;
std::vector<token_data> id_to_token;
};
struct my_llama_hparams {
@ -2629,10 +2631,10 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
// // write_vocab
// uint32_t n_vocab = model->hparams.n_vocab;
// for (uint32_t i = 0; i < n_vocab; i++) {
// const auto & token_score = vocab->id_to_token.at(i);
// file.write_u32((uint32_t) token_score.tok.size());
// file.write_raw(token_score.tok.data(), token_score.tok.size());
// file.write_raw(&token_score.score, sizeof(token_score.score));
// const auto & token_data = vocab->id_to_token.at(i);
// file.write_u32((uint32_t) token_data.tok.size());
// file.write_raw(token_data.tok.data(), token_data.tok.size());
// file.write_raw(&token_data.score, sizeof(token_data.score));
// }
// // write tensors
// write_tensor(&file, model->tok_embeddings);
@ -3055,20 +3057,13 @@ int main(int argc, char ** argv) {
struct llama_vocab vocab;
{
std::vector<const char *> strings;
std::vector<float> scores;
int n_vocab = llama_n_vocab(lctx);
strings.resize(n_vocab, NULL);
scores.resize(n_vocab, 0);
n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
const int n_vocab = llama_n_vocab(lctx);
vocab.id_to_token.resize(n_vocab);
for (int i=0; i<n_vocab; ++i) {
std::string tok = std::string(strings[i]);
float score = scores[i];
vocab.id_to_token[i].tok = tok;
vocab.id_to_token[i].score = score;
vocab.token_to_id.emplace(tok, i);
vocab.id_to_token[i].text = llama_token_get_text(lctx, i);
vocab.id_to_token[i].score = llama_token_get_score(lctx, i);
vocab.id_to_token[i].type = llama_token_get_type(lctx, i);
vocab.token_to_id.emplace(vocab.id_to_token[i].text, i);
}
}

View file

@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
threadgroup_barrier(mem_flags::mem_device);
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup_barrier(mem_flags::mem_device);
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
if (sgitg==0) {
for (int i = 0; i < n_rows; i++) {

244
ggml.c
View file

@ -1643,11 +1643,37 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
[GGML_TYPE_I8] = {
.type_name = "i8",
.blck_size = 1,
.type_size = sizeof(int8_t),
.is_quantized = false,
},
[GGML_TYPE_I16] = {
.type_name = "i16",
.blck_size = 1,
.type_size = sizeof(int16_t),
.is_quantized = false,
},
[GGML_TYPE_I32] = {
.type_name = "i32",
.blck_size = 1,
.type_size = sizeof(int32_t),
.is_quantized = false,
},
[GGML_TYPE_F32] = {
.type_name = "f32",
.blck_size = 1,
.type_size = sizeof(float),
.is_quantized = false,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
},
[GGML_TYPE_F16] = {
.type_name = "f16",
.blck_size = 1,
.type_size = sizeof(ggml_fp16_t),
.is_quantized = false,
.to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
@ -1655,6 +1681,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_F16,
},
[GGML_TYPE_Q4_0] = {
.type_name = "q4_0",
.blck_size = QK4_0,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_0,
.from_float = quantize_row_q4_0,
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
@ -1662,6 +1692,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q4_1] = {
.type_name = "q4_1",
.blck_size = QK4_1,
.type_size = sizeof(block_q4_1),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_1,
.from_float = quantize_row_q4_1,
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
@ -1669,6 +1703,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_1,
},
[GGML_TYPE_Q5_0] = {
.type_name = "q5_0",
.blck_size = QK5_0,
.type_size = sizeof(block_q5_0),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_0,
.from_float = quantize_row_q5_0,
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
@ -1676,6 +1714,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q5_1] = {
.type_name = "q5_1",
.blck_size = QK5_1,
.type_size = sizeof(block_q5_1),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_1,
.from_float = quantize_row_q5_1,
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
@ -1683,6 +1725,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_1,
},
[GGML_TYPE_Q8_0] = {
.type_name = "q8_0",
.blck_size = QK8_0,
.type_size = sizeof(block_q8_0),
.is_quantized = true,
.to_float = dequantize_row_q8_0,
.from_float = quantize_row_q8_0,
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
@ -1690,12 +1736,20 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_0,
},
[GGML_TYPE_Q8_1] = {
.type_name = "q8_1",
.blck_size = QK8_1,
.type_size = sizeof(block_q8_1),
.is_quantized = true,
.from_float = quantize_row_q8_1,
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
.vec_dot_type = GGML_TYPE_Q8_1,
},
#ifdef GGML_USE_K_QUANTS
[GGML_TYPE_Q2_K] = {
.type_name = "q2_K",
.blck_size = QK_K,
.type_size = sizeof(block_q2_K),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q2_K,
.from_float = quantize_row_q2_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
@ -1703,6 +1757,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q3_K] = {
.type_name = "q3_K",
.blck_size = QK_K,
.type_size = sizeof(block_q3_K),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q3_K,
.from_float = quantize_row_q3_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
@ -1710,6 +1768,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q4_K] = {
.type_name = "q4_K",
.blck_size = QK_K,
.type_size = sizeof(block_q4_K),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_K,
.from_float = quantize_row_q4_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
@ -1717,6 +1779,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q5_K] = {
.type_name = "q5_K",
.blck_size = QK_K,
.type_size = sizeof(block_q5_K),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_K,
.from_float = quantize_row_q5_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
@ -1724,6 +1790,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q6_K] = {
.type_name = "q6_K",
.blck_size = QK_K,
.type_size = sizeof(block_q6_K),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q6_K,
.from_float = quantize_row_q6_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
@ -1731,15 +1801,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K,
},
[GGML_TYPE_Q8_K] = {
.type_name = "q8_K",
.blck_size = QK_K,
.type_size = sizeof(block_q8_K),
.is_quantized = true,
.from_float = quantize_row_q8_K,
}
#endif
};
// For internal test use
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) {
GGML_ASSERT(i < GGML_TYPE_COUNT);
return type_traits[i];
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
GGML_ASSERT(type < GGML_TYPE_COUNT);
return type_traits[type];
}
@ -3648,98 +3722,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
*s = idx;
}
//
// data types
//
static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = 1,
[GGML_TYPE_F16] = 1,
[GGML_TYPE_Q4_0] = QK4_0,
[GGML_TYPE_Q4_1] = QK4_1,
[GGML_TYPE_Q5_0] = QK5_0,
[GGML_TYPE_Q5_1] = QK5_1,
[GGML_TYPE_Q8_0] = QK8_0,
[GGML_TYPE_Q8_1] = QK8_1,
#ifdef GGML_USE_K_QUANTS
[GGML_TYPE_Q2_K] = QK_K,
[GGML_TYPE_Q3_K] = QK_K,
[GGML_TYPE_Q4_K] = QK_K,
[GGML_TYPE_Q5_K] = QK_K,
[GGML_TYPE_Q6_K] = QK_K,
[GGML_TYPE_Q8_K] = QK_K,
#endif
[GGML_TYPE_I8] = 1,
[GGML_TYPE_I16] = 1,
[GGML_TYPE_I32] = 1,
};
static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = sizeof(float),
[GGML_TYPE_F16] = sizeof(ggml_fp16_t),
[GGML_TYPE_Q4_0] = sizeof(block_q4_0),
[GGML_TYPE_Q4_1] = sizeof(block_q4_1),
[GGML_TYPE_Q5_0] = sizeof(block_q5_0),
[GGML_TYPE_Q5_1] = sizeof(block_q5_1),
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
[GGML_TYPE_Q8_1] = sizeof(block_q8_1),
#ifdef GGML_USE_K_QUANTS
[GGML_TYPE_Q2_K] = sizeof(block_q2_K),
[GGML_TYPE_Q3_K] = sizeof(block_q3_K),
[GGML_TYPE_Q4_K] = sizeof(block_q4_K),
[GGML_TYPE_Q5_K] = sizeof(block_q5_K),
[GGML_TYPE_Q6_K] = sizeof(block_q6_K),
[GGML_TYPE_Q8_K] = sizeof(block_q8_K),
#endif
[GGML_TYPE_I8] = sizeof(int8_t),
[GGML_TYPE_I16] = sizeof(int16_t),
[GGML_TYPE_I32] = sizeof(int32_t),
};
static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = "f32",
[GGML_TYPE_F16] = "f16",
[GGML_TYPE_Q4_0] = "q4_0",
[GGML_TYPE_Q4_1] = "q4_1",
[GGML_TYPE_Q5_0] = "q5_0",
[GGML_TYPE_Q5_1] = "q5_1",
[GGML_TYPE_Q8_0] = "q8_0",
[GGML_TYPE_Q8_1] = "q8_1",
[GGML_TYPE_Q2_K] = "q2_K",
[GGML_TYPE_Q3_K] = "q3_K",
[GGML_TYPE_Q4_K] = "q4_K",
[GGML_TYPE_Q5_K] = "q5_K",
[GGML_TYPE_Q6_K] = "q6_K",
[GGML_TYPE_Q8_K] = "q8_K",
[GGML_TYPE_I8] = "i8",
[GGML_TYPE_I16] = "i16",
[GGML_TYPE_I32] = "i32",
};
static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = false,
[GGML_TYPE_F16] = false,
[GGML_TYPE_Q4_0] = true,
[GGML_TYPE_Q4_1] = true,
[GGML_TYPE_Q5_0] = true,
[GGML_TYPE_Q5_1] = true,
[GGML_TYPE_Q8_0] = true,
[GGML_TYPE_Q8_1] = true,
[GGML_TYPE_Q2_K] = true,
[GGML_TYPE_Q3_K] = true,
[GGML_TYPE_Q4_K] = true,
[GGML_TYPE_Q5_K] = true,
[GGML_TYPE_Q6_K] = true,
[GGML_TYPE_Q8_K] = true,
[GGML_TYPE_I8] = false,
[GGML_TYPE_I16] = false,
[GGML_TYPE_I32] = false,
};
static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"NONE",
@ -4109,7 +4091,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
//
// is enough, but just in case, adding the second part
return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
}
size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
@ -4119,23 +4101,27 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
}
int ggml_blck_size(enum ggml_type type) {
return GGML_BLCK_SIZE[type];
return type_traits[type].blck_size;
}
size_t ggml_type_size(enum ggml_type type) {
return GGML_TYPE_SIZE[type];
return type_traits[type].type_size;
}
float ggml_type_sizef(enum ggml_type type) {
return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
}
const char * ggml_type_name(enum ggml_type type) {
return GGML_TYPE_NAME[type];
return type_traits[type].type_name;
}
bool ggml_is_quantized(enum ggml_type type) {
return type_traits[type].is_quantized;
}
const char * ggml_op_name(enum ggml_op op) {
@ -4147,7 +4133,7 @@ const char * ggml_op_symbol(enum ggml_op op) {
}
size_t ggml_element_size(const struct ggml_tensor * tensor) {
return GGML_TYPE_SIZE[tensor->type];
return ggml_type_size(tensor->type);
}
static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
@ -4185,10 +4171,6 @@ static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct
(t0->ne[3] == t1->ne[3]);
}
bool ggml_is_quantized(enum ggml_type type) {
return GGML_IS_QUANTIZED[type];
}
enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
enum ggml_type wtype = GGML_TYPE_COUNT;
@ -4226,8 +4208,8 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] &&
tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
@ -4236,7 +4218,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
@ -4251,7 +4233,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
@ -4570,7 +4552,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
size_t data_size = 0;
if (data == NULL && !ctx->no_alloc) {
data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
for (int i = 1; i < n_dims; i++) {
data_size *= ne[i];
}
@ -4625,8 +4607,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
result->ne[i] = ne[i];
}
result->nb[0] = GGML_TYPE_SIZE[type];
result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]);
result->nb[0] = ggml_type_size(type);
result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
for (int i = 2; i < GGML_MAX_DIMS; i++) {
result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
}
@ -7748,7 +7730,7 @@ static void ggml_compute_forward_dup_same_cont(
memcpy(
((char *) dst->data + ie0*nb0),
((char *) src0->data + ie0*nb00),
(ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
(ie1 - ie0) * ggml_type_size(src0->type));
}
}
@ -7782,7 +7764,7 @@ static void ggml_compute_forward_dup_f16(
if (src0->type == dst->type &&
ne00 == ne0 &&
nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
// copy by rows
const size_t rs = ne00*nb00;
for (int64_t i03 = 0; i03 < ne03; i03++) {
@ -7840,7 +7822,7 @@ static void ggml_compute_forward_dup_f16(
float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
size_t id = 0;
size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
char * dst_ptr = (char *) dst->data;
for (int i03 = 0; i03 < ne03; i03++) {
@ -8053,7 +8035,7 @@ static void ggml_compute_forward_dup_f32(
if (src0->type == dst->type &&
ne00 == ne0 &&
nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
// copy by rows
const size_t rs = ne00*nb00;
for (int64_t i03 = 0; i03 < ne03; i03++) {
@ -8092,7 +8074,7 @@ static void ggml_compute_forward_dup_f32(
ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
size_t id = 0;
size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
char * dst_ptr = (char *) dst->data;
for (int i03 = 0; i03 < ne03; i03++) {
@ -8504,7 +8486,7 @@ static void ggml_compute_forward_add_q_f32(
ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb10 == sizeof(float));
// dst cannot be transposed or permuted
@ -8778,7 +8760,7 @@ static void ggml_compute_forward_add1_q_f32(
ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
// we don't support permuted src0
GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
GGML_ASSERT(nb00 == ggml_type_size(type));
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 <= nb1);
@ -10634,7 +10616,7 @@ static void ggml_compute_forward_mul_mat(
GGML_ASSERT(ne3 == ne13);
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb10 == sizeof(float));
// dst cannot be transposed or permuted
@ -10717,7 +10699,7 @@ static void ggml_compute_forward_mul_mat(
if (params->type == GGML_TASK_INIT) {
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
@ -10737,7 +10719,7 @@ static void ggml_compute_forward_mul_mat(
}
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = ne11*ne12*ne13; // src1 rows
@ -11210,7 +11192,7 @@ static void ggml_compute_forward_get_rows_q(
assert( dst->ne[0] == nc);
assert( dst->ne[1] == nr);
assert(src0->nb[0] == GGML_TYPE_SIZE[type]);
assert(src0->nb[0] == ggml_type_size(type));
for (int i = 0; i < nr; ++i) {
const int r = ((int32_t *) src1->data)[i];
@ -16387,7 +16369,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
size_t cur = 0;
if (ggml_is_quantized(node->type)) {
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
}
work_size = MAX(work_size, cur);
@ -16400,7 +16382,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
size_t cur = 0;
if (ggml_is_quantized(node->src[0]->type)) {
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks;
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
}
work_size = MAX(work_size, cur);
@ -16412,7 +16394,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
size_t cur = 0;
if (ggml_is_quantized(node->src[0]->type)) {
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks;
cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
}
work_size = MAX(work_size, cur);
@ -16495,12 +16477,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
// the threads are still spinning
if (node->src[0]->type != GGML_TYPE_F32) {
// here we need memory just for single 2D matrix from src0
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]);
cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
}
} else
#endif
if (node->src[1]->type != vec_dot_type) {
cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type];
cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
} else {
cur = 0;
}
@ -18306,8 +18288,8 @@ enum ggml_opt_result ggml_opt_resume(
struct ggml_tensor * f) {
// build forward + backward compute graphs
struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;

6
ggml.h
View file

@ -1856,6 +1856,10 @@ extern "C" {
typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
typedef struct {
const char * type_name;
int blck_size;
size_t type_size;
bool is_quantized;
ggml_to_float_t to_float;
ggml_from_float_t from_float;
ggml_from_float_t from_float_reference;
@ -1863,7 +1867,7 @@ extern "C" {
enum ggml_type vec_dot_type;
} ggml_type_traits_t;
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
#ifdef __cplusplus
}

62
gguf.py
View file

@ -5,7 +5,7 @@ import tempfile
import numpy as np
from enum import IntEnum, auto
from typing import Any, IO, List
from typing import Any, IO, List, Optional
#
# constants
@ -45,7 +45,7 @@ KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
# RoPE
KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
KEY_ROPE_SCALE = "{arch}.rope.scale"
KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear"
# tokenization
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
@ -61,6 +61,7 @@ KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
#
# recommended mapping of model tensor names for storage in gguf
#
@ -319,6 +320,15 @@ def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
return tensor_map
class TokenType(IntEnum):
NORMAL = 1
UNKNOWN = 2
CONTROL = 3
USER_DEFINED = 4
UNUSED = 5
BYTE = 6
#
# implementation
#
@ -327,6 +337,18 @@ def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
class GGMLQuantizationType(IntEnum):
F32 = 0
F16 = 1
Q4_0 = 2
Q4_1 = 3
Q5_0 = 6
Q5_1 = 7
Q8_0 = 8
Q8_1 = 9
Q2_K = 10
Q3_K = 11
Q4_K = 12
Q5_K = 13
Q6_K = 14
Q8_K = 15
class GGUFValueType(IntEnum):
@ -359,7 +381,7 @@ class GGUFValueType(IntEnum):
class GGUFWriter:
def __init__(self, path: str, arch: str):
def __init__(self, path: str, arch: str, use_temp_file = True):
self.fout = open(path, "wb")
self.arch = arch
self.offset_tensor = 0
@ -369,6 +391,8 @@ class GGUFWriter:
self.ti_data = b""
self.ti_data_count = 0
self.add_architecture()
self.use_temp_file = use_temp_file
self.tensors = []
def write_header_to_file(self):
self.fout.write(struct.pack("<I", GGUF_MAGIC))
@ -476,8 +500,8 @@ class GGUFWriter:
def ggml_pad(x: int, n: int) -> int:
return ((x + n - 1) // n) * n
def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int):
assert tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
encoded_name = name.encode("utf8")
self.ti_data += struct.pack("<I", len(encoded_name))
@ -486,23 +510,30 @@ class GGUFWriter:
self.ti_data += struct.pack("<I", n_dims)
for i in range(n_dims):
self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
if raw_dtype is None:
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
else:
dtype = raw_dtype
self.ti_data += struct.pack("<I", dtype)
self.ti_data += struct.pack("<Q", self.offset_tensor)
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
self.ti_data_count += 1
def add_tensor(self, name: str, tensor: np.ndarray):
if not hasattr(self, "temp_file"):
def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
if self.use_temp_file and not hasattr(self, "temp_file"):
self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
self.temp_file.seek(0)
self.add_tensor_info(name, tensor.shape, tensor.dtype, tensor.nbytes)
self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
if not self.use_temp_file:
self.tensors.append((tensor, pad))
return
tensor.tofile(self.temp_file)
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
if pad != 0:
self.temp_file.write(bytes([0] * pad))
@ -524,6 +555,13 @@ class GGUFWriter:
if pad != 0:
self.fout.write(bytes([0] * pad))
if not self.use_temp_file:
for (currtensor, currpad) in self.tensors:
currtensor.tofile(self.fout)
if currpad != 0:
self.fout.write(bytes([0] * currpad))
return
self.temp_file.seek(0)
shutil.copyfileobj(self.temp_file, self.fout)
@ -620,8 +658,8 @@ class GGUFWriter:
self.add_uint32(
KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)
def add_rope_scale(self, value: float):
self.add_float32(KEY_ROPE_SCALE.format(arch=self.arch), value)
def add_rope_scale_linear(self, value: float):
self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)
def add_tokenizer_model(self, model: str):
self.add_string(KEY_TOKENIZER_MODEL, model)

View file

@ -771,11 +771,12 @@ struct llama_vocab {
using id = int32_t;
using token = std::string;
using ttype = llama_token_type;
struct token_data {
token tok;
token text;
float score;
int toktype;
ttype type;
};
llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
@ -1436,6 +1437,14 @@ static void llama_model_load_internal(
hparams.n_head_kv = hparams.n_head;
GGUF_GET(hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "llama.attention.head_count_kv");
// TODO: manually setting rope scale should override this
// rope_freq_scale (inverse of the kv) is optional
float ropescale = 1.0f;
GGUF_GET(ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, "llama.rope.scale_linear");
if (ropescale != 1.0f) {
rope_freq_scale = 1.0f/ropescale;
}
// get general kv
GGUF_GET(general_name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name");
GGUF_GET(general_arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture");
@ -1513,12 +1522,12 @@ static void llama_model_load_internal(
vocab.token_to_id[word] = i;
auto & token_data = vocab.id_to_token[i];
token_data.tok = std::move(word);
token_data.text = std::move(word);
token_data.score = scores[i];
token_data.toktype = toktypes[i];
token_data.type = (llama_token_type) toktypes[i];
// determine the newline token: 0x0A == 10 == '\n'
if (token_data.tok == "<0x0A>") {
if (token_data.text == "<0x0A>") {
vocab.linefeed_id = i;
}
}
@ -1550,12 +1559,12 @@ static void llama_model_load_internal(
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, general_name.c_str());
// special tokens
if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].tok.c_str() ); }
if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].tok.c_str() ); }
if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].tok.c_str() ); }
if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].tok.c_str() ); }
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].tok.c_str() ); }
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].tok.c_str() ); }
if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
}
if (vocab_only) {
@ -2347,15 +2356,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
}
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 1;
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
}
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 2;
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
}
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 3;
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
}
static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
}
static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
}
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
}
static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
@ -2373,22 +2394,10 @@ static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
return id == vocab.special_pad_id;
}
static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 4;
}
static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 5;
}
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 6;
}
static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(llama_is_byte_token(vocab, id));
const auto& token_data = vocab.id_to_token.at(id);
auto buf = token_data.tok.substr(3, 2);
auto buf = token_data.text.substr(3, 2);
return strtol(buf.c_str(), NULL, 16);
}
@ -2701,6 +2710,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
bool found = false;
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
do {
@ -4949,25 +4959,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embedding.data();
}
int llama_get_vocab(
const struct llama_context * ctx,
const char * * strings,
float * scores,
int capacity) {
return llama_model_get_vocab(&ctx->model, strings, scores, capacity);
const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) {
return ctx->model.vocab.id_to_token[token].text.c_str();
}
int llama_model_get_vocab(
const struct llama_model * model,
const char * * strings,
float * scores,
int capacity) {
int n = std::min(capacity, (int) model->vocab.id_to_token.size());
for (int i = 0; i<n; ++i) {
strings[i] = model->vocab.id_to_token[i].tok.c_str();
scores[i] = model->vocab.id_to_token[i].score;
float llama_token_get_score(const struct llama_context * ctx, llama_token token) {
return ctx->model.vocab.id_to_token[token].score;
}
return n;
llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) {
return ctx->model.vocab.id_to_token[token].type;
}
llama_token llama_token_bos(const struct llama_context * ctx) {
@ -5038,7 +5039,7 @@ int llama_token_to_str(const struct llama_context * ctx, llama_token token, char
int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token, char * buf, int length) {
if (0 <= token && token < llama_model_n_vocab(&ctx->model)) {
std::string result = ctx->model.vocab.id_to_token[token].tok;
std::string result = ctx->model.vocab.id_to_token[token].text;
if (length < (int) result.length()) {
return -result.length();
}
@ -5052,7 +5053,7 @@ int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token,
int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
if (0 <= token && token < llama_model_n_vocab(model)) {
if (llama_is_normal_token(model->vocab, token)) {
std::string result = model->vocab.id_to_token[token].tok;
std::string result = model->vocab.id_to_token[token].text;
if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
result = llama_unescape_whitespace(result);
}

26
llama.h
View file

@ -72,6 +72,16 @@ extern "C" {
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
};
enum llama_token_type {
LLAMA_TOKEN_TYPE_UNDEFINED = 0,
LLAMA_TOKEN_TYPE_NORMAL = 1,
LLAMA_TOKEN_TYPE_UNKNOWN = 2,
LLAMA_TOKEN_TYPE_CONTROL = 3,
LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
LLAMA_TOKEN_TYPE_UNUSED = 5,
LLAMA_TOKEN_TYPE_BYTE = 6,
};
// model file types
enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0,
@ -330,19 +340,11 @@ extern "C" {
// Vocab
//
// Get the vocabulary as output parameters.
// Returns number of results.
LLAMA_API int llama_get_vocab(
const struct llama_context * ctx,
const char * * strings,
float * scores,
int capacity);
LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
LLAMA_API int llama_model_get_vocab(
const struct llama_model * model,
const char * * strings,
float * scores,
int capacity);
LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
LLAMA_API llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
// Special tokens
LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence