Merge branch 'gguf' of https://github.com/goerch/llama.cpp into gguf

goerch 2023-08-21 18:48:23 +02:00
commit a856685648
17 changed files with 765 additions and 309 deletions


@ -9,13 +9,13 @@
Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

-**Hot topics:**
-- Simple web chat example: https://github.com/ggerganov/llama.cpp/pull/1998
-- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
-- New roadmap: https://github.com/users/ggerganov/projects/7
-- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
-- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
+### 🚧 Incoming breaking change + refactoring:
+See PR https://github.com/ggerganov/llama.cpp/pull/2398 for more info.
+To devs: avoid making big changes to `llama.h` / `llama.cpp` until merged
+----

<details>
  <summary>Table of Contents</summary>

@ -99,6 +99,7 @@ as the main playground for developing new features for the [ggml](https://github
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
+- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)

**UI:**


@ -250,7 +250,7 @@ for part_name in part_names:
sys.exit()

n_dims = len(data.shape)
data_dtype = data.dtype

# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:


@ -118,6 +118,11 @@ gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])

+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
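The `rope_scaling` check mirrors the structure that HuggingFace `config.json` files use for linear RoPE scaling; a minimal sketch of the kind of input it is looking for (the concrete values here are hypothetical):

```python
# Hypothetical hparams excerpt, shaped like a HuggingFace config.json.
# Only a "linear" rope_scaling entry with a "factor" field is exported to GGUF;
# anything else falls through silently.
hparams = {
    "rms_norm_eps": 1e-05,
    "rope_scaling": {"type": "linear", "factor": 2.0},
}

if "rope_scaling" in hparams and hparams["rope_scaling"] is not None and "factor" in hparams["rope_scaling"]:
    if hparams["rope_scaling"].get("type") == "linear":
        print("rope scale linear:", hparams["rope_scaling"]["factor"])  # -> 2.0
```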
# TOKENIZATION

@ -147,9 +152,7 @@ if Path(dir_model + "/tokenizer.model").is_file():
    if tokenizer.is_control(i):
        toktype = 3
-   # TODO: How to determinate if a token is user defined?
-   # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-   # if tokenizer.is_user_defined(i): toktype = 4
+   # toktype = 4 is user-defined = tokens from added_tokens.json
    if tokenizer.is_unused(i):
        toktype = 5

@ -160,6 +163,17 @@ if Path(dir_model + "/tokenizer.model").is_file():
    scores.append(score)
    toktypes.append(toktype)

+if Path(dir_model + "/added_tokens.json").is_file():
+    with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+        addtokens_json = json.load(f)
+        print("gguf: get added tokens")
+        for key in addtokens_json:
+            tokens.append( key.encode("utf-8") )
+            scores.append(-1000.0)
+            toktypes.append(4) # user-defined token type

gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)


@ -0,0 +1,334 @@
import sys, struct, math, argparse
from pathlib import Path
import numpy as np
import gguf
# Note: Does not support GGML_QKK_64
QK_K = 256
# Items here are (block size, type size)
GGML_QUANT_SIZES = {
gguf.GGMLQuantizationType.F32 : (1, 4),
gguf.GGMLQuantizationType.F16 : (1, 2),
gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
}
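These (block size, type size) pairs are all the converter needs to turn an element count into a byte count; a small sketch of that arithmetic, using the Q4_0 entry from the table above:

```python
# Q4_0 packs 32 weights into 2 + 16 = 18 bytes (one fp16 scale + 32 4-bit nibbles),
# i.e. the (32, 18) entry in GGML_QUANT_SIZES above.
blksize, tysize = 32, 18

n_elems = 4096 * 4096                    # e.g. a single 4096 x 4096 weight matrix
n_bytes = (n_elems * tysize) // blksize  # same formula used in Tensor.load below
print(n_bytes)                           # 9437184 bytes = 9 MiB
```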
class Hyperparameters:
def __init__(self):
self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
self.n_ff = 0
def set_n_ff(self, model):
ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
ff_tensor = model.tensors[ff_tensor_idx]
self.n_ff = ff_tensor.dims[1]
def load(self, data, offset):
(
self.n_vocab,
self.n_embd,
self.n_mult,
self.n_head,
self.n_layer,
self.n_rot,
self.ftype,
) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
return 4 * 7
def __str__(self):
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
class Vocab:
def __init__(self):
self.items = []
def load(self, data, offset, n_vocab):
orig_offset = offset
for _ in range(n_vocab):
itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
assert itemlen < 4096, 'Absurd vocab item length'
offset += 4
vocab = bytes(data[offset:offset + itemlen])
offset += itemlen
score = struct.unpack('<f', data[offset:offset + 4])[0]
offset += 4
self.items.append((vocab, score))
return offset - orig_offset
class Tensor:
def __init__(self):
self.name = None
self.dims = ()
self.dtype = None
self.start_offset = 0
self.len_bytes = 0
def load(self, data, offset):
orig_offset = offset
(n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
assert name_len < 4096, 'Absurd tensor name length'
quant = GGML_QUANT_SIZES.get(dtype)
assert quant is not None, 'Unknown tensor type'
(blksize, tysize) = quant
offset += 12
self.dtype= dtype
self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
offset += 4 * n_dims
self.name = bytes(data[offset:offset + name_len])
offset += name_len
pad = ((offset + 31) & ~31) - offset
offset += pad
n_elems = np.prod(self.dims)
n_bytes = (n_elems * tysize) // blksize
self.start_offset = offset
self.len_bytes = n_bytes
offset += n_bytes
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
return offset - orig_offset
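Tensor data in GGJTv3 files is 32-byte aligned, which is what the `pad` expression above accounts for; a couple of spot checks of that rounding:

```python
# ((offset + 31) & ~31) rounds offset up to the next multiple of 32,
# so pad is the number of bytes to skip before the tensor data starts.
for offset in (0, 1, 31, 32, 100):
    pad = ((offset + 31) & ~31) - offset
    assert (offset + pad) % 32 == 0
    print(offset, "->", pad)   # 0->0, 1->31, 31->1, 32->0, 100->28
```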
class GGMLV3Model:
def __init__(self):
self.hyperparameters = None
self.vocab = None
self.tensor_map = {}
self.tensors = []
def validate_header(self, data, offset):
if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
raise ValueError('Only GGJTv3 supported')
return 8
def load(self, data, offset):
offset += self.validate_header(data, offset)
hp = Hyperparameters()
offset += hp.load(data, offset)
vocab = Vocab()
offset += vocab.load(data, offset, hp.n_vocab)
tensors = []
tensor_map = {}
while offset < len(data):
tensor = Tensor()
offset += tensor.load(data, offset)
tensor_map[tensor.name] = len(tensors)
tensors.append(tensor)
self.hyperparameters = hp
self.vocab = vocab
self.tensors = tensors
self.tensor_map = tensor_map
hp.set_n_ff(self)
return offset
class GGMLToGGUF:
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None):
hp = ggml_model.hyperparameters
self.model = ggml_model
self.data = data
self.cfg = cfg
self.params_override = params_override
self.vocab_override = vocab_override
if params_override is not None:
n_kv_head = params_override.n_head_kv
else:
if cfg.gqa == 1:
n_kv_head = hp.n_head
else:
gqa = float(cfg.gqa)
n_kv_head = None
for x in range(1, 256):
if float(hp.n_head) / float(x) == gqa:
n_kv_head = x
assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
self.n_kv_head = n_kv_head
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
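GGMLv3 files do not record the number of KV heads, so without `--model-metadata-dir` the converter has to back it out of the user-supplied `--gqa` factor; the brute-force search above can be exercised on its own like this (a standalone sketch, not part of the script):

```python
def guess_n_kv_head(n_head: int, gqa: int) -> int:
    # Mirror of the loop above: find x such that n_head / x == gqa.
    if gqa == 1:
        return n_head
    for x in range(1, 256):
        if float(n_head) / float(x) == float(gqa):
            return x
    raise ValueError("Couldn't determine n_kv_head from GQA param")

print(guess_n_kv_head(64, 8))   # LLaMA-2 70B: 64 heads with --gqa 8 -> 8 KV heads
print(guess_n_kv_head(32, 1))   # MHA models: n_kv_head == n_head
```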
def save(self):
print('* Preparing to save GGUF file')
gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
self.add_params(gguf_writer)
self.add_vocab(gguf_writer)
self.add_tensors(gguf_writer)
print(" gguf: write header")
gguf_writer.write_header_to_file()
print(" gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print(" gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
def add_params(self, gguf_writer):
hp = self.model.hyperparameters
cfg = self.cfg
desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
try:
# Filenames aren't necessarily valid UTF8.
name = cfg.name if cfg.name is not None else cfg.input.name
except UnicodeDecodeError:
name = None
print('* Adding model parameters and KV items')
if name is not None:
gguf_writer.add_name(name)
gguf_writer.add_description(desc)
if self.params_override is not None:
po = self.params_override
assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
gguf_writer.add_context_length (po.n_ctx)
gguf_writer.add_embedding_length (po.n_embd)
gguf_writer.add_block_count (po.n_layer)
gguf_writer.add_feed_forward_length (po.n_ff)
gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
gguf_writer.add_head_count (po.n_head)
gguf_writer.add_head_count_kv (po.n_head_kv)
gguf_writer.add_layer_norm_rms_eps (po.f_norm_eps)
return
gguf_writer.add_context_length(cfg.context_length)
gguf_writer.add_embedding_length(hp.n_embd)
gguf_writer.add_block_count(hp.n_layer)
gguf_writer.add_feed_forward_length(hp.n_ff)
gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
gguf_writer.add_head_count(hp.n_head)
gguf_writer.add_head_count_kv(self.n_kv_head)
gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
def add_vocab(self, gguf_writer):
hp = self.model.hyperparameters
gguf_writer.add_tokenizer_model('llama')
tokens = []
scores = []
toktypes = []
if self.vocab_override is not None:
vo = self.vocab_override
print('* Adding vocab item(s)')
for (idx, vitem) in enumerate(vo.all_tokens()):
if len(vitem) == 3:
tokens.append(vitem[0])
scores.append(vitem[1])
toktypes.append(vitem[2])
else:
# Maybe try to guess the token type here?
tokens.append(vitem[0])
scores.append(vitem[1])
assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
if len(toktypes) > 0:
gguf_writer.add_token_types(toktypes)
return
print(f'* Adding {hp.n_vocab} vocab item(s)')
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
tt = 1 # Normal
if len(vbytes) == 0:
tt = 3 # Control
elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
hv = hex(vbytes[0])[2:].upper()
vbytes = bytes(f'<0x{hv}>', encoding = 'UTF-8')
tt = 6 # Byte
else:
vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
toktypes.append(tt)
tokens.append(vbytes)
scores.append(vscore)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
def add_tensors(self, gguf_writer):
nm = self.name_map
data = self.data
print(f'* Adding {len(self.model.tensors)} tensor(s)')
for tensor in self.model.tensors:
name = str(tensor.name, 'UTF-8')
if name.endswith('.weight'):
name = name[:-7]
suffix = '.weight'
elif name.endswith('.bias'):
name = name[:-5]
suffix = '.bias'
mapped_name = nm.get(name)
assert mapped_name is not None, f'Bad name {name}'
mapped_name += suffix
tempdims = list(tensor.dims[:])
if len(tempdims) > 1:
temp = tempdims[1]
tempdims[1] = tempdims[0]
tempdims[0] = temp
# print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
def handle_metadata(cfg, hp):
import convert
assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
hf_config_path = cfg.model_metadata_dir / "config.json"
orig_config_path = cfg.model_metadata_dir / "params.json"
# We pass a fake model here. "original" mode will check the shapes of some
# tensors if information is missing in the .json file: other than that, the
# model data isn't used so this should be safe (at least for now).
fakemodel = {
'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
}
fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
if hf_config_path.exists():
params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
elif orig_config_path.exists():
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
else:
raise ValueError('Unable to load metadata')
vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
convert.check_vocab_size(params, vocab)
return (params, vocab)
def handle_args():
parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
parser.add_argument('--input', '-i', type = Path, help = 'Input GGMLv3 filename')
parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename')
parser.add_argument('--name', help = 'Set model name')
parser.add_argument('--desc', help = 'Set model description')
parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
return parser.parse_args()
def main():
cfg = handle_args()
print(f'* Using config: {cfg}')
print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
data = np.memmap(cfg.input, mode = 'r')
model = GGMLV3Model()
print('* Scanning GGML input file')
offset = model.load(data, 0)
print(f'* GGML model hyperparameters: {model.hyperparameters}')
vocab_override = None
params_override = None
if cfg.model_metadata_dir is not None:
(params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters)
print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
print(f'* Overriding params: {params_override}')
print(f'* Overriding vocab: {vocab_override}')
else:
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override)
converter.save()
print(f'* Successful completion. Output saved to: {cfg.output}')
main()
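A hedged example of driving the converter from Python instead of a shell, using only the flags defined in `handle_args()` above (the script and model file names are placeholders, not taken from this commit):

```python
import subprocess, sys

subprocess.run([
    sys.executable, "convert-llama-ggmlv3-to-gguf.py",   # assumed script name
    "--input",          "llama-2-70b.ggmlv3.q4_0.bin",   # placeholder input file
    "--output",         "llama-2-70b.q4_0.gguf",         # placeholder output file
    "--gqa",            "8",      # grouped-query attention factor for 70B models
    "--eps",            "1e-5",   # RMS norm eps recommended for LLaMA2
    "--context-length", "4096",
], check=True)
```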


@ -126,6 +126,11 @@ gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])

+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])

# TOKENIZATION

@ -155,9 +160,7 @@ if Path(dir_model + "/tokenizer.model").is_file():
    if tokenizer.is_control(i):
        toktype = 3
-   # TODO: How to determinate if a token is user defined?
-   # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-   # if tokenizer.is_user_defined(i): toktype = 4
+   # toktype = 4 is user-defined = tokens from added_tokens.json
    if tokenizer.is_unused(i):
        toktype = 5

@ -168,6 +171,18 @@ if Path(dir_model + "/tokenizer.model").is_file():
    scores.append(score)
    toktypes.append(toktype)

+if Path(dir_model + "/added_tokens.json").is_file():
+    with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+        addtokens_json = json.load(f)
+        print("gguf: get added tokens")
+        for key in addtokens_json:
+            tokens.append( key.encode("utf-8") )
+            scores.append(-1000.0)
+            toktypes.append(4) # user-defined token type

gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
@ -264,7 +279,9 @@ for part_name in part_names:
    data = data.squeeze().numpy()

    # reverse permute these
-   if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
+   if name.endswith(".q_proj.weight"):
+       data = reverse_hf_permute(data, head_count)
+   if name.endswith(".k_proj.weight"):
        data = reverse_hf_permute(data, head_count, head_count_kv)

    # map tensor names
@ -277,7 +294,7 @@ for part_name in part_names:
sys.exit()

n_dims = len(data.shape)
data_dtype = data.dtype

# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:

convert.py · 38 changed lines · Executable file → Normal file

@ -241,17 +241,19 @@ class BpeVocab:
        added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
    else:
        added_tokens = {}
    vocab_size: int = len(self.bpe_tokenizer)
    expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
    actual_ids = sorted(added_tokens.values())
    if expected_ids != actual_ids:
        raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
    items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
    self.added_tokens_list = [text for (text, idx) in items]
    self.vocab_size_base: int = vocab_size
    self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
    self.fname_tokenizer = fname_tokenizer
    self.fname_added_tokens = fname_added_tokens

def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
    tokenizer = self.bpe_tokenizer
tokenizer = self.bpe_tokenizer tokenizer = self.bpe_tokenizer
@ -261,12 +263,12 @@ class BpeVocab:
    for i, item in enumerate(tokenizer):
        text: bytes = item.encode("utf-8")
        score: float = -i
-       yield text, score, 4
+       yield text, score, gguf.TokenType.USER_DEFINED

def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
    for text in self.added_tokens_list:
        score = -1000.0
-       yield text.encode("utf-8"), score, 4
+       yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
    yield from self.bpe_tokens()
@ -304,27 +306,27 @@ class SentencePieceVocab:
        text: bytes = piece.encode("utf-8")
        score: float = tokenizer.get_score(i)

-       toktype = 1 # defualt to normal token type
+       toktype = gguf.TokenType.NORMAL
        if tokenizer.is_unknown(i):
-           toktype = 2
+           toktype = gguf.TokenType.UNKNOWN
        if tokenizer.is_control(i):
-           toktype = 3
+           toktype = gguf.TokenType.CONTROL

        # NOTE: I think added_tokens are user defined.
        # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-       # if tokenizer.is_user_defined(i): toktype = 4
+       # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED

        if tokenizer.is_unused(i):
-           toktype = 5
+           toktype = gguf.TokenType.UNUSED
        if tokenizer.is_byte(i):
-           toktype = 6
+           toktype = gguf.TokenType.BYTE

        yield text, score, toktype

def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
    for text in self.added_tokens_list:
        score = -1000.0
-       yield text.encode("utf-8"), score, 4
+       yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
    yield from self.sentencepiece_tokens()
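The numeric literals replaced in this hunk correspond one-to-one to the `gguf.TokenType` names; the mapping, reconstructed from the substitutions above, is:

```python
from enum import IntEnum

class TokenType(IntEnum):   # numeric value -> gguf.TokenType name, per this hunk
    NORMAL       = 1
    UNKNOWN      = 2
    CONTROL      = 3
    USER_DEFINED = 4        # e.g. entries loaded from added_tokens.json
    UNUSED       = 5
    BYTE         = 6
```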
@ -342,6 +344,7 @@ Vocab = Union[BpeVocab, SentencePieceVocab]
#

def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
+   #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
    if n_head_kv is not None and n_head != n_head_kv:
        n_head //= n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
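The hunk cuts `permute()` off right after the reshape; a NumPy sketch of the full interleaved-head permutation follows, where the trailing `swapaxes`/`reshape` back to the original shape is an assumption since it is not visible in the diff:

```python
import numpy as np

def permute_sketch(weights: np.ndarray, n_head: int, n_head_kv: int) -> np.ndarray:
    # For GQA checkpoints the K projection only has n_head_kv head groups, hence the
    # division; Q keeps the full head count, which is why the convert_model_names hunk
    # further down now calls it with (n_head, n_head) instead of (n_head, n_head_kv).
    if n_head_kv is not None and n_head != n_head_kv:
        n_head //= n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)                 # assumed continuation of the truncated return
                   .reshape(weights.shape))

w = np.arange(64 * 16, dtype=np.float32).reshape(64, 16)
print(permute_sketch(w, 8, 8).shape)   # (64, 16): shape preserved, rows reordered per head
```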
@ -724,6 +727,7 @@ class OutputFile:
        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

    def add_meta_arch(self, params: Params) -> None:
+       self.gguf.add_name                ("llama")
        self.gguf.add_context_length      (params.n_ctx)
        self.gguf.add_embedding_length    (params.n_embd)
        self.gguf.add_block_count         (params.n_layer)
@ -836,12 +840,12 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
    for i in itertools.count():
        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
            print(f"Permuting layer {i}")
-           tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head_kv)
+           tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
            #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
        elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
            print(f"Unpacking and permuting layer {i}")
-           tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head_kv)
+           tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
            tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy      (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
        else:


@ -139,14 +139,16 @@ void print_sample_weights(TransformerWeights *w){
struct llama_vocab {
    using id    = int32_t;
    using token = std::string;
+   using ttype = llama_token_type;

-   struct token_score {
-       token tok;
+   struct token_data {
+       token text;
        float score;
+       ttype type;
    };

    std::unordered_map<token, id> token_to_id;
-   std::vector<token_score> id_to_token;
+   std::vector<token_data> id_to_token;
};

struct my_llama_hparams {
@ -516,36 +518,30 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
        struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
        struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);

-       std::vector<const char *> strings;
-       std::vector<float> scores;
-       int n_vocab = llama_n_vocab(lctx);
-       strings.resize(n_vocab, NULL);
-       scores.resize(n_vocab, 0);
-       n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-       GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+       const int n_vocab = llama_n_vocab(lctx);

        vocab->id_to_token.resize(n_vocab);
        for (int i=0; i<n_vocab; ++i) {
-           std::string tok = std::string(strings[i]);
-           float score = scores[i];
-           vocab->id_to_token[i].tok = tok;
-           vocab->id_to_token[i].score = score;
-           vocab->token_to_id.emplace(tok, i);
+           vocab->id_to_token[i].text  = llama_token_get_text(lctx, i);
+           vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
+           vocab->id_to_token[i].type  = llama_token_get_type(lctx, i);
+           vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
        }
        llama_free(lctx);
        llama_free_model(lmodel);
    } else { // assume llama2.c vocabulary
        printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
        llama_file file(filename, "rb");
-       uint32_t n_vocab = config->vocab_size;
+       const int n_vocab = config->vocab_size;
        /* uint32_t max_token_length = */ file.read_u32(); // unused
        vocab->id_to_token.resize(n_vocab);
-       for (uint32_t i=0; i<n_vocab; ++i) {
+       for (int i=0; i<n_vocab; ++i) {
            float_t score = file.read_f32();
            uint32_t len = file.read_u32();
-           std::string tok = file.read_string(len);
-           vocab->id_to_token[i].tok = tok;
+           std::string text = file.read_string(len);
+           vocab->id_to_token[i].text = text;
            vocab->id_to_token[i].score = score;
-           vocab->token_to_id.emplace(tok, i);
+           vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
+           vocab->token_to_id.emplace(text, i);
        }
    }
}
@ -611,10 +607,10 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
    // // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
    // uint32_t n_vocab = model->hparams.n_vocab;
    // for (uint32_t i = 0; i < n_vocab; i++) {
-   //     const auto & token_score = vocab->id_to_token.at(i);
-   //     file.write_u32((uint32_t) token_score.tok.size());
-   //     file.write_raw(token_score.tok.data(), token_score.tok.size());
-   //     file.write_raw(&token_score.score, sizeof(token_score.score));
+   //     const auto & token_data = vocab->id_to_token.at(i);
+   //     file.write_u32((uint32_t) token_data.tok.size());
+   //     file.write_raw(token_data.tok.data(), token_data.tok.size());
+   //     file.write_raw(&token_data.score, sizeof(token_data.score));
    // }
    //
    // // stuff AK weights into GG weights one by one.


@ -5,6 +5,7 @@
#include <cmath>
#include <ctime>
#include <sstream>
+#include <cstring>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data

@ -121,6 +122,27 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
    printf("\n");
}
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
int n_vocab, int n_thread) {
std::vector<float> result;
result.reserve(tokens.size() * n_vocab);
size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
size_t n_tokens = tokens.size() - i_chunk * n_batch;
n_tokens = std::min(n_tokens, size_t(n_batch));
if (llama_eval(ctx, tokens.data() + i_chunk * n_batch, n_tokens, n_past, n_thread)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return {};
}
const auto logits = llama_get_logits(ctx);
result.insert(result.end(), logits, logits + n_tokens * n_vocab);
n_past += n_tokens;
}
return result;
}
void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    // Calculates hellaswag score (acc_norm) from prompt
    //
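`hellaswag_evaluate_tokens()` above feeds the query in `n_batch`-sized chunks and concatenates the per-token logits; each ending is then scored by its mean token log-probability and the highest-scoring ending is taken as the prediction (acc_norm). A compact NumPy sketch of that scoring step, with names and shapes assumed:

```python
import numpy as np

def softmax(x: np.ndarray) -> np.ndarray:
    e = np.exp(x - x.max())
    return e / e.sum()

def mean_ending_logprob(logits: np.ndarray, tokens: np.ndarray, start: int) -> float:
    # logits: (n_tokens, n_vocab) rows, as gathered chunk by chunk above;
    # averages log P(tokens[j+1] | tokens[..j]) over the ending tokens.
    lps = [np.log(softmax(logits[j])[tokens[j + 1]]) for j in range(start, len(tokens) - 1)]
    return float(np.mean(lps))

# acc_norm then picks the argmax over the four endings, e.g.:
# best = int(np.argmax([mean_ending_logprob(lg, tk, ctx_len - 1) for lg, tk in endings]))
```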
@ -209,50 +231,93 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
double acc = 0.0f; double acc = 0.0f;
const int n_vocab = llama_n_vocab(ctx); const int n_vocab = llama_n_vocab(ctx);
std::vector<float> tok_logits(n_vocab);
for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) { for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
// Tokenize the context to count tokens // Tokenize the context to count tokens
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos); std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
size_t context_size = context_embd.size(); size_t context_size = context_embd.size();
for (size_t ending_idx=0;ending_idx<4;ending_idx++) { // Do the 1st ending
// In this case we include the context when evaluating
auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], prepend_bos);
auto query_size = query_embd.size();
//printf("First query: %d\n",(int)query_size);
// Stop if query wont fit the ctx window
if (query_size > (size_t)params.n_ctx) {
fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
return;
}
// Speedup small evaluations by evaluating atleast 32 tokens
if (query_size < 32) {
query_embd.resize(32);
}
auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads);
if (logits.empty()) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
std::memcpy(tok_logits.data(), logits.data() + (context_size-1)*n_vocab, n_vocab*sizeof(float));
const auto first_probs = softmax(tok_logits);
hs_data[task_idx].ending_logprob_count[0] = 1;
hs_data[task_idx].ending_logprob[0] = std::log(first_probs[query_embd[context_size]]);
// Calculate the logprobs over the ending
for (size_t j = context_size; j < query_size - 1; j++) {
std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
const float prob = softmax(tok_logits)[query_embd[j + 1]];
hs_data[task_idx].ending_logprob[0] += std::log(prob);
hs_data[task_idx].ending_logprob_count[0]++;
}
// Calculate the mean token logprob for acc_norm
hs_data[task_idx].ending_logprob[0] /= hs_data[task_idx].ending_logprob_count[0];
// Do the remaining endings
// For these, we use the bare ending with n_past = context_size
//
for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
// Tokenize the query // Tokenize the query
std::vector<int> query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[ending_idx], prepend_bos); query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
size_t query_size = query_embd.size(); query_size = query_embd.size();
// Stop if query wont fit the ctx window // Stop if query wont fit the ctx window
if (query_size > (size_t)params.n_ctx) { if (context_size + query_size > (size_t)params.n_ctx) {
fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size); fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
return; return;
} }
// Speedup small evaluations by evaluating atleast 32 tokens // Speedup small evaluations by evaluating atleast 32 tokens
if (query_size < 32) { // No, resizing to 32 is actually slightly slower (at least on CUDA)
query_embd.resize(32); //if (query_size < 32) {
} // query_embd.resize(32);
//}
// Evaluate the query // Evaluate the query
if (llama_eval(ctx, query_embd.data(), query_embd.size(), 0, params.n_threads)) { logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab, params.n_threads);
if (logits.empty()) {
fprintf(stderr, "%s : failed to eval\n", __func__); fprintf(stderr, "%s : failed to eval\n", __func__);
return; return;
} }
const auto query_logits = llama_get_logits(ctx); hs_data[task_idx].ending_logprob_count[ending_idx] = 1;
std::vector<float> logits; hs_data[task_idx].ending_logprob[ending_idx] = std::log(first_probs[query_embd[0]]);
logits.insert(logits.end(), query_logits, query_logits + query_size * n_vocab);
hs_data[task_idx].ending_logprob_count[ending_idx] = 0;
hs_data[task_idx].ending_logprob[ending_idx] = 0.0f;
// Calculate the logprobs over the ending // Calculate the logprobs over the ending
for (size_t j = context_size-1; j < query_size - 1; j++) { for (size_t j = 0; j < query_size - 1; j++) {
// Calculate probability of next token, given the previous ones. std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
const std::vector<float> tok_logits(
logits.begin() + (j + 0) * n_vocab,
logits.begin() + (j + 1) * n_vocab);
const float prob = softmax(tok_logits)[query_embd[ j + 1]]; const float prob = softmax(tok_logits)[query_embd[j + 1]];
hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob); hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob);
hs_data[task_idx].ending_logprob_count[ending_idx]++; hs_data[task_idx].ending_logprob_count[ending_idx]++;
@ -267,9 +332,9 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
} }
// Find the ending with maximum logprob // Find the ending with maximum logprob
size_t ending_logprob_max_idx = -1; size_t ending_logprob_max_idx = 0;
double ending_logprob_max_val = -INFINITY; double ending_logprob_max_val = hs_data[task_idx].ending_logprob[0];
for (size_t j=0; j < 4; j++) { for (size_t j = 1; j < 4; j++) {
if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) { if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
ending_logprob_max_idx = j; ending_logprob_max_idx = j;
ending_logprob_max_val = hs_data[task_idx].ending_logprob[j]; ending_logprob_max_val = hs_data[task_idx].ending_logprob[j];


@ -11,8 +11,10 @@ echo >> $PUBLIC/index.js # add newline
FILES=$(ls $PUBLIC)

+cd $PUBLIC

for FILE in $FILES; do
-    func=$(echo $FILE | tr '.' '_')
-    echo "generate $FILE.hpp ($func)"
-    xxd -n $func -i $PUBLIC/$FILE > $DIR/$FILE.hpp
+    echo "generate $FILE.hpp"
+
+    # use simple flag for old version of xxd
+    xxd -i $FILE > $DIR/$FILE.hpp
done
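The loop embeds every file under `$PUBLIC` into a C header with `xxd -i`, relying on xxd's default symbol naming (non-identifier characters replaced by underscores) now that the `-n` flag is avoided for older xxd builds; a rough Python sketch of what one generated header contains:

```python
def xxd_i_sketch(name: str, data: bytes) -> str:
    # Approximation of `xxd -i FILE`: the symbol is derived from the bare file name
    # (non-identifier characters become '_'), which is why the script now cd's into
    # $PUBLIC instead of naming the symbol explicitly with -n.
    sym  = "".join(c if c.isalnum() else "_" for c in name)
    body = ", ".join(f"0x{b:02x}" for b in data)
    return (f"unsigned char {sym}[] = {{{body}}};\n"
            f"unsigned int {sym}_len = {len(data)};")

print(xxd_i_sketch("index.html", b"<html></html>"))   # hypothetical input
# unsigned char index_html[] = {0x3c, 0x68, ...};
# unsigned int index_html_len = 13;
```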


@ -144,12 +144,12 @@
import { SchemaConverter } from '/json-schema-to-grammar.mjs';

const session = signal({
-   prompt: "This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.",
+   prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
    template: "{{prompt}}\n\n{{history}}\n{{char}}:",
    historyTemplate: "{{name}}: {{message}}",
    transcript: [],
    type: "chat",
-   char: "llama",
+   char: "Llama",
    user: "User",
})


@ -170,14 +170,16 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc
struct llama_vocab {
    using id    = int32_t;
    using token = std::string;
+   using ttype = llama_token_type;

-   struct token_score {
-       token tok;
+   struct token_data {
+       token text;
        float score;
+       ttype type;
    };

    std::unordered_map<token, id> token_to_id;
-   std::vector<token_score> id_to_token;
+   std::vector<token_data> id_to_token;
};

struct my_llama_hparams {
@ -2629,10 +2631,10 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
    // // write_vocab
    // uint32_t n_vocab = model->hparams.n_vocab;
    // for (uint32_t i = 0; i < n_vocab; i++) {
-   //     const auto & token_score = vocab->id_to_token.at(i);
-   //     file.write_u32((uint32_t) token_score.tok.size());
-   //     file.write_raw(token_score.tok.data(), token_score.tok.size());
-   //     file.write_raw(&token_score.score, sizeof(token_score.score));
+   //     const auto & token_data = vocab->id_to_token.at(i);
+   //     file.write_u32((uint32_t) token_data.tok.size());
+   //     file.write_raw(token_data.tok.data(), token_data.tok.size());
+   //     file.write_raw(&token_data.score, sizeof(token_data.score));
    // }
    // // write tensors
    // write_tensor(&file, model->tok_embeddings);
@ -3055,20 +3057,13 @@ int main(int argc, char ** argv) {
    struct llama_vocab vocab;
    {
-       std::vector<const char *> strings;
-       std::vector<float> scores;
-       int n_vocab = llama_n_vocab(lctx);
-       strings.resize(n_vocab, NULL);
-       scores.resize(n_vocab, 0);
-       n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-       GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+       const int n_vocab = llama_n_vocab(lctx);

        vocab.id_to_token.resize(n_vocab);
        for (int i=0; i<n_vocab; ++i) {
-           std::string tok = std::string(strings[i]);
-           float score = scores[i];
-           vocab.id_to_token[i].tok = tok;
-           vocab.id_to_token[i].score = score;
-           vocab.token_to_id.emplace(tok, i);
+           vocab.id_to_token[i].text  = llama_token_get_text(lctx, i);
+           vocab.id_to_token[i].score = llama_token_get_score(lctx, i);
+           vocab.id_to_token[i].type  = llama_token_get_type(lctx, i);
+           vocab.token_to_id.emplace(vocab.id_to_token[i].text, i);
        }
    }


@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
    threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
                                  + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
    for (int i = 0; i < 8; i++) {
+       threadgroup_barrier(mem_flags::mem_device);
        simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
    }

-   threadgroup_barrier(mem_flags::mem_threadgroup);
+   threadgroup_barrier(mem_flags::mem_device);

    device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
    if (sgitg==0) {
        for (int i = 0; i < n_rows; i++) {

ggml.c · 244 changed lines

@ -1643,11 +1643,37 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
[GGML_TYPE_I8] = {
.type_name = "i8",
.blck_size = 1,
.type_size = sizeof(int8_t),
.is_quantized = false,
},
[GGML_TYPE_I16] = {
.type_name = "i16",
.blck_size = 1,
.type_size = sizeof(int16_t),
.is_quantized = false,
},
[GGML_TYPE_I32] = {
.type_name = "i32",
.blck_size = 1,
.type_size = sizeof(int32_t),
.is_quantized = false,
},
[GGML_TYPE_F32] = { [GGML_TYPE_F32] = {
.type_name = "f32",
.blck_size = 1,
.type_size = sizeof(float),
.is_quantized = false,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32, .vec_dot_type = GGML_TYPE_F32,
}, },
[GGML_TYPE_F16] = { [GGML_TYPE_F16] = {
.type_name = "f16",
.blck_size = 1,
.type_size = sizeof(ggml_fp16_t),
.is_quantized = false,
.to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row, .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
@ -1655,6 +1681,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_F16, .vec_dot_type = GGML_TYPE_F16,
}, },
[GGML_TYPE_Q4_0] = { [GGML_TYPE_Q4_0] = {
.type_name = "q4_0",
.blck_size = QK4_0,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_0, .to_float = (ggml_to_float_t) dequantize_row_q4_0,
.from_float = quantize_row_q4_0, .from_float = quantize_row_q4_0,
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
@ -1662,6 +1692,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_0, .vec_dot_type = GGML_TYPE_Q8_0,
}, },
[GGML_TYPE_Q4_1] = { [GGML_TYPE_Q4_1] = {
.type_name = "q4_1",
.blck_size = QK4_1,
.type_size = sizeof(block_q4_1),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_1, .to_float = (ggml_to_float_t) dequantize_row_q4_1,
.from_float = quantize_row_q4_1, .from_float = quantize_row_q4_1,
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
@ -1669,6 +1703,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_1, .vec_dot_type = GGML_TYPE_Q8_1,
}, },
[GGML_TYPE_Q5_0] = { [GGML_TYPE_Q5_0] = {
.type_name = "q5_0",
.blck_size = QK5_0,
.type_size = sizeof(block_q5_0),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_0, .to_float = (ggml_to_float_t) dequantize_row_q5_0,
.from_float = quantize_row_q5_0, .from_float = quantize_row_q5_0,
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
@ -1676,6 +1714,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_0, .vec_dot_type = GGML_TYPE_Q8_0,
}, },
[GGML_TYPE_Q5_1] = { [GGML_TYPE_Q5_1] = {
.type_name = "q5_1",
.blck_size = QK5_1,
.type_size = sizeof(block_q5_1),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_1, .to_float = (ggml_to_float_t) dequantize_row_q5_1,
.from_float = quantize_row_q5_1, .from_float = quantize_row_q5_1,
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
@ -1683,6 +1725,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_1, .vec_dot_type = GGML_TYPE_Q8_1,
}, },
[GGML_TYPE_Q8_0] = { [GGML_TYPE_Q8_0] = {
.type_name = "q8_0",
.blck_size = QK8_0,
.type_size = sizeof(block_q8_0),
.is_quantized = true,
.to_float = dequantize_row_q8_0, .to_float = dequantize_row_q8_0,
.from_float = quantize_row_q8_0, .from_float = quantize_row_q8_0,
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
@ -1690,12 +1736,20 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_0, .vec_dot_type = GGML_TYPE_Q8_0,
}, },
[GGML_TYPE_Q8_1] = { [GGML_TYPE_Q8_1] = {
.type_name = "q8_1",
.blck_size = QK8_1,
.type_size = sizeof(block_q8_1),
.is_quantized = true,
.from_float = quantize_row_q8_1, .from_float = quantize_row_q8_1,
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
.vec_dot_type = GGML_TYPE_Q8_1, .vec_dot_type = GGML_TYPE_Q8_1,
}, },
#ifdef GGML_USE_K_QUANTS #ifdef GGML_USE_K_QUANTS
[GGML_TYPE_Q2_K] = { [GGML_TYPE_Q2_K] = {
.type_name = "q2_K",
.blck_size = QK_K,
.type_size = sizeof(block_q2_K),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q2_K, .to_float = (ggml_to_float_t) dequantize_row_q2_K,
.from_float = quantize_row_q2_K, .from_float = quantize_row_q2_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
@ -1703,6 +1757,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
}, },
[GGML_TYPE_Q3_K] = { [GGML_TYPE_Q3_K] = {
.type_name = "q3_K",
.blck_size = QK_K,
.type_size = sizeof(block_q3_K),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q3_K, .to_float = (ggml_to_float_t) dequantize_row_q3_K,
.from_float = quantize_row_q3_K, .from_float = quantize_row_q3_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
@ -1710,6 +1768,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
}, },
[GGML_TYPE_Q4_K] = { [GGML_TYPE_Q4_K] = {
.type_name = "q4_K",
.blck_size = QK_K,
.type_size = sizeof(block_q4_K),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_K, .to_float = (ggml_to_float_t) dequantize_row_q4_K,
.from_float = quantize_row_q4_K, .from_float = quantize_row_q4_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
@ -1717,6 +1779,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
}, },
[GGML_TYPE_Q5_K] = { [GGML_TYPE_Q5_K] = {
.type_name = "q5_K",
.blck_size = QK_K,
.type_size = sizeof(block_q5_K),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_K, .to_float = (ggml_to_float_t) dequantize_row_q5_K,
.from_float = quantize_row_q5_K, .from_float = quantize_row_q5_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
@ -1724,6 +1790,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
}, },
[GGML_TYPE_Q6_K] = { [GGML_TYPE_Q6_K] = {
.type_name = "q6_K",
.blck_size = QK_K,
.type_size = sizeof(block_q6_K),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q6_K, .to_float = (ggml_to_float_t) dequantize_row_q6_K,
.from_float = quantize_row_q6_K, .from_float = quantize_row_q6_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
@ -1731,15 +1801,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
}, },
[GGML_TYPE_Q8_K] = { [GGML_TYPE_Q8_K] = {
.type_name = "q8_K",
.blck_size = QK_K,
.type_size = sizeof(block_q8_K),
.is_quantized = true,
.from_float = quantize_row_q8_K, .from_float = quantize_row_q8_K,
} }
#endif #endif
}; };
// For internal test use
-ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) {
-    GGML_ASSERT(i < GGML_TYPE_COUNT);
-    return type_traits[i];
+ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
+    GGML_ASSERT(type < GGML_TYPE_COUNT);
+    return type_traits[type];
}
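This is part of collapsing the separate `GGML_BLCK_SIZE`, `GGML_TYPE_SIZE`, `GGML_TYPE_NAME` and `GGML_IS_QUANTIZED` tables (removed further down) into the single `type_traits` array; a Python sketch of the same consolidation pattern, with a few entries copied from the table above:

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class TypeTraits:
    type_name: str
    blck_size: int
    type_size: int
    is_quantized: bool

TYPE_TRAITS = {                       # values mirror the C table above (QK4_0 = 32, block_q4_0 = 18 bytes)
    "f32":  TypeTraits("f32",  1,  4,  False),
    "f16":  TypeTraits("f16",  1,  2,  False),
    "q4_0": TypeTraits("q4_0", 32, 18, True),
}

def type_sizef(name: str) -> float:
    # same idea as ggml_type_sizef(): average bytes per element over one block
    return TYPE_TRAITS[name].type_size / TYPE_TRAITS[name].blck_size

print(type_sizef("q4_0"))   # 0.5625 bytes per weight
```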
@ -3648,98 +3722,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
    *s = idx;
}
//
// data types
//
static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = 1,
[GGML_TYPE_F16] = 1,
[GGML_TYPE_Q4_0] = QK4_0,
[GGML_TYPE_Q4_1] = QK4_1,
[GGML_TYPE_Q5_0] = QK5_0,
[GGML_TYPE_Q5_1] = QK5_1,
[GGML_TYPE_Q8_0] = QK8_0,
[GGML_TYPE_Q8_1] = QK8_1,
#ifdef GGML_USE_K_QUANTS
[GGML_TYPE_Q2_K] = QK_K,
[GGML_TYPE_Q3_K] = QK_K,
[GGML_TYPE_Q4_K] = QK_K,
[GGML_TYPE_Q5_K] = QK_K,
[GGML_TYPE_Q6_K] = QK_K,
[GGML_TYPE_Q8_K] = QK_K,
#endif
[GGML_TYPE_I8] = 1,
[GGML_TYPE_I16] = 1,
[GGML_TYPE_I32] = 1,
};
static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = sizeof(float),
[GGML_TYPE_F16] = sizeof(ggml_fp16_t),
[GGML_TYPE_Q4_0] = sizeof(block_q4_0),
[GGML_TYPE_Q4_1] = sizeof(block_q4_1),
[GGML_TYPE_Q5_0] = sizeof(block_q5_0),
[GGML_TYPE_Q5_1] = sizeof(block_q5_1),
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
[GGML_TYPE_Q8_1] = sizeof(block_q8_1),
#ifdef GGML_USE_K_QUANTS
[GGML_TYPE_Q2_K] = sizeof(block_q2_K),
[GGML_TYPE_Q3_K] = sizeof(block_q3_K),
[GGML_TYPE_Q4_K] = sizeof(block_q4_K),
[GGML_TYPE_Q5_K] = sizeof(block_q5_K),
[GGML_TYPE_Q6_K] = sizeof(block_q6_K),
[GGML_TYPE_Q8_K] = sizeof(block_q8_K),
#endif
[GGML_TYPE_I8] = sizeof(int8_t),
[GGML_TYPE_I16] = sizeof(int16_t),
[GGML_TYPE_I32] = sizeof(int32_t),
};
static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = "f32",
[GGML_TYPE_F16] = "f16",
[GGML_TYPE_Q4_0] = "q4_0",
[GGML_TYPE_Q4_1] = "q4_1",
[GGML_TYPE_Q5_0] = "q5_0",
[GGML_TYPE_Q5_1] = "q5_1",
[GGML_TYPE_Q8_0] = "q8_0",
[GGML_TYPE_Q8_1] = "q8_1",
[GGML_TYPE_Q2_K] = "q2_K",
[GGML_TYPE_Q3_K] = "q3_K",
[GGML_TYPE_Q4_K] = "q4_K",
[GGML_TYPE_Q5_K] = "q5_K",
[GGML_TYPE_Q6_K] = "q6_K",
[GGML_TYPE_Q8_K] = "q8_K",
[GGML_TYPE_I8] = "i8",
[GGML_TYPE_I16] = "i16",
[GGML_TYPE_I32] = "i32",
};
static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = false,
[GGML_TYPE_F16] = false,
[GGML_TYPE_Q4_0] = true,
[GGML_TYPE_Q4_1] = true,
[GGML_TYPE_Q5_0] = true,
[GGML_TYPE_Q5_1] = true,
[GGML_TYPE_Q8_0] = true,
[GGML_TYPE_Q8_1] = true,
[GGML_TYPE_Q2_K] = true,
[GGML_TYPE_Q3_K] = true,
[GGML_TYPE_Q4_K] = true,
[GGML_TYPE_Q5_K] = true,
[GGML_TYPE_Q6_K] = true,
[GGML_TYPE_Q8_K] = true,
[GGML_TYPE_I8] = false,
[GGML_TYPE_I16] = false,
[GGML_TYPE_I32] = false,
};
static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "NONE",

@ -4109,7 +4091,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
    //
    // is enough, but just in case, adding the second part

-   return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+   return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
}

size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {

@ -4119,23 +4101,27 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

-   return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+   return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
}

int ggml_blck_size(enum ggml_type type) {
-   return GGML_BLCK_SIZE[type];
+   return type_traits[type].blck_size;
}

size_t ggml_type_size(enum ggml_type type) {
-   return GGML_TYPE_SIZE[type];
+   return type_traits[type].type_size;
}

float ggml_type_sizef(enum ggml_type type) {
-   return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
+   return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
}

const char * ggml_type_name(enum ggml_type type) {
-   return GGML_TYPE_NAME[type];
+   return type_traits[type].type_name;
}

+bool ggml_is_quantized(enum ggml_type type) {
+   return type_traits[type].is_quantized;
+}
const char * ggml_op_name(enum ggml_op op) { const char * ggml_op_name(enum ggml_op op) {
@ -4147,7 +4133,7 @@ const char * ggml_op_symbol(enum ggml_op op) {
} }
size_t ggml_element_size(const struct ggml_tensor * tensor) { size_t ggml_element_size(const struct ggml_tensor * tensor) {
return GGML_TYPE_SIZE[tensor->type]; return ggml_type_size(tensor->type);
} }
static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) { static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
@ -4185,10 +4171,6 @@ static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct
(t0->ne[3] == t1->ne[3]); (t0->ne[3] == t1->ne[3]);
} }
bool ggml_is_quantized(enum ggml_type type) {
return GGML_IS_QUANTIZED[type];
}
enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
enum ggml_type wtype = GGML_TYPE_COUNT; enum ggml_type wtype = GGML_TYPE_COUNT;
@ -4226,8 +4208,8 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return return
tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] && tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] && tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
} }
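The stride checks in these hunks divide the row stride by the block size, so for quantized types `nb[1]` is the byte size of a whole row of blocks rather than of `ne[0]` scalar elements. A small worked sketch of that math, assuming Q4_0's usual block layout of 32 elements per 18-byte block (verify the block constants against the ggml version you build with):

```cpp
// Worked example of the stride math used by ggml_is_contiguous()/ggml_new_tensor_impl():
// nb[0] = type_size, nb[1] = nb[0]*ne[0]/blck_size, nb[i] = nb[i-1]*ne[i-1] for i >= 2.
// Numbers assume Q4_0 (32 elements per 18-byte block); the tensor shape is hypothetical.
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
    const size_t  type_size = 18;  // bytes per Q4_0 block
    const int     blck_size = 32;  // elements per Q4_0 block
    const int64_t ne[4] = { 4096, 32, 1, 1 };  // hypothetical 4096 x 32 weight matrix

    size_t nb[4];
    nb[0] = type_size;
    nb[1] = nb[0] * ne[0] / blck_size;   // 18 * 4096 / 32 = 2304 bytes per row
    nb[2] = nb[1] * ne[1];
    nb[3] = nb[2] * ne[2];

    printf("row stride nb[1] = %zu bytes, tensor bytes = %zu\n",
           nb[1], nb[3] * (size_t) ne[3]);
    return 0;
}
```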
@ -4236,7 +4218,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return return
tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] && tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
} }
@ -4251,7 +4233,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return return
tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] && tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
} }
@ -4570,7 +4552,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
size_t data_size = 0; size_t data_size = 0;
if (data == NULL && !ctx->no_alloc) { if (data == NULL && !ctx->no_alloc) {
data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]); data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
for (int i = 1; i < n_dims; i++) { for (int i = 1; i < n_dims; i++) {
data_size *= ne[i]; data_size *= ne[i];
} }
@ -4625,8 +4607,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
result->ne[i] = ne[i]; result->ne[i] = ne[i];
} }
result->nb[0] = GGML_TYPE_SIZE[type]; result->nb[0] = ggml_type_size(type);
result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]); result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
for (int i = 2; i < GGML_MAX_DIMS; i++) { for (int i = 2; i < GGML_MAX_DIMS; i++) {
result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
} }
@ -7748,7 +7730,7 @@ static void ggml_compute_forward_dup_same_cont(
memcpy( memcpy(
((char *) dst->data + ie0*nb0), ((char *) dst->data + ie0*nb0),
((char *) src0->data + ie0*nb00), ((char *) src0->data + ie0*nb00),
(ie1 - ie0) * GGML_TYPE_SIZE[src0->type]); (ie1 - ie0) * ggml_type_size(src0->type));
} }
} }
@ -7782,7 +7764,7 @@ static void ggml_compute_forward_dup_f16(
if (src0->type == dst->type && if (src0->type == dst->type &&
ne00 == ne0 && ne00 == ne0 &&
nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) { nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
// copy by rows // copy by rows
const size_t rs = ne00*nb00; const size_t rs = ne00*nb00;
for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i03 = 0; i03 < ne03; i03++) {
@ -7840,7 +7822,7 @@ static void ggml_compute_forward_dup_f16(
float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
size_t id = 0; size_t id = 0;
size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
char * dst_ptr = (char *) dst->data; char * dst_ptr = (char *) dst->data;
for (int i03 = 0; i03 < ne03; i03++) { for (int i03 = 0; i03 < ne03; i03++) {
@ -8053,7 +8035,7 @@ static void ggml_compute_forward_dup_f32(
if (src0->type == dst->type && if (src0->type == dst->type &&
ne00 == ne0 && ne00 == ne0 &&
nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) { nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
// copy by rows // copy by rows
const size_t rs = ne00*nb00; const size_t rs = ne00*nb00;
for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i03 = 0; i03 < ne03; i03++) {
@ -8092,7 +8074,7 @@ static void ggml_compute_forward_dup_f32(
ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
size_t id = 0; size_t id = 0;
size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
char * dst_ptr = (char *) dst->data; char * dst_ptr = (char *) dst->data;
for (int i03 = 0; i03 < ne03; i03++) { for (int i03 = 0; i03 < ne03; i03++) {
@ -8504,7 +8486,7 @@ static void ggml_compute_forward_add_q_f32(
ggml_from_float_t const quantize_row_q = type_traits[type].from_float; ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
// we don't support permuted src0 or src1 // we don't support permuted src0 or src1
GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb10 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float));
// dst cannot be transposed or permuted // dst cannot be transposed or permuted
@ -8778,7 +8760,7 @@ static void ggml_compute_forward_add1_q_f32(
ggml_from_float_t const quantize_row_q = type_traits[type].from_float; ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
// we don't support permuted src0 // we don't support permuted src0
GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); GGML_ASSERT(nb00 == ggml_type_size(type));
// dst cannot be transposed or permuted // dst cannot be transposed or permuted
GGML_ASSERT(nb0 <= nb1); GGML_ASSERT(nb0 <= nb1);
@ -10634,7 +10616,7 @@ static void ggml_compute_forward_mul_mat(
GGML_ASSERT(ne3 == ne13); GGML_ASSERT(ne3 == ne13);
// we don't support permuted src0 or src1 // we don't support permuted src0 or src1
GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb10 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float));
// dst cannot be transposed or permuted // dst cannot be transposed or permuted
@ -10717,7 +10699,7 @@ static void ggml_compute_forward_mul_mat(
if (params->type == GGML_TASK_INIT) { if (params->type == GGML_TASK_INIT) {
if (src1->type != vec_dot_type) { if (src1->type != vec_dot_type) {
char * wdata = params->wdata; char * wdata = params->wdata;
const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i12 = 0; i12 < ne12; ++i12) {
@ -10737,7 +10719,7 @@ static void ggml_compute_forward_mul_mat(
} }
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
const int64_t nr0 = ne01; // src0 rows const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = ne11*ne12*ne13; // src1 rows const int64_t nr1 = ne11*ne12*ne13; // src1 rows
@ -11210,7 +11192,7 @@ static void ggml_compute_forward_get_rows_q(
assert( dst->ne[0] == nc); assert( dst->ne[0] == nc);
assert( dst->ne[1] == nr); assert( dst->ne[1] == nr);
assert(src0->nb[0] == GGML_TYPE_SIZE[type]); assert(src0->nb[0] == ggml_type_size(type));
for (int i = 0; i < nr; ++i) { for (int i = 0; i < nr; ++i) {
const int r = ((int32_t *) src1->data)[i]; const int r = ((int32_t *) src1->data)[i];
@ -16387,7 +16369,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
size_t cur = 0; size_t cur = 0;
if (ggml_is_quantized(node->type)) { if (ggml_is_quantized(node->type)) {
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks; cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
} }
work_size = MAX(work_size, cur); work_size = MAX(work_size, cur);
@ -16400,7 +16382,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
size_t cur = 0; size_t cur = 0;
if (ggml_is_quantized(node->src[0]->type)) { if (ggml_is_quantized(node->src[0]->type)) {
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks; cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
} }
work_size = MAX(work_size, cur); work_size = MAX(work_size, cur);
@ -16412,7 +16394,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
size_t cur = 0; size_t cur = 0;
if (ggml_is_quantized(node->src[0]->type)) { if (ggml_is_quantized(node->src[0]->type)) {
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks; cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
} }
work_size = MAX(work_size, cur); work_size = MAX(work_size, cur);
@ -16495,12 +16477,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
// the threads are still spinning // the threads are still spinning
if (node->src[0]->type != GGML_TYPE_F32) { if (node->src[0]->type != GGML_TYPE_F32) {
// here we need memory just for single 2D matrix from src0 // here we need memory just for single 2D matrix from src0
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]); cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
} }
} else } else
#endif #endif
if (node->src[1]->type != vec_dot_type) { if (node->src[1]->type != vec_dot_type) {
cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type]; cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
} else { } else {
cur = 0; cur = 0;
} }
@ -18306,8 +18288,8 @@ enum ggml_opt_result ggml_opt_resume(
struct ggml_tensor * f) { struct ggml_tensor * f) {
// build forward + backward compute graphs // build forward + backward compute graphs
struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0)); struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0)); struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
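With the lookup arrays gone, code outside `ggml.c` queries type metadata through the accessor functions shown above. A short sketch of the typical call pattern, assuming `ggml.h` is on the include path and the accessor signatures stay as in this diff:

```cpp
// Sketch: compute the byte size of one row of a tensor type via the public
// accessors that replace the GGML_TYPE_SIZE/GGML_BLCK_SIZE arrays.
#include <cstdint>
#include <cstdio>
#include "ggml.h"

static size_t row_size_bytes(enum ggml_type type, int64_t ne0) {
    // Same formula the diff uses for ggml_nbytes()/row_size:
    // elements * type_size / blck_size
    return (size_t)(ne0 * ggml_type_size(type)) / ggml_blck_size(type);
}

int main() {
    const enum ggml_type t = GGML_TYPE_Q4_0;
    printf("%s: quantized=%d, %zu bytes for a 4096-element row\n",
           ggml_type_name(t), ggml_is_quantized(t) ? 1 : 0, row_size_bytes(t, 4096));
    return 0;
}
```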

ggml.h

@ -1856,6 +1856,10 @@ extern "C" {
typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
typedef struct { typedef struct {
const char * type_name;
int blck_size;
size_t type_size;
bool is_quantized;
ggml_to_float_t to_float; ggml_to_float_t to_float;
ggml_from_float_t from_float; ggml_from_float_t from_float;
ggml_from_float_t from_float_reference; ggml_from_float_t from_float_reference;
@ -1863,7 +1867,7 @@ extern "C" {
enum ggml_type vec_dot_type; enum ggml_type vec_dot_type;
} ggml_type_traits_t; } ggml_type_traits_t;
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i); ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
#ifdef __cplusplus #ifdef __cplusplus
} }
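`ggml_type_traits_t` now carries the block size, type size, name, and quantized flag next to the existing conversion callbacks, so a single `ggml_internal_get_type_traits()` call is enough to quantize and dequantize a block. A hedged sketch of such a round trip; the `(src, dst, count)` callback signatures are assumed from this version of `ggml.h`, so verify them against the header you actually build with:

```cpp
// Sketch: quantize one block of floats and dequantize it back using the
// extended ggml_type_traits_t. Assumes the (src, dst, n) callback signatures
// of this version of ggml.h.
#include <cstdio>
#include <vector>
#include "ggml.h"

int main() {
    const ggml_type_traits_t tr = ggml_internal_get_type_traits(GGML_TYPE_Q8_0);

    const int n = tr.blck_size;                    // quantize whole blocks only
    std::vector<float>         src(n), back(n);
    std::vector<unsigned char> quant(tr.type_size); // one block of Q8_0 data

    for (int i = 0; i < n; ++i) {
        src[i] = 0.01f * (float) i;
    }

    tr.from_float(src.data(), quant.data(), n);    // float32 -> Q8_0
    tr.to_float(quant.data(), back.data(), n);     // Q8_0    -> float32

    printf("%s (%d elems/block, %zu bytes/block): src[5]=%.4f back[5]=%.4f\n",
           tr.type_name, tr.blck_size, tr.type_size, src[5], back[5]);
    return 0;
}
```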

gguf.py

@ -5,7 +5,7 @@ import tempfile
import numpy as np import numpy as np
from enum import IntEnum, auto from enum import IntEnum, auto
from typing import Any, IO, List from typing import Any, IO, List, Optional
# #
# constants # constants
@ -45,7 +45,7 @@ KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
# RoPE # RoPE
KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count" KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
KEY_ROPE_SCALE = "{arch}.rope.scale" KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear"
# tokenization # tokenization
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model" KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
@ -61,6 +61,7 @@ KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json" KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world" KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
# #
# recommended mapping of model tensor names for storage in gguf # recommended mapping of model tensor names for storage in gguf
# #
@ -319,14 +320,35 @@ def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
return tensor_map return tensor_map
class TokenType(IntEnum):
NORMAL = 1
UNKNOWN = 2
CONTROL = 3
USER_DEFINED = 4
UNUSED = 5
BYTE = 6
# #
# implementation # implementation
# #
class GGMLQuantizationType(IntEnum): class GGMLQuantizationType(IntEnum):
F32 = 0 F32 = 0
F16 = 1 F16 = 1
Q4_0 = 2
Q4_1 = 3
Q5_0 = 6
Q5_1 = 7
Q8_0 = 8
Q8_1 = 9
Q2_K = 10
Q3_K = 11
Q4_K = 12
Q5_K = 13
Q6_K = 14
Q8_K = 15
class GGUFValueType(IntEnum): class GGUFValueType(IntEnum):
@ -359,7 +381,7 @@ class GGUFValueType(IntEnum):
class GGUFWriter: class GGUFWriter:
def __init__(self, path: str, arch: str): def __init__(self, path: str, arch: str, use_temp_file = True):
self.fout = open(path, "wb") self.fout = open(path, "wb")
self.arch = arch self.arch = arch
self.offset_tensor = 0 self.offset_tensor = 0
@ -369,6 +391,8 @@ class GGUFWriter:
self.ti_data = b"" self.ti_data = b""
self.ti_data_count = 0 self.ti_data_count = 0
self.add_architecture() self.add_architecture()
self.use_temp_file = use_temp_file
self.tensors = []
def write_header_to_file(self): def write_header_to_file(self):
self.fout.write(struct.pack("<I", GGUF_MAGIC)) self.fout.write(struct.pack("<I", GGUF_MAGIC))
@ -476,8 +500,8 @@ class GGUFWriter:
def ggml_pad(x: int, n: int) -> int: def ggml_pad(x: int, n: int) -> int:
return ((x + n - 1) // n) * n return ((x + n - 1) // n) * n
def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int): def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
assert tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now" assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
encoded_name = name.encode("utf8") encoded_name = name.encode("utf8")
self.ti_data += struct.pack("<I", len(encoded_name)) self.ti_data += struct.pack("<I", len(encoded_name))
@ -486,23 +510,30 @@ class GGUFWriter:
self.ti_data += struct.pack("<I", n_dims) self.ti_data += struct.pack("<I", n_dims)
for i in range(n_dims): for i in range(n_dims):
self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i]) self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
if raw_dtype is None:
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16 dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
else:
dtype = raw_dtype
self.ti_data += struct.pack("<I", dtype) self.ti_data += struct.pack("<I", dtype)
self.ti_data += struct.pack("<Q", self.offset_tensor) self.ti_data += struct.pack("<Q", self.offset_tensor)
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment) self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
self.ti_data_count += 1 self.ti_data_count += 1
def add_tensor(self, name: str, tensor: np.ndarray): def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
if not hasattr(self, "temp_file"): if self.use_temp_file and not hasattr(self, "temp_file"):
self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024) self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
self.temp_file.seek(0) self.temp_file.seek(0)
self.add_tensor_info(name, tensor.shape, tensor.dtype, tensor.nbytes) self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
if not self.use_temp_file:
self.tensors.append((tensor, pad))
return
tensor.tofile(self.temp_file) tensor.tofile(self.temp_file)
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
if pad != 0: if pad != 0:
self.temp_file.write(bytes([0] * pad)) self.temp_file.write(bytes([0] * pad))
@ -524,6 +555,13 @@ class GGUFWriter:
if pad != 0: if pad != 0:
self.fout.write(bytes([0] * pad)) self.fout.write(bytes([0] * pad))
if not self.use_temp_file:
for (currtensor, currpad) in self.tensors:
currtensor.tofile(self.fout)
if currpad != 0:
self.fout.write(bytes([0] * currpad))
return
self.temp_file.seek(0) self.temp_file.seek(0)
shutil.copyfileobj(self.temp_file, self.fout) shutil.copyfileobj(self.temp_file, self.fout)
@ -620,8 +658,8 @@ class GGUFWriter:
self.add_uint32( self.add_uint32(
KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count) KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)
def add_rope_scale(self, value: float): def add_rope_scale_linear(self, value: float):
self.add_float32(KEY_ROPE_SCALE.format(arch=self.arch), value) self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)
def add_tokenizer_model(self, model: str): def add_tokenizer_model(self, model: str):
self.add_string(KEY_TOKENIZER_MODEL, model) self.add_string(KEY_TOKENIZER_MODEL, model)

llama.cpp

@ -771,11 +771,12 @@ struct llama_vocab {
using id = int32_t; using id = int32_t;
using token = std::string; using token = std::string;
using ttype = llama_token_type;
struct token_data { struct token_data {
token tok; token text;
float score; float score;
int toktype; ttype type;
}; };
llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
@ -1436,6 +1437,14 @@ static void llama_model_load_internal(
hparams.n_head_kv = hparams.n_head; hparams.n_head_kv = hparams.n_head;
GGUF_GET(hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "llama.attention.head_count_kv"); GGUF_GET(hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "llama.attention.head_count_kv");
// TODO: manually setting rope scale should override this
// rope_freq_scale (inverse of the kv) is optional
float ropescale = 1.0f;
GGUF_GET(ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, "llama.rope.scale_linear");
if (ropescale != 1.0f) {
rope_freq_scale = 1.0f/ropescale;
}
// get general kv // get general kv
GGUF_GET(general_name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name"); GGUF_GET(general_name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name");
GGUF_GET(general_arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture"); GGUF_GET(general_arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture");
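The added block converts the Hugging Face `rope_scaling.factor` stored under the new `llama.rope.scale_linear` key into a frequency scale by taking its reciprocal. A tiny illustrative sketch of that relationship (the factor value is made up):

```cpp
// Sketch of the conversion performed above: a linear RoPE scaling factor of F
// becomes rope_freq_scale = 1/F (e.g. factor 4.0 -> freq scale 0.25).
#include <cstdio>

int main() {
    const float rope_scale_linear = 4.0f;  // value read from "llama.rope.scale_linear" (illustrative)

    float rope_freq_scale = 1.0f;
    if (rope_scale_linear != 1.0f) {
        rope_freq_scale = 1.0f / rope_scale_linear;
    }

    printf("factor %.1f -> rope_freq_scale %.2f\n", rope_scale_linear, rope_freq_scale);
    return 0;
}
```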
@ -1513,12 +1522,12 @@ static void llama_model_load_internal(
vocab.token_to_id[word] = i; vocab.token_to_id[word] = i;
auto & token_data = vocab.id_to_token[i]; auto & token_data = vocab.id_to_token[i];
token_data.tok = std::move(word); token_data.text = std::move(word);
token_data.score = scores[i]; token_data.score = scores[i];
token_data.toktype = toktypes[i]; token_data.type = (llama_token_type) toktypes[i];
// determine the newline token: 0x0A == 10 == '\n' // determine the newline token: 0x0A == 10 == '\n'
if (token_data.tok == "<0x0A>") { if (token_data.text == "<0x0A>") {
vocab.linefeed_id = i; vocab.linefeed_id = i;
} }
} }
@ -1550,12 +1559,12 @@ static void llama_model_load_internal(
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, general_name.c_str()); LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, general_name.c_str());
// special tokens // special tokens
if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].tok.c_str() ); } if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].tok.c_str() ); } if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].tok.c_str() ); } if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].tok.c_str() ); } if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].tok.c_str() ); } if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].tok.c_str() ); } if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
} }
if (vocab_only) { if (vocab_only) {
@ -2347,15 +2356,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
} }
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) { static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 1; return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
} }
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) { static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 2; return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
} }
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) { static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 3; return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
}
static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
}
static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
}
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
} }
static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) { static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
@ -2373,22 +2394,10 @@ static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
return id == vocab.special_pad_id; return id == vocab.special_pad_id;
} }
static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 4;
}
static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 5;
}
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 6;
}
static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) { static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(llama_is_byte_token(vocab, id)); GGML_ASSERT(llama_is_byte_token(vocab, id));
const auto& token_data = vocab.id_to_token.at(id); const auto& token_data = vocab.id_to_token.at(id);
auto buf = token_data.tok.substr(3, 2); auto buf = token_data.text.substr(3, 2);
return strtol(buf.c_str(), NULL, 16); return strtol(buf.c_str(), NULL, 16);
} }
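`llama_token_to_byte()` relies on byte tokens having the SentencePiece spelling `<0xNN>`, so the two hex digits always start at offset 3. A standalone sketch of the same parse with a concrete token:

```cpp
// Standalone sketch of the byte-token parse above: "<0xNN>" -> byte value.
// substr(3, 2) skips "<0x" and takes the two hex digits, which strtol decodes.
#include <cstdio>
#include <cstdlib>
#include <string>

int main() {
    const std::string text = "<0x0A>";            // linefeed byte token
    const std::string hex  = text.substr(3, 2);   // "0A"
    const int byte_val = (int) strtol(hex.c_str(), nullptr, 16);
    printf("%s -> %d ('\\n' is 10)\n", text.c_str(), byte_val);
    return 0;
}
```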
@ -2701,6 +2710,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
bool found = false; bool found = false;
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR; bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
do { do {
@ -4949,25 +4959,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embedding.data(); return ctx->embedding.data();
} }
int llama_get_vocab( const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) {
const struct llama_context * ctx, return ctx->model.vocab.id_to_token[token].text.c_str();
const char * * strings,
float * scores,
int capacity) {
return llama_model_get_vocab(&ctx->model, strings, scores, capacity);
} }
int llama_model_get_vocab( float llama_token_get_score(const struct llama_context * ctx, llama_token token) {
const struct llama_model * model, return ctx->model.vocab.id_to_token[token].score;
const char * * strings, }
float * scores,
int capacity) { llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) {
int n = std::min(capacity, (int) model->vocab.id_to_token.size()); return ctx->model.vocab.id_to_token[token].type;
for (int i = 0; i<n; ++i) {
strings[i] = model->vocab.id_to_token[i].tok.c_str();
scores[i] = model->vocab.id_to_token[i].score;
}
return n;
} }
llama_token llama_token_bos(const struct llama_context * ctx) { llama_token llama_token_bos(const struct llama_context * ctx) {
@ -5038,7 +5039,7 @@ int llama_token_to_str(const struct llama_context * ctx, llama_token token, char
int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token, char * buf, int length) { int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token, char * buf, int length) {
if (0 <= token && token < llama_model_n_vocab(&ctx->model)) { if (0 <= token && token < llama_model_n_vocab(&ctx->model)) {
std::string result = ctx->model.vocab.id_to_token[token].tok; std::string result = ctx->model.vocab.id_to_token[token].text;
if (length < (int) result.length()) { if (length < (int) result.length()) {
return -result.length(); return -result.length();
} }
@ -5052,7 +5053,7 @@ int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token,
int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) { int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
if (0 <= token && token < llama_model_n_vocab(model)) { if (0 <= token && token < llama_model_n_vocab(model)) {
if (llama_is_normal_token(model->vocab, token)) { if (llama_is_normal_token(model->vocab, token)) {
std::string result = model->vocab.id_to_token[token].tok; std::string result = model->vocab.id_to_token[token].text;
if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) { if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
result = llama_unescape_whitespace(result); result = llama_unescape_whitespace(result);
} }
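With `llama_get_vocab()` and `llama_model_get_vocab()` removed, callers read the vocabulary one token at a time through the three new accessors. A hedged sketch of a vocabulary dump against the API declared in `llama.h` below; it assumes an already initialized `llama_context` and that `llama_n_vocab()` is available, as in this version of the header:

```cpp
// Sketch: dump the vocabulary with the new per-token accessors that replace
// llama_get_vocab(). Assumes an initialized llama_context.
#include <cstdio>
#include "llama.h"

static void dump_vocab(const struct llama_context * ctx) {
    const int n_vocab = llama_n_vocab(ctx);
    for (llama_token id = 0; id < n_vocab; ++id) {
        printf("%6d  type=%d  score=%8.3f  text='%s'\n",
               id,
               (int) llama_token_get_type(ctx, id),
               llama_token_get_score(ctx, id),
               llama_token_get_text(ctx, id));
    }
}
```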

llama.h

@ -72,6 +72,16 @@ extern "C" {
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
}; };
enum llama_token_type {
LLAMA_TOKEN_TYPE_UNDEFINED = 0,
LLAMA_TOKEN_TYPE_NORMAL = 1,
LLAMA_TOKEN_TYPE_UNKNOWN = 2,
LLAMA_TOKEN_TYPE_CONTROL = 3,
LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
LLAMA_TOKEN_TYPE_UNUSED = 5,
LLAMA_TOKEN_TYPE_BYTE = 6,
};
// model file types // model file types
enum llama_ftype { enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0, LLAMA_FTYPE_ALL_F32 = 0,
@ -330,19 +340,11 @@ extern "C" {
// Vocab // Vocab
// //
// Get the vocabulary as output parameters. LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
// Returns number of results.
LLAMA_API int llama_get_vocab(
const struct llama_context * ctx,
const char * * strings,
float * scores,
int capacity);
LLAMA_API int llama_model_get_vocab( LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
const struct llama_model * model,
const char * * strings, LLAMA_API llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
float * scores,
int capacity);
// Special tokens // Special tokens
LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
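The new `llama_token_type` values mirror the SentencePiece token classes written by the converter (1 = normal through 6 = byte), which lets applications decide per token how to render it. A small hedged sketch that skips control tokens when printing, combined with the existing `llama_token_to_str()`; buffer sizing and the return-value handling are simplified for illustration:

```cpp
// Sketch: render a generated token only if it is not a control token,
// using the new llama_token_get_type() together with llama_token_to_str().
#include <cstdio>
#include "llama.h"

static void print_token(const struct llama_context * ctx, llama_token id) {
    if (llama_token_get_type(ctx, id) == LLAMA_TOKEN_TYPE_CONTROL) {
        return; // e.g. BOS/EOS: don't echo control tokens to the user
    }

    char buf[64];
    const int n = llama_token_to_str(ctx, id, buf, (int) sizeof(buf));
    if (n >= 0) {
        fwrite(buf, 1, (size_t) n, stdout); // n is the number of bytes written
    }
}
```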