Merge branch 'gguf' of https://github.com/goerch/llama.cpp into gguf

commit a856685648

17 changed files with 765 additions and 309 deletions

README.md (13 changed lines)
@@ -9,13 +9,13 @@

 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

-**Hot topics:**
+### 🚧 Incoming breaking change + refactoring:

-- Simple web chat example: https://github.com/ggerganov/llama.cpp/pull/1998
-- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
-- New roadmap: https://github.com/users/ggerganov/projects/7
-- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
-- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
+See PR https://github.com/ggerganov/llama.cpp/pull/2398 for more info.
+
+To devs: avoid making big changes to `llama.h` / `llama.cpp` until merged
+
+----

 <details>
 <summary>Table of Contents</summary>

@@ -99,6 +99,7 @@ as the main playground for developing new features for the [ggml](https://github
 - Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
+- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)

 **UI:**
----

@@ -118,6 +118,11 @@ gguf_writer.add_head_count(head_count)
 gguf_writer.add_head_count_kv(head_count_kv)
 gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])

+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+

 # TOKENIZATION

@@ -147,9 +152,7 @@ if Path(dir_model + "/tokenizer.model").is_file():
         if tokenizer.is_control(i):
             toktype = 3

-        # TODO: How to determinate if a token is user defined?
-        # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-        # if tokenizer.is_user_defined(i): toktype = 4
+        # toktype = 4 is user-defined = tokens from added_tokens.json

        if tokenizer.is_unused(i):
            toktype = 5

@@ -160,6 +163,17 @@ if Path(dir_model + "/tokenizer.model").is_file():
         scores.append(score)
         toktypes.append(toktype)

+if Path(dir_model + "/added_tokens.json").is_file():
+    with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+        addtokens_json = json.load(f)
+
+        print("gguf: get added tokens")
+
+        for key in addtokens_json:
+            tokens.append( key.encode("utf-8") )
+            scores.append(-1000.0)
+            toktypes.append(4) # user-defined token type
+
 gguf_writer.add_tokenizer_model("llama")
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
convert-llama-ggmlv3-to-gguf.py (new file, 334 lines)

@@ -0,0 +1,334 @@
import sys, struct, math, argparse
from pathlib import Path

import numpy as np

import gguf

# Note: Does not support GGML_QKK_64
QK_K = 256
# Items here are (block size, type size)
GGML_QUANT_SIZES = {
    gguf.GGMLQuantizationType.F32  : (1, 4),
    gguf.GGMLQuantizationType.F16  : (1, 2),
    gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
    gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
    gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
    gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
    gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
    gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
    gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
    gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
    gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
    gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
    gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
    gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
}
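# Reading note (editorial comment, not part of the committed file): the
# (block size, type size) pairs above drive all of the size math in this
# script. A tensor with n_elems values of a given type occupies
# (n_elems * type_size) // block_size bytes, which is exactly how
# Tensor.load() computes len_bytes below. For example, Q4_0 packs each block
# of 32 values into 2 + 16 = 18 bytes (a 2-byte scale followed by 32 packed
# 4-bit values).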
class Hyperparameters:
    def __init__(self):
        self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
        self.n_ff = 0

    def set_n_ff(self, model):
        ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
        assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
        ff_tensor = model.tensors[ff_tensor_idx]
        self.n_ff = ff_tensor.dims[1]

    def load(self, data, offset):
        (
            self.n_vocab,
            self.n_embd,
            self.n_mult,
            self.n_head,
            self.n_layer,
            self.n_rot,
            self.ftype,
        ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
        return 4 * 7

    def __str__(self):
        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'

class Vocab:
    def __init__(self):
        self.items = []

    def load(self, data, offset, n_vocab):
        orig_offset = offset
        for _ in range(n_vocab):
            itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
            assert itemlen < 4096, 'Absurd vocab item length'
            offset += 4
            vocab = bytes(data[offset:offset + itemlen])
            offset += itemlen
            score = struct.unpack('<f', data[offset:offset + 4])[0]
            offset += 4
            self.items.append((vocab, score))
        return offset - orig_offset

class Tensor:
    def __init__(self):
        self.name = None
        self.dims = ()
        self.dtype = None
        self.start_offset = 0
        self.len_bytes = 0

    def load(self, data, offset):
        orig_offset = offset
        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
        assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
        assert name_len < 4096, 'Absurd tensor name length'
        quant = GGML_QUANT_SIZES.get(dtype)
        assert quant is not None, 'Unknown tensor type'
        (blksize, tysize) = quant
        offset += 12
        self.dtype = dtype
        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
        offset += 4 * n_dims
        self.name = bytes(data[offset:offset + name_len])
        offset += name_len
        pad = ((offset + 31) & ~31) - offset
        offset += pad
        n_elems = np.prod(self.dims)
        n_bytes = (n_elems * tysize) // blksize
        self.start_offset = offset
        self.len_bytes = n_bytes
        offset += n_bytes
        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
        return offset - orig_offset

class GGMLV3Model:
    def __init__(self):
        self.hyperparameters = None
        self.vocab = None
        self.tensor_map = {}
        self.tensors = []

    def validate_header(self, data, offset):
        if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
            raise ValueError('Only GGJTv3 supported')
        return 8

    def load(self, data, offset):
        offset += self.validate_header(data, offset)
        hp = Hyperparameters()
        offset += hp.load(data, offset)
        vocab = Vocab()
        offset += vocab.load(data, offset, hp.n_vocab)
        tensors = []
        tensor_map = {}
        while offset < len(data):
            tensor = Tensor()
            offset += tensor.load(data, offset)
            tensor_map[tensor.name] = len(tensors)
            tensors.append(tensor)
        self.hyperparameters = hp
        self.vocab = vocab
        self.tensors = tensors
        self.tensor_map = tensor_map
        hp.set_n_ff(self)
        return offset

class GGMLToGGUF:
    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None):
        hp = ggml_model.hyperparameters
        self.model = ggml_model
        self.data = data
        self.cfg = cfg
        self.params_override = params_override
        self.vocab_override = vocab_override
        if params_override is not None:
            n_kv_head = params_override.n_head_kv
        else:
            if cfg.gqa == 1:
                n_kv_head = hp.n_head
            else:
                gqa = float(cfg.gqa)
                n_kv_head = None
                for x in range(1, 256):
                    if float(hp.n_head) / float(x) == gqa:
                        n_kv_head = x
                assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
                print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
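                # Editorial note with an illustrative example: a model with 64
                # attention heads converted with --gqa 8 lands on n_kv_head = 8
                # here, since 64 / 8 == 8.0.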
        self.n_kv_head = n_kv_head
        self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)

    def save(self):
        print('* Preparing to save GGUF file')
        gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
        self.add_params(gguf_writer)
        self.add_vocab(gguf_writer)
        self.add_tensors(gguf_writer)
        print(" gguf: write header")
        gguf_writer.write_header_to_file()
        print(" gguf: write metadata")
        gguf_writer.write_kv_data_to_file()
        print(" gguf: write tensors")
        gguf_writer.write_tensors_to_file()
        gguf_writer.close()

    def add_params(self, gguf_writer):
        hp = self.model.hyperparameters
        cfg = self.cfg
        desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
        try:
            # Filenames aren't necessarily valid UTF8.
            name = cfg.name if cfg.name is not None else cfg.input.name
        except UnicodeDecodeError:
            name = None
        print('* Adding model parameters and KV items')
        if name is not None:
            gguf_writer.add_name(name)
        gguf_writer.add_description(desc)
        if self.params_override is not None:
            po = self.params_override
            assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
            assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
            assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
            gguf_writer.add_context_length (po.n_ctx)
            gguf_writer.add_embedding_length (po.n_embd)
            gguf_writer.add_block_count (po.n_layer)
            gguf_writer.add_feed_forward_length (po.n_ff)
            gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
            gguf_writer.add_head_count (po.n_head)
            gguf_writer.add_head_count_kv (po.n_head_kv)
            gguf_writer.add_layer_norm_rms_eps (po.f_norm_eps)
            return
        gguf_writer.add_context_length(cfg.context_length)
        gguf_writer.add_embedding_length(hp.n_embd)
        gguf_writer.add_block_count(hp.n_layer)
        gguf_writer.add_feed_forward_length(hp.n_ff)
        gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
        gguf_writer.add_head_count(hp.n_head)
        gguf_writer.add_head_count_kv(self.n_kv_head)
        gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))

    def add_vocab(self, gguf_writer):
        hp = self.model.hyperparameters
        gguf_writer.add_tokenizer_model('llama')
        tokens = []
        scores = []
        toktypes = []
        if self.vocab_override is not None:
            vo = self.vocab_override
            print('* Adding vocab item(s)')
            for (idx, vitem) in enumerate(vo.all_tokens()):
                if len(vitem) == 3:
                    tokens.append(vitem[0])
                    scores.append(vitem[1])
                    toktypes.append(vitem[2])
                else:
                    # Maybe try to guess the token type here?
                    tokens.append(vitem[0])
                    scores.append(vitem[1])
            assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
            gguf_writer.add_token_list(tokens)
            gguf_writer.add_token_scores(scores)
            if len(toktypes) > 0:
                gguf_writer.add_token_types(toktypes)
            return
        print(f'* Adding {hp.n_vocab} vocab item(s)')
        for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
            tt = 1 # Normal
            if len(vbytes) == 0:
                tt = 3 # Control
            elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
                hv = hex(vbytes[0])[2:].upper()
                vbytes = bytes(f'<0x{hv}>', encoding = 'UTF-8')
                tt = 6 # Byte
            else:
                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
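                # Editorial note: b'\xe2\x96\x81' is the UTF-8 encoding of
                # U+2581 "▁", the marker SentencePiece uses in place of a
                # leading space inside token text.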
            toktypes.append(tt)
            tokens.append(vbytes)
            scores.append(vscore)
        gguf_writer.add_token_list(tokens)
        gguf_writer.add_token_scores(scores)
        gguf_writer.add_token_types(toktypes)

    def add_tensors(self, gguf_writer):
        nm = self.name_map
        data = self.data
        print(f'* Adding {len(self.model.tensors)} tensor(s)')
        for tensor in self.model.tensors:
            name = str(tensor.name, 'UTF-8')
            if name.endswith('.weight'):
                name = name[:-7]
                suffix = '.weight'
            elif name.endswith('.bias'):
                name = name[:-5]
                suffix = '.bias'
            mapped_name = nm.get(name)
            assert mapped_name is not None, f'Bad name {name}'
            mapped_name += suffix
            tempdims = list(tensor.dims[:])
            if len(tempdims) > 1:
                temp = tempdims[1]
                tempdims[1] = tempdims[0]
                tempdims[0] = temp
            # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
            gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)

def handle_metadata(cfg, hp):
    import convert
    assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
    hf_config_path = cfg.model_metadata_dir / "config.json"
    orig_config_path = cfg.model_metadata_dir / "params.json"
    # We pass a fake model here. "original" mode will check the shapes of some
    # tensors if information is missing in the .json file: other than that, the
    # model data isn't used so this should be safe (at least for now).
    fakemodel = {
        'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
        'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
    }
    fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
    fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
    if hf_config_path.exists():
        params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
    elif orig_config_path.exists():
        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
    else:
        raise ValueError('Unable to load metadata')
    vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
    convert.check_vocab_size(params, vocab)
    return (params, vocab)

def handle_args():
    parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
    parser.add_argument('--input', '-i', type = Path, help = 'Input GGMLv3 filename')
    parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename')
    parser.add_argument('--name', help = 'Set model name')
    parser.add_argument('--desc', help = 'Set model description')
    parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
    parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
    parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
    parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
    parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
    return parser.parse_args()

def main():
    cfg = handle_args()
    print(f'* Using config: {cfg}')
    print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
    data = np.memmap(cfg.input, mode = 'r')
    model = GGMLV3Model()
    print('* Scanning GGML input file')
    offset = model.load(data, 0)
    print(f'* GGML model hyperparameters: {model.hyperparameters}')
    vocab_override = None
    params_override = None
    if cfg.model_metadata_dir is not None:
        (params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters)
        print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
        print(f'* Overriding params: {params_override}')
        print(f'* Overriding vocab: {vocab_override}')
    else:
        print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override)
    converter.save()
    print(f'* Successful completion. Output saved to: {cfg.output}')

main()
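A typical invocation, going by the options defined in handle_args() above (the file and directory names here are placeholders, not taken from the commit): `python convert-llama-ggmlv3-to-gguf.py --input model.ggmlv3.q4_0.bin --output model.gguf --model-metadata-dir ./hf-model`. When no metadata directory is available, --gqa, --eps and --context-length should be set to match the model (per the help text, e.g. --gqa 8 --eps 1e-5 --context-length 4096 for a LLaMA2 70B), since the legacy GGML header parsed above carries none of those values.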
----

@@ -126,6 +126,11 @@ gguf_writer.add_head_count(head_count)
 gguf_writer.add_head_count_kv(head_count_kv)
 gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])

+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+

 # TOKENIZATION

@@ -155,9 +160,7 @@ if Path(dir_model + "/tokenizer.model").is_file():
         if tokenizer.is_control(i):
             toktype = 3

-        # TODO: How to determinate if a token is user defined?
-        # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-        # if tokenizer.is_user_defined(i): toktype = 4
+        # toktype = 4 is user-defined = tokens from added_tokens.json

        if tokenizer.is_unused(i):
            toktype = 5

@@ -168,6 +171,18 @@ if Path(dir_model + "/tokenizer.model").is_file():
         scores.append(score)
         toktypes.append(toktype)

+if Path(dir_model + "/added_tokens.json").is_file():
+    with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+        addtokens_json = json.load(f)
+
+        print("gguf: get added tokens")
+
+        for key in addtokens_json:
+            tokens.append( key.encode("utf-8") )
+            scores.append(-1000.0)
+            toktypes.append(4) # user-defined token type
+
+
 gguf_writer.add_tokenizer_model("llama")
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)

@@ -264,7 +279,9 @@ for part_name in part_names:
     data = data.squeeze().numpy()

     # reverse permute these
-    if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
+    if name.endswith(".q_proj.weight"):
+        data = reverse_hf_permute(data, head_count)
+    if name.endswith(".k_proj.weight"):
         data = reverse_hf_permute(data, head_count, head_count_kv)

     # map tensor names
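The split in that last hunk matters for grouped-query attention: the query projection has head_count heads while the key projection only has head_count_kv heads, so the rotary reordering that reverse_hf_permute undoes has to be driven by the matching head count for each tensor (the same correction appears in convert.py's convert_model_names further down). A rough shape sketch, with illustrative numbers that are not taken from the diff:

# Hypothetical GQA model: n_embd = 8192, head_count = 64, head_count_kv = 8,
# head_dim = n_embd // head_count = 128.
#   q_proj.weight has shape (head_count    * head_dim, n_embd) = (8192, 8192)
#   k_proj.weight has shape (head_count_kv * head_dim, n_embd) = (1024, 8192)
# so q_proj is un-permuted with head_count only, while k_proj needs head_count_kv.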
convert.py (26 changed lines; mode changed from Executable file to Normal file)

@@ -241,11 +241,13 @@ class BpeVocab:
             added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             added_tokens = {}
+
         vocab_size: int = len(self.bpe_tokenizer)
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids = sorted(added_tokens.values())
         if expected_ids != actual_ids:
             raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+
         items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
         self.added_tokens_list = [text for (text, idx) in items]
         self.vocab_size_base: int = vocab_size

@@ -261,12 +263,12 @@ class BpeVocab:
         for i, item in enumerate(tokenizer):
             text: bytes = item.encode("utf-8")
             score: float = -i
-            yield text, score, 4
+            yield text, score, gguf.TokenType.USER_DEFINED

     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
             score = -1000.0
-            yield text.encode("utf-8"), score, 4
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         yield from self.bpe_tokens()

@@ -304,27 +306,27 @@ class SentencePieceVocab:
             text: bytes = piece.encode("utf-8")
             score: float = tokenizer.get_score(i)

-            toktype = 1 # defualt to normal token type
+            toktype = gguf.TokenType.NORMAL
             if tokenizer.is_unknown(i):
-                toktype = 2
+                toktype = gguf.TokenType.UNKNOWN
             if tokenizer.is_control(i):
-                toktype = 3
+                toktype = gguf.TokenType.CONTROL

             # NOTE: I think added_tokens are user defined.
             # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-            # if tokenizer.is_user_defined(i): toktype = 4
+            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED

             if tokenizer.is_unused(i):
-                toktype = 5
+                toktype = gguf.TokenType.UNUSED
             if tokenizer.is_byte(i):
-                toktype = 6
+                toktype = gguf.TokenType.BYTE

             yield text, score, toktype

     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
             score = -1000.0
-            yield text.encode("utf-8"), score, 4
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         yield from self.sentencepiece_tokens()

@@ -342,6 +344,7 @@ Vocab = Union[BpeVocab, SentencePieceVocab]
 #

 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
+    #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
     if n_head_kv is not None and n_head != n_head_kv:
         n_head //= n_head_kv
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])

@@ -724,6 +727,7 @@ class OutputFile:
         self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

     def add_meta_arch(self, params: Params) -> None:
+        self.gguf.add_name ("llama")
         self.gguf.add_context_length (params.n_ctx)
         self.gguf.add_embedding_length (params.n_embd)
         self.gguf.add_block_count (params.n_layer)

@@ -836,12 +840,12 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" in model:
             print(f"Permuting layer {i}")
-            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head_kv)
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
             tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
             #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
             print(f"Unpacking and permuting layer {i}")
-            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head_kv)
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
             tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
             tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
         else:
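convert.py now yields gguf.TokenType values instead of bare integers, while the two conversion scripts earlier in this commit still write the raw numbers. Collecting the correspondences that appear in the replacements above into one small sketch (a reading aid, not code from the commit):

import gguf

# integer toktype values used elsewhere in this commit -> gguf.TokenType
TOKTYPE_BY_INT = {
    1: gguf.TokenType.NORMAL,
    2: gguf.TokenType.UNKNOWN,
    3: gguf.TokenType.CONTROL,
    4: gguf.TokenType.USER_DEFINED,
    5: gguf.TokenType.UNUSED,
    6: gguf.TokenType.BYTE,
}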
----

@@ -139,14 +139,16 @@ void print_sample_weights(TransformerWeights *w){
 struct llama_vocab {
     using id = int32_t;
     using token = std::string;
+    using ttype = llama_token_type;

-    struct token_score {
-        token tok;
+    struct token_data {
+        token text;
         float score;
+        ttype type;
     };

     std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
+    std::vector<token_data> id_to_token;
 };

 struct my_llama_hparams {

@@ -516,36 +518,30 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
         struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
         struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);

-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        const int n_vocab = llama_n_vocab(lctx);
         vocab->id_to_token.resize(n_vocab);
         for (int i=0; i<n_vocab; ++i) {
-            std::string tok = std::string(strings[i]);
-            float score = scores[i];
-            vocab->id_to_token[i].tok = tok;
-            vocab->id_to_token[i].score = score;
-            vocab->token_to_id.emplace(tok, i);
+            vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
+            vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
+            vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
+            vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
         }
         llama_free(lctx);
         llama_free_model(lmodel);
     } else { // assume llama2.c vocabulary
         printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
         llama_file file(filename, "rb");
-        uint32_t n_vocab = config->vocab_size;
+        const int n_vocab = config->vocab_size;
         /* uint32_t max_token_length = */ file.read_u32(); // unused
         vocab->id_to_token.resize(n_vocab);
-        for (uint32_t i=0; i<n_vocab; ++i) {
+        for (int i=0; i<n_vocab; ++i) {
             float_t score = file.read_f32();
             uint32_t len = file.read_u32();
-            std::string tok = file.read_string(len);
-            vocab->id_to_token[i].tok = tok;
+            std::string text = file.read_string(len);
+            vocab->id_to_token[i].text = text;
             vocab->id_to_token[i].score = score;
-            vocab->token_to_id.emplace(tok, i);
+            vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
+            vocab->token_to_id.emplace(text, i);
         }
     }
 }

@@ -611,10 +607,10 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     // // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
     // uint32_t n_vocab = model->hparams.n_vocab;
     // for (uint32_t i = 0; i < n_vocab; i++) {
-    //     const auto & token_score = vocab->id_to_token.at(i);
-    //     file.write_u32((uint32_t) token_score.tok.size());
-    //     file.write_raw(token_score.tok.data(), token_score.tok.size());
-    //     file.write_raw(&token_score.score, sizeof(token_score.score));
+    //     const auto & token_data = vocab->id_to_token.at(i);
+    //     file.write_u32((uint32_t) token_data.tok.size());
+    //     file.write_raw(token_data.tok.data(), token_data.tok.size());
+    //     file.write_raw(&token_data.score, sizeof(token_data.score));
     // }
     //
     // // stuff AK weights into GG weights one by one.
----

@@ -5,6 +5,7 @@
 #include <cmath>
 #include <ctime>
 #include <sstream>
+#include <cstring>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data

@@ -121,6 +122,27 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
     printf("\n");
 }

+std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
+                                             int n_vocab, int n_thread) {
+    std::vector<float> result;
+    result.reserve(tokens.size() * n_vocab);
+    size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
+    for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
+        size_t n_tokens = tokens.size() - i_chunk * n_batch;
+        n_tokens = std::min(n_tokens, size_t(n_batch));
+        if (llama_eval(ctx, tokens.data() + i_chunk * n_batch, n_tokens, n_past, n_thread)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return {};
+        }
+
+        const auto logits = llama_get_logits(ctx);
+        result.insert(result.end(), logits, logits + n_tokens * n_vocab);
+
+        n_past += n_tokens;
+    }
+    return result;
+}
+
 void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     // Calculates hellaswag score (acc_norm) from prompt
     //

@@ -209,17 +231,19 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     double acc = 0.0f;
     const int n_vocab = llama_n_vocab(ctx);

+    std::vector<float> tok_logits(n_vocab);
+
     for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {

         // Tokenize the context to count tokens
         std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
         size_t context_size = context_embd.size();

-        for (size_t ending_idx=0;ending_idx<4;ending_idx++) {
-
-            // Tokenize the query
-            std::vector<int> query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[ending_idx], prepend_bos);
-            size_t query_size = query_embd.size();
+        // Do the 1st ending
+        // In this case we include the context when evaluating
+        auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], prepend_bos);
+        auto query_size = query_embd.size();
+        //printf("First query: %d\n",(int)query_size);

         // Stop if query wont fit the ctx window
         if (query_size > (size_t)params.n_ctx) {

@@ -232,27 +256,68 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             query_embd.resize(32);
         }

         // Evaluate the query
-        if (llama_eval(ctx, query_embd.data(), query_embd.size(), 0, params.n_threads)) {
+        auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads);
+        if (logits.empty()) {
             fprintf(stderr, "%s : failed to eval\n", __func__);
             return;
         }

-        const auto query_logits = llama_get_logits(ctx);
-        std::vector<float> logits;
-        logits.insert(logits.end(), query_logits, query_logits + query_size * n_vocab);
+        std::memcpy(tok_logits.data(), logits.data() + (context_size-1)*n_vocab, n_vocab*sizeof(float));
+        const auto first_probs = softmax(tok_logits);

-        hs_data[task_idx].ending_logprob_count[ending_idx] = 0;
-        hs_data[task_idx].ending_logprob[ending_idx] = 0.0f;
+        hs_data[task_idx].ending_logprob_count[0] = 1;
+        hs_data[task_idx].ending_logprob[0] = std::log(first_probs[query_embd[context_size]]);

         // Calculate the logprobs over the ending
-        for (size_t j = context_size-1; j < query_size - 1; j++) {
-            // Calculate probability of next token, given the previous ones.
-            const std::vector<float> tok_logits(
-                logits.begin() + (j + 0) * n_vocab,
-                logits.begin() + (j + 1) * n_vocab);
-
-            const float prob = softmax(tok_logits)[query_embd[ j + 1]];
+        for (size_t j = context_size; j < query_size - 1; j++) {
+
+            std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
+
+            const float prob = softmax(tok_logits)[query_embd[j + 1]];
+
+            hs_data[task_idx].ending_logprob[0] += std::log(prob);
+            hs_data[task_idx].ending_logprob_count[0]++;
+        }
+
+        // Calculate the mean token logprob for acc_norm
+        hs_data[task_idx].ending_logprob[0] /= hs_data[task_idx].ending_logprob_count[0];
+
+        // Do the remaining endings
+        // For these, we use the bare ending with n_past = context_size
+        //
+        for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
+
+            // Tokenize the query
+            query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
+            query_size = query_embd.size();
+
+            // Stop if query wont fit the ctx window
+            if (context_size + query_size > (size_t)params.n_ctx) {
+                fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
+                return;
+            }
+
+            // Speedup small evaluations by evaluating atleast 32 tokens
+            // No, resizing to 32 is actually slightly slower (at least on CUDA)
+            //if (query_size < 32) {
+            //    query_embd.resize(32);
+            //}
+
+            // Evaluate the query
+            logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab, params.n_threads);
+            if (logits.empty()) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return;
+            }
+
+            hs_data[task_idx].ending_logprob_count[ending_idx] = 1;
+            hs_data[task_idx].ending_logprob[ending_idx] = std::log(first_probs[query_embd[0]]);
+
+            // Calculate the logprobs over the ending
+            for (size_t j = 0; j < query_size - 1; j++) {
+                std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
+
+                const float prob = softmax(tok_logits)[query_embd[j + 1]];

                 hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob);
                 hs_data[task_idx].ending_logprob_count[ending_idx]++;

@@ -267,9 +332,9 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         }

         // Find the ending with maximum logprob
-        size_t ending_logprob_max_idx = -1;
-        double ending_logprob_max_val = -INFINITY;
-        for (size_t j=0; j < 4; j++) {
+        size_t ending_logprob_max_idx = 0;
+        double ending_logprob_max_val = hs_data[task_idx].ending_logprob[0];
+        for (size_t j = 1; j < 4; j++) {
             if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
                 ending_logprob_max_idx = j;
                 ending_logprob_max_val = hs_data[task_idx].ending_logprob[j];
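The scoring logic above boils down to: for each of the four endings, accumulate the log-probability of every ending token given the preceding tokens, normalize by the token count, and pick the ending with the highest mean log-probability (acc_norm). A minimal Python sketch of that selection step, assuming per-token probabilities have already been gathered (names and structure here are illustrative, not taken from the C++ code):

import math

def pick_ending(ending_token_probs):
    # ending_token_probs: for each of the 4 endings, a list of P(token | prefix)
    # for every token that belongs to the ending (context tokens excluded).
    mean_logprobs = [
        sum(math.log(p) for p in probs) / len(probs)
        for probs in ending_token_probs
    ]
    # acc_norm counts the task as correct when the gold ending has the highest
    # mean per-token log-probability.
    return max(range(len(mean_logprobs)), key=lambda i: mean_logprobs[i])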
----

@@ -11,8 +11,10 @@ echo >> $PUBLIC/index.js # add newline

 FILES=$(ls $PUBLIC)

+cd $PUBLIC
 for FILE in $FILES; do
-    func=$(echo $FILE | tr '.' '_')
-    echo "generate $FILE.hpp ($func)"
-    xxd -n $func -i $PUBLIC/$FILE > $DIR/$FILE.hpp
+    echo "generate $FILE.hpp"
+    # use simple flag for old version of xxd
+    xxd -i $FILE > $DIR/$FILE.hpp
 done
----

@@ -144,12 +144,12 @@
     import { SchemaConverter } from '/json-schema-to-grammar.mjs';

     const session = signal({
-      prompt: "This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.",
+      prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
       template: "{{prompt}}\n\n{{history}}\n{{char}}:",
       historyTemplate: "{{name}}: {{message}}",
       transcript: [],
       type: "chat",
-      char: "llama",
+      char: "Llama",
       user: "User",
     })
----

@@ -170,14 +170,16 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc
 struct llama_vocab {
     using id = int32_t;
     using token = std::string;
+    using ttype = llama_token_type;

-    struct token_score {
-        token tok;
+    struct token_data {
+        token text;
         float score;
+        ttype type;
     };

     std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
+    std::vector<token_data> id_to_token;
 };

 struct my_llama_hparams {

@@ -2629,10 +2631,10 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     // // write_vocab
     // uint32_t n_vocab = model->hparams.n_vocab;
     // for (uint32_t i = 0; i < n_vocab; i++) {
-    //     const auto & token_score = vocab->id_to_token.at(i);
-    //     file.write_u32((uint32_t) token_score.tok.size());
-    //     file.write_raw(token_score.tok.data(), token_score.tok.size());
-    //     file.write_raw(&token_score.score, sizeof(token_score.score));
+    //     const auto & token_data = vocab->id_to_token.at(i);
+    //     file.write_u32((uint32_t) token_data.tok.size());
+    //     file.write_raw(token_data.tok.data(), token_data.tok.size());
+    //     file.write_raw(&token_data.score, sizeof(token_data.score));
     // }
     // // write tensors
     // write_tensor(&file, model->tok_embeddings);

@@ -3055,20 +3057,13 @@ int main(int argc, char ** argv) {

     struct llama_vocab vocab;
     {
-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        const int n_vocab = llama_n_vocab(lctx);
         vocab.id_to_token.resize(n_vocab);
         for (int i=0; i<n_vocab; ++i) {
-            std::string tok = std::string(strings[i]);
-            float score = scores[i];
-            vocab.id_to_token[i].tok = tok;
-            vocab.id_to_token[i].score = score;
-            vocab.token_to_id.emplace(tok, i);
+            vocab.id_to_token[i].text = llama_token_get_text(lctx, i);
+            vocab.id_to_token[i].score = llama_token_get_score(lctx, i);
+            vocab.id_to_token[i].type = llama_token_get_type(lctx, i);
+            vocab.token_to_id.emplace(vocab.id_to_token[i].text, i);
         }
     }
----

@@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
     threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
                                   + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
     for (int i = 0; i < 8; i++) {
+        threadgroup_barrier(mem_flags::mem_device);
         simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
     }

-    threadgroup_barrier(mem_flags::mem_threadgroup);
+    threadgroup_barrier(mem_flags::mem_device);
     device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
     if (sgitg==0) {
         for (int i = 0; i < n_rows; i++) {
ggml.c (244 changed lines)

@@ -1643,11 +1643,37 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);

 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_I8] = {
+        .type_name = "i8",
+        .blck_size = 1,
+        .type_size = sizeof(int8_t),
+        .is_quantized = false,
+    },
+    [GGML_TYPE_I16] = {
+        .type_name = "i16",
+        .blck_size = 1,
+        .type_size = sizeof(int16_t),
+        .is_quantized = false,
+    },
+    [GGML_TYPE_I32] = {
+        .type_name = "i32",
+        .blck_size = 1,
+        .type_size = sizeof(int32_t),
+        .is_quantized = false,
+    },
     [GGML_TYPE_F32] = {
+        .type_name = "f32",
+        .blck_size = 1,
+        .type_size = sizeof(float),
+        .is_quantized = false,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
         .vec_dot_type = GGML_TYPE_F32,
     },
     [GGML_TYPE_F16] = {
+        .type_name = "f16",
+        .blck_size = 1,
+        .type_size = sizeof(ggml_fp16_t),
+        .is_quantized = false,
         .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
         .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
         .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,

@@ -1655,6 +1681,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_F16,
     },
     [GGML_TYPE_Q4_0] = {
+        .type_name = "q4_0",
+        .blck_size = QK4_0,
+        .type_size = sizeof(block_q4_0),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q4_0,
         .from_float = quantize_row_q4_0,
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,

@@ -1662,6 +1692,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q4_1] = {
+        .type_name = "q4_1",
+        .blck_size = QK4_1,
+        .type_size = sizeof(block_q4_1),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q4_1,
         .from_float = quantize_row_q4_1,
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,

@@ -1669,6 +1703,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_1,
     },
     [GGML_TYPE_Q5_0] = {
+        .type_name = "q5_0",
+        .blck_size = QK5_0,
+        .type_size = sizeof(block_q5_0),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q5_0,
         .from_float = quantize_row_q5_0,
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,

@@ -1676,6 +1714,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q5_1] = {
+        .type_name = "q5_1",
+        .blck_size = QK5_1,
+        .type_size = sizeof(block_q5_1),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q5_1,
         .from_float = quantize_row_q5_1,
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,

@@ -1683,6 +1725,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_1,
     },
     [GGML_TYPE_Q8_0] = {
+        .type_name = "q8_0",
+        .blck_size = QK8_0,
+        .type_size = sizeof(block_q8_0),
+        .is_quantized = true,
         .to_float = dequantize_row_q8_0,
         .from_float = quantize_row_q8_0,
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,

@@ -1690,12 +1736,20 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q8_1] = {
+        .type_name = "q8_1",
+        .blck_size = QK8_1,
+        .type_size = sizeof(block_q8_1),
+        .is_quantized = true,
         .from_float = quantize_row_q8_1,
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
         .vec_dot_type = GGML_TYPE_Q8_1,
     },
 #ifdef GGML_USE_K_QUANTS
     [GGML_TYPE_Q2_K] = {
+        .type_name = "q2_K",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_q2_K),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q2_K,
         .from_float = quantize_row_q2_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,

@@ -1703,6 +1757,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q3_K] = {
+        .type_name = "q3_K",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_q3_K),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q3_K,
         .from_float = quantize_row_q3_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,

@@ -1710,6 +1768,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q4_K] = {
+        .type_name = "q4_K",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_q4_K),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q4_K,
         .from_float = quantize_row_q4_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,

@@ -1717,6 +1779,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q5_K] = {
+        .type_name = "q5_K",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_q5_K),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q5_K,
         .from_float = quantize_row_q5_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,

@@ -1724,6 +1790,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q6_K] = {
+        .type_name = "q6_K",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_q6_K),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q6_K,
         .from_float = quantize_row_q6_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,

@@ -1731,15 +1801,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q8_K] = {
+        .type_name = "q8_K",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_q8_K),
+        .is_quantized = true,
         .from_float = quantize_row_q8_K,
     }
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
// For internal test use
|
// For internal test use
|
||||||
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) {
|
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
|
||||||
GGML_ASSERT(i < GGML_TYPE_COUNT);
|
GGML_ASSERT(type < GGML_TYPE_COUNT);
|
||||||
return type_traits[i];
|
return type_traits[type];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -3648,98 +3722,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
     *s = idx;
 }

-//
-// data types
-//
-
-static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = 1,
-    [GGML_TYPE_F16]  = 1,
-    [GGML_TYPE_Q4_0] = QK4_0,
-    [GGML_TYPE_Q4_1] = QK4_1,
-    [GGML_TYPE_Q5_0] = QK5_0,
-    [GGML_TYPE_Q5_1] = QK5_1,
-    [GGML_TYPE_Q8_0] = QK8_0,
-    [GGML_TYPE_Q8_1] = QK8_1,
-#ifdef GGML_USE_K_QUANTS
-    [GGML_TYPE_Q2_K] = QK_K,
-    [GGML_TYPE_Q3_K] = QK_K,
-    [GGML_TYPE_Q4_K] = QK_K,
-    [GGML_TYPE_Q5_K] = QK_K,
-    [GGML_TYPE_Q6_K] = QK_K,
-    [GGML_TYPE_Q8_K] = QK_K,
-#endif
-    [GGML_TYPE_I8]   = 1,
-    [GGML_TYPE_I16]  = 1,
-    [GGML_TYPE_I32]  = 1,
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
-
-static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = sizeof(float),
-    [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
-    [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
-    [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
-    [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
-    [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
-    [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
-    [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
-#ifdef GGML_USE_K_QUANTS
-    [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
-    [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
-    [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
-    [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
-    [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
-    [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
-#endif
-    [GGML_TYPE_I8]   = sizeof(int8_t),
-    [GGML_TYPE_I16]  = sizeof(int16_t),
-    [GGML_TYPE_I32]  = sizeof(int32_t),
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
-
-static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = "f32",
-    [GGML_TYPE_F16]  = "f16",
-    [GGML_TYPE_Q4_0] = "q4_0",
-    [GGML_TYPE_Q4_1] = "q4_1",
-    [GGML_TYPE_Q5_0] = "q5_0",
-    [GGML_TYPE_Q5_1] = "q5_1",
-    [GGML_TYPE_Q8_0] = "q8_0",
-    [GGML_TYPE_Q8_1] = "q8_1",
-    [GGML_TYPE_Q2_K] = "q2_K",
-    [GGML_TYPE_Q3_K] = "q3_K",
-    [GGML_TYPE_Q4_K] = "q4_K",
-    [GGML_TYPE_Q5_K] = "q5_K",
-    [GGML_TYPE_Q6_K] = "q6_K",
-    [GGML_TYPE_Q8_K] = "q8_K",
-    [GGML_TYPE_I8]   = "i8",
-    [GGML_TYPE_I16]  = "i16",
-    [GGML_TYPE_I32]  = "i32",
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
-
-static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = false,
-    [GGML_TYPE_F16]  = false,
-    [GGML_TYPE_Q4_0] = true,
-    [GGML_TYPE_Q4_1] = true,
-    [GGML_TYPE_Q5_0] = true,
-    [GGML_TYPE_Q5_1] = true,
-    [GGML_TYPE_Q8_0] = true,
-    [GGML_TYPE_Q8_1] = true,
-    [GGML_TYPE_Q2_K] = true,
-    [GGML_TYPE_Q3_K] = true,
-    [GGML_TYPE_Q4_K] = true,
-    [GGML_TYPE_Q5_K] = true,
-    [GGML_TYPE_Q6_K] = true,
-    [GGML_TYPE_Q8_K] = true,
-    [GGML_TYPE_I8]   = false,
-    [GGML_TYPE_I16]  = false,
-    [GGML_TYPE_I32]  = false,
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
-
 static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "NONE",

@@ -4109,7 +4091,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     //
     // is enough, but just in case, adding the second part

-    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
 }

 size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
@@ -4119,23 +4101,27 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

-    return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+    return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
 }

 int ggml_blck_size(enum ggml_type type) {
-    return GGML_BLCK_SIZE[type];
+    return type_traits[type].blck_size;
 }

 size_t ggml_type_size(enum ggml_type type) {
-    return GGML_TYPE_SIZE[type];
+    return type_traits[type].type_size;
 }

 float ggml_type_sizef(enum ggml_type type) {
-    return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
+    return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
 }

 const char * ggml_type_name(enum ggml_type type) {
-    return GGML_TYPE_NAME[type];
+    return type_traits[type].type_name;
+}
+
+bool ggml_is_quantized(enum ggml_type type) {
+    return type_traits[type].is_quantized;
 }

 const char * ggml_op_name(enum ggml_op op) {
@@ -4147,7 +4133,7 @@ const char * ggml_op_symbol(enum ggml_op op) {
 }

 size_t ggml_element_size(const struct ggml_tensor * tensor) {
-    return GGML_TYPE_SIZE[tensor->type];
+    return ggml_type_size(tensor->type);
 }

 static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
@@ -4185,10 +4171,6 @@ static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct
         (t0->ne[3] == t1->ne[3]);
 }

-bool ggml_is_quantized(enum ggml_type type) {
-    return GGML_IS_QUANTIZED[type];
-}
-
 enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     enum ggml_type wtype = GGML_TYPE_COUNT;

@@ -4226,8 +4208,8 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

     return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
-        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@@ -4236,7 +4218,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

     return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@@ -4251,7 +4233,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

     return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@@ -4570,7 +4552,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     size_t data_size = 0;

     if (data == NULL && !ctx->no_alloc) {
-        data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
+        data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
         for (int i = 1; i < n_dims; i++) {
             data_size *= ne[i];
         }
@@ -4625,8 +4607,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         result->ne[i] = ne[i];
     }

-    result->nb[0] = GGML_TYPE_SIZE[type];
-    result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]);
+    result->nb[0] = ggml_type_size(type);
+    result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
     for (int i = 2; i < GGML_MAX_DIMS; i++) {
         result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
     }
@@ -7748,7 +7730,7 @@ static void ggml_compute_forward_dup_same_cont(
         memcpy(
             ((char *) dst->data + ie0*nb0),
             ((char *) src0->data + ie0*nb00),
-            (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
+            (ie1 - ie0) * ggml_type_size(src0->type));
     }

 }
@@ -7782,7 +7764,7 @@ static void ggml_compute_forward_dup_f16(

     if (src0->type == dst->type &&
         ne00 == ne0 &&
-        nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
         // copy by rows
         const size_t rs = ne00*nb00;
         for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -7840,7 +7822,7 @@ static void ggml_compute_forward_dup_f16(
                 float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;

                 size_t id = 0;
-                size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
                 char * dst_ptr = (char *) dst->data;

                 for (int i03 = 0; i03 < ne03; i03++) {
@@ -8053,7 +8035,7 @@ static void ggml_compute_forward_dup_f32(

     if (src0->type == dst->type &&
         ne00 == ne0 &&
-        nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
         // copy by rows
         const size_t rs = ne00*nb00;
         for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -8092,7 +8074,7 @@ static void ggml_compute_forward_dup_f32(
             ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;

             size_t id = 0;
-            size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+            size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
             char * dst_ptr = (char *) dst->data;

             for (int i03 = 0; i03 < ne03; i03++) {
@@ -8504,7 +8486,7 @@ static void ggml_compute_forward_add_q_f32(
     ggml_from_float_t const quantize_row_q = type_traits[type].from_float;

     // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == sizeof(float));

     // dst cannot be transposed or permuted
@@ -8778,7 +8760,7 @@ static void ggml_compute_forward_add1_q_f32(
     ggml_from_float_t const quantize_row_q = type_traits[type].from_float;

     // we don't support permuted src0
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));

     // dst cannot be transposed or permuted
     GGML_ASSERT(nb0 <= nb1);
@@ -10634,7 +10616,7 @@ static void ggml_compute_forward_mul_mat(
     GGML_ASSERT(ne3 == ne13);

     // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == sizeof(float));

     // dst cannot be transposed or permuted
@@ -10717,7 +10699,7 @@ static void ggml_compute_forward_mul_mat(
     if (params->type == GGML_TASK_INIT) {
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
-            const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+            const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);

             for (int64_t i13 = 0; i13 < ne13; ++i13) {
                 for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -10737,7 +10719,7 @@ static void ggml_compute_forward_mul_mat(
     }

     const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+    const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);

     const int64_t nr0 = ne01;           // src0 rows
     const int64_t nr1 = ne11*ne12*ne13; // src1 rows
@@ -11210,7 +11192,7 @@ static void ggml_compute_forward_get_rows_q(

     assert( dst->ne[0] == nc);
     assert( dst->ne[1] == nr);
-    assert(src0->nb[0] == GGML_TYPE_SIZE[type]);
+    assert(src0->nb[0] == ggml_type_size(type));

     for (int i = 0; i < nr; ++i) {
         const int r = ((int32_t *) src1->data)[i];
@@ -16387,7 +16369,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {

                     size_t cur = 0;
                     if (ggml_is_quantized(node->type)) {
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                     }

                     work_size = MAX(work_size, cur);
@@ -16400,7 +16382,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     size_t cur = 0;

                     if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks;
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }

                     work_size = MAX(work_size, cur);
@@ -16412,7 +16394,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     size_t cur = 0;

                     if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks;
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                     }

                     work_size = MAX(work_size, cur);
@@ -16495,12 +16477,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                         // the threads are still spinning
                         if (node->src[0]->type != GGML_TYPE_F32) {
                             // here we need memory just for single 2D matrix from src0
-                            cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]);
+                            cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
                         }
                     } else
 #endif
                     if (node->src[1]->type != vec_dot_type) {
-                        cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type];
+                        cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
                     } else {
                         cur = 0;
                     }
@@ -18306,8 +18288,8 @@ enum ggml_opt_result ggml_opt_resume(
         struct ggml_tensor * f) {

     // build forward + backward compute graphs
-    struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
-    struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
+    struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
+    struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));

     struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
     struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
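For reference, the substitution made throughout ggml.c follows a single pattern: anywhere a byte count was derived from the old GGML_TYPE_SIZE/GGML_BLCK_SIZE tables, it is now derived from ggml_type_size() and ggml_blck_size(). A minimal sketch of that calculation from caller code, with an illustrative helper name and sample values (only the accessors themselves come from the change):

#include <stdint.h>
#include <stdio.h>
#include "ggml.h"

// Sketch: bytes needed for one row of ne0 elements of a given type, using the
// accessors that now read from the type_traits table.
static size_t row_size_bytes(enum ggml_type type, int64_t ne0) {
    // ne0 is assumed to be a multiple of the block size for quantized types
    return (size_t) ne0 * ggml_type_size(type) / ggml_blck_size(type);
}

int main(void) {
    const enum ggml_type type = GGML_TYPE_Q8_0; // example type: 32 elements per block
    printf("%s: 4096 elements per row -> %zu bytes (quantized: %d)\n",
           ggml_type_name(type), row_size_bytes(type, 4096), ggml_is_quantized(type) ? 1 : 0);
    return 0;
}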
6
ggml.h

@@ -1856,6 +1856,10 @@ extern "C" {
     typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);

     typedef struct {
+        const char * type_name;
+        int          blck_size;
+        size_t       type_size;
+        bool         is_quantized;
         ggml_to_float_t   to_float;
         ggml_from_float_t from_float;
         ggml_from_float_t from_float_reference;
@@ -1863,7 +1867,7 @@ extern "C" {
         enum ggml_type    vec_dot_type;
     } ggml_type_traits_t;

-    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
+    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);

 #ifdef __cplusplus
 }
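The extended ggml_type_traits_t makes a type's name, block size, element size, and quantization flag queryable alongside its conversion callbacks. A minimal sketch, assuming the struct layout declared above, of quantizing and dequantizing one row through the traits (the element count and sample data are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include "ggml.h"

int main(void) {
    // Round-trip one row of floats through Q8_0 using the type traits.
    const ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q8_0);

    const int n = 64; // assumed to be a multiple of traits.blck_size (32 for Q8_0)
    float src[64], dst[64];
    for (int i = 0; i < n; ++i) {
        src[i] = 0.01f * (float) i;
    }

    // one type_size-sized block per blck_size elements
    void * quantized = malloc((size_t) (n / traits.blck_size) * traits.type_size);

    traits.from_float(src, quantized, n); // quantize
    traits.to_float(quantized, dst, n);   // dequantize

    printf("%s: src[5] = %.4f, round-tripped = %.4f\n", traits.type_name, src[5], dst[5]);
    free(quantized);
    return 0;
}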
62
gguf.py

@@ -5,7 +5,7 @@ import tempfile
 import numpy as np

 from enum import IntEnum, auto
-from typing import Any, IO, List
+from typing import Any, IO, List, Optional

 #
 # constants
@@ -45,7 +45,7 @@ KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"

 # RoPE
 KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
-KEY_ROPE_SCALE           = "{arch}.rope.scale"
+KEY_ROPE_SCALE_LINEAR    = "{arch}.rope.scale_linear"

 # tokenization
 KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
@@ -61,6 +61,7 @@ KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
 KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
 KEY_TOKENIZER_RWKV    = "tokenizer.rwkv.world"

+
 #
 # recommended mapping of model tensor names for storage in gguf
 #
@@ -319,6 +320,15 @@ def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:

     return tensor_map

+
+class TokenType(IntEnum):
+    NORMAL       = 1
+    UNKNOWN      = 2
+    CONTROL      = 3
+    USER_DEFINED = 4
+    UNUSED       = 5
+    BYTE         = 6
+
 #
 # implementation
 #
@@ -327,6 +337,18 @@ def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
 class GGMLQuantizationType(IntEnum):
     F32  = 0
     F16  = 1
+    Q4_0 = 2
+    Q4_1 = 3
+    Q5_0 = 6
+    Q5_1 = 7
+    Q8_0 = 8
+    Q8_1 = 9
+    Q2_K = 10
+    Q3_K = 11
+    Q4_K = 12
+    Q5_K = 13
+    Q6_K = 14
+    Q8_K = 15


 class GGUFValueType(IntEnum):
@@ -359,7 +381,7 @@ class GGUFValueType(IntEnum):


 class GGUFWriter:
-    def __init__(self, path: str, arch: str):
+    def __init__(self, path: str, arch: str, use_temp_file = True):
         self.fout = open(path, "wb")
         self.arch = arch
         self.offset_tensor = 0
@@ -369,6 +391,8 @@ class GGUFWriter:
         self.ti_data = b""
         self.ti_data_count = 0
         self.add_architecture()
+        self.use_temp_file = use_temp_file
+        self.tensors = []

     def write_header_to_file(self):
         self.fout.write(struct.pack("<I", GGUF_MAGIC))
@@ -476,8 +500,8 @@ class GGUFWriter:
     def ggml_pad(x: int, n: int) -> int:
         return ((x + n - 1) // n) * n

-    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int):
-        assert tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
+    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
+        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

         encoded_name = name.encode("utf8")
         self.ti_data += struct.pack("<I", len(encoded_name))
@@ -486,23 +510,30 @@ class GGUFWriter:
         self.ti_data += struct.pack("<I", n_dims)
         for i in range(n_dims):
             self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
-        dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+        if raw_dtype is None:
+            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+        else:
+            dtype = raw_dtype
         self.ti_data += struct.pack("<I", dtype)
         self.ti_data += struct.pack("<Q", self.offset_tensor)
         self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
         self.ti_data_count += 1

-    def add_tensor(self, name: str, tensor: np.ndarray):
-        if not hasattr(self, "temp_file"):
+    def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
+        if self.use_temp_file and not hasattr(self, "temp_file"):
             self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
             self.temp_file.seek(0)

-        self.add_tensor_info(name, tensor.shape, tensor.dtype, tensor.nbytes)
+        self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
+
+        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
+
+        if not self.use_temp_file:
+            self.tensors.append((tensor, pad))
+            return

         tensor.tofile(self.temp_file)

-        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
         if pad != 0:
             self.temp_file.write(bytes([0] * pad))

@@ -524,6 +555,13 @@ class GGUFWriter:
             if pad != 0:
                 self.fout.write(bytes([0] * pad))

+        if not self.use_temp_file:
+            for (currtensor, currpad) in self.tensors:
+                currtensor.tofile(self.fout)
+                if currpad != 0:
+                    self.fout.write(bytes([0] * currpad))
+            return
+
         self.temp_file.seek(0)

         shutil.copyfileobj(self.temp_file, self.fout)
@@ -620,8 +658,8 @@ class GGUFWriter:
         self.add_uint32(
             KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)

-    def add_rope_scale(self, value: float):
-        self.add_float32(KEY_ROPE_SCALE.format(arch=self.arch), value)
+    def add_rope_scale_linear(self, value: float):
+        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)

     def add_tokenizer_model(self, model: str):
         self.add_string(KEY_TOKENIZER_MODEL, model)
93
llama.cpp

@@ -771,11 +771,12 @@ struct llama_vocab {

     using id    = int32_t;
     using token = std::string;
+    using ttype = llama_token_type;

     struct token_data {
-        token tok;
+        token text;
         float score;
-        int toktype;
+        ttype type;
     };

     llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
@@ -1436,6 +1437,14 @@ static void llama_model_load_internal(
         hparams.n_head_kv = hparams.n_head;
         GGUF_GET(hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "llama.attention.head_count_kv");

+        // TODO: manually setting rope scale should override this
+        // rope_freq_scale (inverse of the kv) is optional
+        float ropescale = 1.0f;
+        GGUF_GET(ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, "llama.rope.scale_linear");
+        if (ropescale != 1.0f) {
+            rope_freq_scale = 1.0f/ropescale;
+        }
+
         // get general kv
         GGUF_GET(general_name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name");
         GGUF_GET(general_arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture");
@@ -1513,12 +1522,12 @@ static void llama_model_load_internal(
             vocab.token_to_id[word] = i;

             auto & token_data = vocab.id_to_token[i];
-            token_data.tok = std::move(word);
+            token_data.text = std::move(word);
             token_data.score = scores[i];
-            token_data.toktype = toktypes[i];
+            token_data.type = (llama_token_type) toktypes[i];

             // determine the newline token: 0x0A == 10 == '\n'
-            if (token_data.tok == "<0x0A>") {
+            if (token_data.text == "<0x0A>") {
                 vocab.linefeed_id = i;
             }
         }
@@ -1550,12 +1559,12 @@ static void llama_model_load_internal(
     LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, general_name.c_str());

     // special tokens
-    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].tok.c_str() ); }
-    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].tok.c_str() ); }
-    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].tok.c_str() ); }
-    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].tok.c_str() ); }
-    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].tok.c_str() ); }
-    if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token  = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].tok.c_str() ); }
+    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token  = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }

 if (vocab_only) {
@@ -2347,15 +2356,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
 }

 static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 1;
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
 }

 static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 2;
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
 }

 static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 3;
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
+}
+
+static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
+}
+
+static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
+}
+
+static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }

 static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
@@ -2373,22 +2394,10 @@ static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
     return id == vocab.special_pad_id;
 }

-static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 4;
-}
-
-static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 5;
-}
-
-static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 6;
-}
-
 static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
-    auto buf = token_data.tok.substr(3, 2);
+    auto buf = token_data.text.substr(3, 2);
     return strtol(buf.c_str(), NULL, 16);
 }

@@ -2701,6 +2710,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(

     bool found            = false;
     bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;

     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT

     do {
@@ -4949,25 +4959,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }

-int llama_get_vocab(
-        const struct llama_context * ctx,
-        const char * * strings,
-        float  * scores,
-        int capacity) {
-    return llama_model_get_vocab(&ctx->model, strings, scores, capacity);
+const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) {
+    return ctx->model.vocab.id_to_token[token].text.c_str();
 }

-int llama_model_get_vocab(
-        const struct llama_model * model,
-        const char * * strings,
-        float  * scores,
-        int capacity) {
-    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
-    for (int i = 0; i<n; ++i) {
-        strings[i] = model->vocab.id_to_token[i].tok.c_str();
-        scores[i]  = model->vocab.id_to_token[i].score;
-    }
-    return n;
+float llama_token_get_score(const struct llama_context * ctx, llama_token token) {
+    return ctx->model.vocab.id_to_token[token].score;
+}
+
+llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) {
+    return ctx->model.vocab.id_to_token[token].type;
 }

 llama_token llama_token_bos(const struct llama_context * ctx) {
@@ -5038,7 +5039,7 @@ int llama_token_to_str(const struct llama_context * ctx, llama_token token, char

 int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_model_n_vocab(&ctx->model)) {
-        std::string result = ctx->model.vocab.id_to_token[token].tok;
+        std::string result = ctx->model.vocab.id_to_token[token].text;
         if (length < (int) result.length()) {
             return -result.length();
         }
@@ -5052,7 +5053,7 @@ int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token,
 int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_model_n_vocab(model)) {
         if (llama_is_normal_token(model->vocab, token)) {
-            std::string result = model->vocab.id_to_token[token].tok;
+            std::string result = model->vocab.id_to_token[token].text;
             if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
                 result = llama_unescape_whitespace(result);
             }
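The converter now writes the linear RoPE scaling factor under llama.rope.scale_linear, and the loader above converts it to a frequency scale by taking its inverse. A small sketch of that mapping (the helper name is hypothetical):

// The GGUF "llama.rope.scale_linear" value (e.g. 2.0f for a model fine-tuned
// with 2x linear RoPE scaling) maps to the rope_freq_scale used at inference.
static float rope_freq_scale_from_linear(float scale_linear) {
    return scale_linear != 1.0f ? 1.0f / scale_linear : 1.0f;
}
// rope_freq_scale_from_linear(2.0f) == 0.5f, matching rope_freq_scale = 1.0f/ropescale above.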
26
llama.h

@@ -72,6 +72,16 @@ extern "C" {
         LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
     };

+    enum llama_token_type {
+        LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
+        LLAMA_TOKEN_TYPE_NORMAL       = 1,
+        LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
+        LLAMA_TOKEN_TYPE_CONTROL      = 3,
+        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
+        LLAMA_TOKEN_TYPE_UNUSED       = 5,
+        LLAMA_TOKEN_TYPE_BYTE         = 6,
+    };
+
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
@@ -330,19 +340,11 @@ extern "C" {
     // Vocab
     //

-    // Get the vocabulary as output parameters.
-    // Returns number of results.
-    LLAMA_API int llama_get_vocab(
-            const struct llama_context * ctx,
-            const char * * strings,
-            float  * scores,
-            int capacity);
-
-    LLAMA_API int llama_model_get_vocab(
-            const struct llama_model * model,
-            const char * * strings,
-            float  * scores,
-            int capacity);
+    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);

     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
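The vocabulary is now exposed one token at a time through llama_token_get_text, llama_token_get_score, and llama_token_get_type instead of the removed bulk getters. A minimal sketch of iterating the vocabulary with these accessors (the helper name is hypothetical and the context is assumed to hold an already-loaded model):

#include <stdio.h>
#include "llama.h"

// Dump the first few vocabulary entries using the new per-token accessors that
// replace the removed llama_get_vocab / llama_model_get_vocab output-parameter API.
static void dump_vocab_head(const struct llama_context * ctx, int count) {
    const int n_vocab = llama_n_vocab(ctx);
    if (count > n_vocab) {
        count = n_vocab;
    }
    for (llama_token id = 0; id < count; ++id) {
        printf("%6d  type=%d  score=%8.3f  '%s'\n",
               id,
               (int) llama_token_get_type(ctx, id),
               llama_token_get_score(ctx, id),
               llama_token_get_text(ctx, id));
    }
}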