Merge branch 'master' into concedo_experimental

# Conflicts:
#	Makefile
#	tests/test-tokenizer-0-falcon.py
#	tests/test-tokenizer-0-llama.py
This commit is contained in:
Concedo 2023-11-20 22:37:06 +08:00
commit 56a5fa7a60
21 changed files with 275 additions and 422 deletions

20
.github/workflows/python-lint.yml vendored Normal file
View file

@ -0,0 +1,20 @@
name: flake8 Lint
on: [push, pull_request]
jobs:
flake8-lint:
runs-on: ubuntu-latest
name: Lint
steps:
- name: Check out source repository
uses: actions/checkout@v3
- name: Set up Python environment
uses: actions/setup-python@v4
with:
python-version: "3.11"
- name: flake8 Lint
uses: py-actions/flake8@v2
with:
ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704"
exclude: "examples/*,examples/*/**,*/**/__init__.py"

1
.gitignore vendored
View file

@ -53,6 +53,7 @@ models-mnt
/speculative
/parallel
/train-text-from-scratch
/tokenize
/vdot
/common/build-info.cpp
arm_neon.h

View file

@ -492,6 +492,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.interactive_first = true;
} else if (arg == "-ins" || arg == "--instruct") {
params.instruct = true;
} else if (arg == "-cml" || arg == "--chatml") {
params.chatml = true;
} else if (arg == "--infill") {
params.infill = true;
} else if (arg == "--multiline-input") {
@ -731,6 +733,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -i, --interactive run in interactive mode\n");
printf(" --interactive-first run in interactive mode and wait for input right away\n");
printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n");
printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
printf(" -r PROMPT, --reverse-prompt PROMPT\n");
printf(" halt generation at PROMPT, return control in interactive mode\n");
@ -932,7 +935,7 @@ void llama_batch_add(
const std::vector<llama_seq_id> & seq_ids,
bool logits) {
batch.token [batch.n_tokens] = id;
batch.pos [batch.n_tokens] = pos,
batch.pos [batch.n_tokens] = pos;
batch.n_seq_id[batch.n_tokens] = seq_ids.size();
for (size_t i = 0; i < seq_ids.size(); ++i) {
batch.seq_id[batch.n_tokens][i] = seq_ids[i];

View file

@ -110,6 +110,7 @@ struct gpt_params {
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

View file

@ -1,315 +0,0 @@
#!/usr/bin/env python3
# HF baichuan --> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
if TYPE_CHECKING:
from typing import TypeAlias
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
# reverse HF permute back to original pth layout
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
r = weights.shape[0] // 3
return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
r = weights.shape[0] // 3
return weights[r * n_part : r * n_part + r, ...]
def count_model_parts(dir_model: str) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file, or model file itself (*.bin)",
)
parser.add_argument(
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
endianess = gguf.GGUFEndian.LITTLE
if args.bigendian:
endianess = gguf.GGUFEndian.BIG
endianess_str = "Big Endian" if args.bigendian else "Little Endian"
print(f"gguf: Conversion Endianess {endianess}")
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
print("hello print: ",hparams["architectures"][0])
if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
print(f"num_parts:{num_parts}\n")
ARCH=gguf.MODEL_ARCH.BAICHUAN
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]
if "num_key_value_heads" in hparams:
head_count_kv = hparams["num_key_value_heads"]
else:
head_count_kv = head_count
if "_name_or_path" in hparams:
hf_repo = hparams["_name_or_path"]
else:
hf_repo = ""
if "max_sequence_length" in hparams:
ctx_length = hparams["max_sequence_length"]
elif "max_position_embeddings" in hparams:
ctx_length = hparams["max_position_embeddings"]
elif "model_max_length" in hparams:
ctx_length = hparams["model_max_length"]
else:
print("gguf: can not find ctx length parameter.")
sys.exit()
gguf_writer.add_name(dir_model.name)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
if "type" in hparams["rope_scaling"]:
if hparams["rope_scaling"]["type"] == "linear":
gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
tokenizer_model_file = dir_model / 'tokenizer.model'
if not tokenizer_model_file.is_file():
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
sys.exit(1)
# vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
vocab_size = hparams.get('vocab_size')
if vocab_size is None:
vocab_size = tokenizer.vocab_size()
for i in range(vocab_size):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1 # defualt to normal token type
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
# toktype = 4 is user-defined = tokens from added_tokens.json
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
added_tokens_file = dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
addtokens_json = json.load(f)
print("gguf: get added tokens")
for key in addtokens_json:
tokens.append( key.encode("utf-8") )
scores.append(-1000.0)
toktypes.append(4) # user-defined token type
gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens))
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
tmp=model_part
for i in range(block_count):
if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
print(f"Unpacking and permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name.endswith(".rotary_emb.inv_freq"):
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

View file

@ -827,13 +827,14 @@ class StableLMModel(Model):
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"]*(hparams["hidden_size"] // hparams["num_attention_heads"])))
self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
self.gguf_writer.add_layer_norm_eps(1e-5)
###### CONVERSION LOGIC ######
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file")
parser.add_argument(

View file

@ -14,11 +14,13 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
class GGMLFormat(IntEnum):
GGML = 0
GGMF = 1
GGJT = 2
class GGMLFType(IntEnum):
ALL_F32 = 0
MOSTLY_F16 = 1
@ -38,6 +40,7 @@ class GGMLFType(IntEnum):
MOSTLY_Q5_K_M = 17
MOSTLY_Q6_K = 18
class Hyperparameters:
def __init__(self):
self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
@ -69,6 +72,7 @@ class Hyperparameters:
def __str__(self):
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
class Vocab:
def __init__(self, load_scores = True):
self.items = []
@ -90,6 +94,7 @@ class Vocab:
self.items.append((item_text, item_score))
return offset - orig_offset
class Tensor:
def __init__(self, use_padding = True):
self.name = None
@ -123,6 +128,7 @@ class Tensor:
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
return offset - orig_offset
class GGMLModel:
def __init__(self):
self.hyperparameters = None
@ -159,8 +165,8 @@ class GGMLModel:
if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
err = 'Q4 and Q8 quantizations changed in GGJTv3.'
if len(err) > 0:
raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
@ -187,6 +193,7 @@ class GGMLModel:
hp.set_n_ff(self)
return offset
class GGMLToGGUF:
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
hp = ggml_model.hyperparameters
@ -217,7 +224,7 @@ class GGMLToGGUF:
gguf_writer = gguf.GGUFWriter(
self.cfg.output,
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
use_temp_file = False )
use_temp_file = False)
self.add_params(gguf_writer)
self.add_vocab(gguf_writer)
if self.special_vocab is not None:
@ -341,7 +348,8 @@ class GGMLToGGUF:
mapped_name,
data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
raw_shape = tempdims,
raw_dtype = tensor.dtype )
raw_dtype = tensor.dtype)
def handle_metadata(cfg, hp):
import convert
@ -365,38 +373,40 @@ def handle_metadata(cfg, hp):
raise ValueError('Unable to load metadata')
vocab = convert.load_vocab(
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
cfg.vocabtype )
cfg.vocabtype)
# FIXME: Respect cfg.vocab_dir?
svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
load_merges = cfg.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
load_merges = cfg.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
convert.check_vocab_size(params, vocab)
return (params, vocab, svocab)
def handle_args():
parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
parser.add_argument('--input', '-i', type = Path, required = True,
help = 'Input GGMLv3 filename')
help = 'Input GGMLv3 filename')
parser.add_argument('--output', '-o', type = Path, required = True,
help ='Output GGUF filename')
help ='Output GGUF filename')
parser.add_argument('--name',
help = 'Set model name')
help = 'Set model name')
parser.add_argument('--desc',
help = 'Set model description')
help = 'Set model description')
parser.add_argument('--gqa', type = int, default = 1,
help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
parser.add_argument('--eps', default = '5.0e-06',
help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
parser.add_argument('--context-length', '-c', type=int, default = 2048,
help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
parser.add_argument('--model-metadata-dir', '-m', type = Path,
help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
parser.add_argument("--vocab-dir", type=Path,
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
return parser.parse_args()
def main():
cfg = handle_args()
print(f'* Using config: {cfg}')
@ -406,7 +416,7 @@ def main():
data = np.memmap(cfg.input, mode = 'r')
model = GGMLModel()
print('* Scanning GGML input file')
offset = model.load(data, 0)
offset = model.load(data, 0) # noqa
print(f'* GGML model hyperparameters: {model.hyperparameters}')
vocab_override = None
params_override = None
@ -421,12 +431,15 @@ def main():
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
if model.file_format == GGMLFormat.GGML:
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
converter = GGMLToGGUF(model, data, cfg,
converter = GGMLToGGUF(
model, data, cfg,
params_override = params_override,
vocab_override = vocab_override,
special_vocab = special_vocab )
special_vocab = special_vocab
)
converter.save()
print(f'* Successful completion. Output saved to: {cfg.output}')
if __name__ == '__main__':
main()

View file

@ -9,6 +9,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
def _flatten_dict(dct, tensors, prefix=None):
assert isinstance(dct, dict)
for key in dct.keys():
@ -21,6 +22,7 @@ def _flatten_dict(dct, tensors, prefix=None):
raise ValueError(type(dct[key]))
return None
def _get_sentencepiece_tokenizer_info(dir_model: Path):
tokenizer_path = dir_model / 'adept_vocab.model'
print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
@ -54,6 +56,7 @@ def _get_sentencepiece_tokenizer_info(dir_model: Path):
pass
return tokens, scores, toktypes
def main():
parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
@ -125,6 +128,5 @@ def main():
print("")
if __name__ == '__main__':
main()

60
convert.py Executable file → Normal file
View file

@ -46,6 +46,7 @@ DEFAULT_CONCURRENCY = 8
# data types
#
@dataclass(frozen=True)
class DataType:
name: str
@ -55,15 +56,18 @@ class DataType:
def elements_to_bytes(self, n_elements: int) -> int:
return n_elements * self.dtype.itemsize
@dataclass(frozen=True)
class UnquantizedDataType(DataType):
pass
DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
@dataclass(frozen=True)
class QuantizedDataType(DataType):
block_size: int
@ -77,6 +81,7 @@ class QuantizedDataType(DataType):
assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
return self.quantized_dtype.itemsize * (n_elements // self.block_size)
@dataclass(frozen=True)
class Q8_0QuantizedDataType(QuantizedDataType):
# Mini Q8_0 quantization in Python!
@ -86,6 +91,7 @@ class Q8_0QuantizedDataType(QuantizedDataType):
n_blocks = arr.size // self.block_size
blocks = arr.reshape((n_blocks, self.block_size))
# Much faster implementation of block quantization contributed by @Cebtenzzre
def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
d = abs(blocks).max(axis = 1) / np.float32(127)
with np.errstate(divide = 'ignore'):
@ -94,10 +100,11 @@ class Q8_0QuantizedDataType(QuantizedDataType):
yield from zip(d, qs)
return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
dtype = np.dtype(np.float32), valid_conversions = [],
ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
dtype = np.dtype(np.float32), valid_conversions = [],
ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
# Quantized types skipped here because they may also map to np.float32
NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
@ -116,6 +123,8 @@ SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
# TODO: match this with `llama_ftype`
# TODO: rename to LLAMAFileType
# TODO: move to `gguf.py`
class GGMLFileType(enum.IntEnum):
AllF32 = 0
MostlyF16 = 1 # except 1d tensors
@ -128,6 +137,7 @@ class GGMLFileType(enum.IntEnum):
# 1D tensors are always F32.
return dt if len(tensor.shape) > 1 else DT_F32
GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
GGMLFileType.AllF32 : DT_F32,
GGMLFileType.MostlyF16 : DT_F16,
@ -138,6 +148,7 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
# hparams loading
#
@dataclass
class Params:
n_vocab: int
@ -167,11 +178,11 @@ class Params:
# try transformer naming first
if "model.layers.0.self_attn.q_proj.weight" in model:
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
else:
n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
if n_layer < 1:
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
@ -308,7 +319,7 @@ class BpeVocab:
(item['content'], item['id'])
for item in tokenizer_json.get('added_tokens', [])
# Added tokens here can be duplicates of the main vocabulary.
if item['content'] not in self.bpe_tokenizer )
if item['content'] not in self.bpe_tokenizer)
vocab_size: int = len(self.bpe_tokenizer)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
@ -326,7 +337,6 @@ class BpeVocab:
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.bpe_tokenizer
from transformers.models.gpt2 import tokenization_gpt2
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
for i, _ in enumerate(tokenizer):
@ -406,6 +416,7 @@ class SentencePieceVocab:
def __repr__(self) -> str:
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
#
@ -413,13 +424,14 @@ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
# TODO: reuse (probably move to gguf.py?)
#
def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
#print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
# print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
.swapaxes(1, 2)
.reshape(weights.shape))
class Tensor(metaclass=ABCMeta):
@ -500,7 +512,7 @@ class LazyTensor:
ret = self._load()
# Should be okay if it maps to the same numpy type?
assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
(self.data_type, ret.data_type, self.description)
(self.data_type, ret.data_type, self.description)
return ret
def astype(self, data_type: DataType) -> LazyTensor:
@ -588,6 +600,7 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTe
return lazy_tensor.load().permute(n_head, n_head_kv)
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
def load() -> Tensor:
return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
@ -595,6 +608,7 @@ def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_
s[0] = s[0] // 3
return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
def load() -> Tensor:
return lazy_tensor.load().part(n_part)
@ -744,6 +758,7 @@ def lazy_load_file(path: Path) -> ModelPlus:
In = TypeVar('In')
Out = TypeVar('Out')
def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
'''Parallel map, but with backpressure. If the caller doesn't call `next`
fast enough, this will stop calling `func` at some point rather than
@ -778,6 +793,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
break
yield result
def check_vocab_size(params: Params, vocab: Vocab) -> None:
if params.n_vocab != vocab.vocab_size:
assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
@ -796,7 +812,7 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:
class OutputFile:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def add_meta_arch(self, params: Params) -> None:
@ -876,7 +892,7 @@ class OutputFile:
self.gguf.close()
@staticmethod
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
check_vocab_size(params, vocab)
of = OutputFile(fname_out, endianess=endianess)
@ -938,8 +954,9 @@ class OutputFile:
of.close()
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
return GGMLFileType.AllF32
@ -952,10 +969,12 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
raise Exception(f"Unexpected combination of types: {name_to_type}")
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
for (name, tensor) in model.items()}
def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
@ -968,7 +987,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
print(f"Permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
#tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
# tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
print(f"Unpacking and permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
@ -993,6 +1012,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
return out
def nth_multifile_path(path: Path, n: int) -> Path | None:
'''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
the nth path in the model.
@ -1174,8 +1194,8 @@ def main(args_in: list[str] | None = None) -> None:
# FIXME: Try to respect vocab_dir somehow?
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
load_merges = args.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
load_merges = args.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
outfile = args.outfile
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
print(f"Wrote {outfile}")
@ -1188,8 +1208,8 @@ def main(args_in: list[str] | None = None) -> None:
vocab = load_vocab(vocab_dir, args.vocabtype)
# FIXME: Try to respect vocab_dir somehow?
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
load_merges = args.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
load_merges = args.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
model = model_plus.model
model = convert_model_names(model, params)

View file

@ -147,6 +147,13 @@ int main(int argc, char ** argv) {
return 0;
}
if (params.chatml) {
printf("\n************\n");
printf("%s: please use the 'main' tool for chatml mode\n", __func__);
printf("************\n\n");
return 0;
}
if (!params.antiprompt.empty()) {
printf("\n************\n");
printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);

View file

@ -235,8 +235,11 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd_inp;
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
if (params.chatml) {
params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
}
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
} else {
LOG("use session tokens\n");
@ -314,7 +317,7 @@ int main(int argc, char ** argv) {
}
// number of tokens to keep when resetting context
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
params.n_keep = (int)embd_inp.size();
}
@ -325,11 +328,23 @@ int main(int argc, char ** argv) {
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
// chatml prefix & suffix
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true);
const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
// in instruct mode, we inject a prefix and a suffix to each input by the user
if (params.instruct) {
params.interactive_first = true;
params.antiprompt.push_back("### Instruction:\n\n");
}
// similar for chatml mode
else if (params.chatml) {
params.interactive_first = true;
params.antiprompt.push_back("<|im_start|>user\n");
}
// enable interactive mode if interactive start is specified
if (params.interactive_first) {
@ -706,7 +721,7 @@ int main(int argc, char ** argv) {
is_interacting = true;
printf("\n");
} else if (params.instruct) {
} else if (params.instruct || params.chatml) {
is_interacting = true;
}
}
@ -714,7 +729,7 @@ int main(int argc, char ** argv) {
if (n_past > 0 && is_interacting) {
LOG("waiting for user input\n");
if (params.instruct) {
if (params.instruct || params.chatml) {
printf("\n> ");
}
@ -761,6 +776,12 @@ int main(int argc, char ** argv) {
n_consumed = embd_inp.size();
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
}
// chatml mode: insert user chat prefix
if (params.chatml && !is_antiprompt) {
LOG("inserting chatml prefix\n");
n_consumed = embd_inp.size();
embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
}
if (params.escape) {
process_escapes(buffer);
}
@ -779,6 +800,11 @@ int main(int argc, char ** argv) {
LOG("inserting instruction suffix\n");
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
}
// chatml mode: insert assistant chat suffix
if (params.chatml) {
LOG("inserting chatml suffix\n");
embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end());
}
for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
@ -804,7 +830,7 @@ int main(int argc, char ** argv) {
}
// end of text token
if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) {
if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
LOG_TEE(" [end of text]\n");
break;
}

View file

@ -94,6 +94,10 @@ export async function* llama(prompt, params = {}, config = {}) {
break;
}
}
if (result.error) {
result.error = JSON.parse(result.error);
console.error(`llama.cpp error: ${result.error.content}`);
}
}
}
}

View file

@ -2369,6 +2369,17 @@ int main(int argc, char **argv)
break;
}
} else {
const std::string str =
"error: " +
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n";
LOG_VERBOSE("data stream", {
{ "to_send", str }
});
if (!sink.write(str.c_str(), str.size()))
{
return false;
}
break;
}
}

View file

@ -96,9 +96,22 @@ int main(int argc, char ** argv) {
}
}
// tokenize the prompt
// Tokenize the prompt
const bool add_bos_tgt = llama_should_add_bos_token(model_tgt);
LOG("add_bos tgt: %d\n", add_bos_tgt);
const bool add_bos_dft = llama_should_add_bos_token(model_dft);
LOG("add_bos dft: %d\n", add_bos_dft);
if (add_bos_tgt != add_bos_dft) {
fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__);
fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt);
return 1;
}
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
const int max_context_size = llama_n_ctx(ctx_tgt);
const int max_tokens_list_size = max_context_size - 4;

View file

@ -26,7 +26,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
const bool add_bos = true;
const bool add_bos = llama_should_add_bos_token(model);
std::vector<llama_token> tokens;

View file

@ -235,7 +235,7 @@ typedef float2 dfloat2;
#endif //GGML_CUDA_F16
static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
int x32 = 0;
x32 |= x16[0] << 0;
@ -245,7 +245,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
}
static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
int x32 = 0;
x32 |= x16[0] << 0;
@ -255,11 +255,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
}
static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
}
static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
}
template<typename T>
@ -469,7 +469,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
#define MUL_MAT_SRC1_COL_STRIDE 128
#define MAX_STREAMS 8
static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
struct ggml_tensor_extra_gpu {
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
@ -2249,6 +2249,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
(void)x_qh; (void)x_sc;
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
@ -2260,7 +2261,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
(void)x_qh; (void)x_sc;
GGML_CUDA_ASSUME(i_offset >= 0);
GGML_CUDA_ASSUME(i_offset < nwarps);
GGML_CUDA_ASSUME(k >= 0);
@ -2269,7 +2270,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const int kbx = k / QI4_0;
const int kqsx = k % QI4_0;
const block_q4_0 * bx0 = (block_q4_0 *) vx;
const block_q4_0 * bx0 = (const block_q4_0 *) vx;
float * x_dmf = (float *) x_dm;
@ -2307,9 +2308,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
(void)x_qh; (void)x_sc;
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
const float * x_dmf = (float *) x_dm;
const float * x_dmf = (const float *) x_dm;
int u[2*VDR_Q4_0_Q8_1_MMQ];
@ -2343,6 +2345,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
(void)x_qh; (void)x_sc;
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
@ -2354,6 +2357,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
(void)x_qh; (void)x_sc;
GGML_CUDA_ASSUME(i_offset >= 0);
GGML_CUDA_ASSUME(i_offset < nwarps);
@ -2363,7 +2367,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const int kbx = k / QI4_1;
const int kqsx = k % QI4_1;
const block_q4_1 * bx0 = (block_q4_1 *) vx;
const block_q4_1 * bx0 = (const block_q4_1 *) vx;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@ -2398,6 +2402,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
(void)x_qh; (void)x_sc;
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
@ -2435,6 +2440,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
(void)x_qh; (void)x_sc;
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
@ -2446,6 +2452,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
(void)x_qh; (void)x_sc;
GGML_CUDA_ASSUME(i_offset >= 0);
GGML_CUDA_ASSUME(i_offset < nwarps);
@ -2455,7 +2462,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const int kbx = k / QI5_0;
const int kqsx = k % QI5_0;
const block_q5_0 * bx0 = (block_q5_0 *) vx;
const block_q5_0 * bx0 = (const block_q5_0 *) vx;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@ -2510,6 +2517,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
(void)x_qh; (void)x_sc;
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
@ -2549,6 +2557,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
(void)x_qh; (void)x_sc;
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
@ -2560,6 +2569,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
(void)x_qh; (void)x_sc;
GGML_CUDA_ASSUME(i_offset >= 0);
GGML_CUDA_ASSUME(i_offset < nwarps);
@ -2569,7 +2579,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const int kbx = k / QI5_1;
const int kqsx = k % QI5_1;
const block_q5_1 * bx0 = (block_q5_1 *) vx;
const block_q5_1 * bx0 = (const block_q5_1 *) vx;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@ -2621,6 +2631,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
(void)x_qh; (void)x_sc;
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@ -2655,6 +2666,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
(void)x_qh; (void)x_sc;
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
@ -2666,6 +2678,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
(void)x_qh; (void)x_sc;
GGML_CUDA_ASSUME(i_offset >= 0);
GGML_CUDA_ASSUME(i_offset < nwarps);
@ -2676,7 +2689,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const int kqsx = k % QI8_0;
float * x_dmf = (float *) x_dm;
const block_q8_0 * bx0 = (block_q8_0 *) vx;
const block_q8_0 * bx0 = (const block_q8_0 *) vx;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@ -2711,6 +2724,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
(void)x_qh; (void)x_sc;
const float * x_dmf = (const float *) x_dm;
const float * y_df = (const float *) y_ds;
@ -2744,6 +2758,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
(void)x_qh;
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
@ -2757,6 +2772,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
(void)x_qh;
GGML_CUDA_ASSUME(i_offset >= 0);
GGML_CUDA_ASSUME(i_offset < nwarps);
@ -2766,7 +2782,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const int kbx = k / QI2_K;
const int kqsx = k % QI2_K;
const block_q2_K * bx0 = (block_q2_K *) vx;
const block_q2_K * bx0 = (const block_q2_K *) vx;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@ -2814,6 +2830,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
(void)x_qh;
const int kbx = k / QI2_K;
const int ky = (k % QI2_K) * QR2_K;
@ -2887,7 +2904,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const int kbx = k / QI3_K;
const int kqsx = k % QI3_K;
const block_q3_K * bx0 = (block_q3_K *) vx;
const block_q3_K * bx0 = (const block_q3_K *) vx;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@ -2968,7 +2985,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
const float * x_dmf = (const float *) x_dm;
const float * y_df = (const float *) y_ds;
const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
@ -3083,6 +3100,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
(void)x_qh;
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@ -3096,6 +3114,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
(void)x_qh;
GGML_CUDA_ASSUME(i_offset >= 0);
GGML_CUDA_ASSUME(i_offset < nwarps);
@ -3105,7 +3124,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const int kbx = k / QI4_K; // == 0 if QK_K == 256
const int kqsx = k % QI4_K; // == k if QK_K == 256
const block_q4_K * bx0 = (block_q4_K *) vx;
const block_q4_K * bx0 = (const block_q4_K *) vx;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@ -3150,7 +3169,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
const int * scales = (int *) bxi->scales;
const int * scales = (const int *) bxi->scales;
const int ksc = k % (WARP_SIZE/8);
@ -3165,6 +3184,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
(void)x_qh;
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
@ -3264,6 +3284,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
(void)x_qh;
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@ -3277,6 +3298,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
(void)x_qh;
GGML_CUDA_ASSUME(i_offset >= 0);
GGML_CUDA_ASSUME(i_offset < nwarps);
@ -3286,7 +3308,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const int kbx = k / QI5_K; // == 0 if QK_K == 256
const int kqsx = k % QI5_K; // == k if QK_K == 256
const block_q5_K * bx0 = (block_q5_K *) vx;
const block_q5_K * bx0 = (const block_q5_K *) vx;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@ -3342,7 +3364,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
const int * scales = (int *) bxi->scales;
const int * scales = (const int *) bxi->scales;
const int ksc = k % (WARP_SIZE/8);
@ -3357,6 +3379,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
(void)x_qh;
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
@ -3393,6 +3416,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
(void)x_qh;
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@ -3406,6 +3430,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
(void)x_qh;
GGML_CUDA_ASSUME(i_offset >= 0);
GGML_CUDA_ASSUME(i_offset < nwarps);
@ -3415,7 +3440,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const int kbx = k / QI6_K; // == 0 if QK_K == 256
const int kqsx = k % QI6_K; // == k if QK_K == 256
const block_q6_K * bx0 = (block_q6_K *) vx;
const block_q6_K * bx0 = (const block_q6_K *) vx;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@ -3477,6 +3502,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
(void)x_qh;
const float * x_dmf = (const float *) x_dm;
const float * y_df = (const float *) y_ds;
@ -3519,7 +3545,7 @@ static __device__ __forceinline__ void mul_mat_q(
__shared__ int tile_y_qs[mmq_x * WARP_SIZE];
__shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
@ -6012,18 +6038,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
if (nb0 == ts && nb1 == ts*ne0/bs) {
return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
} else if (nb0 == ts) {
return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
} else {
for (int64_t i1 = 0; i1 < i1_diff; i1++) {
const void * rx = (const void *) ((const char *) x + i1*nb1);
void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
// pretend the row is a matrix with cols=1
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
if (r != cudaSuccess) return r;
}
return cudaSuccess;
}
if (nb0 == ts) {
return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
}
for (int64_t i1 = 0; i1 < i1_diff; i1++) {
const void * rx = (const void *) ((const char *) x + i1*nb1);
void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
// pretend the row is a matrix with cols=1
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
if (r != cudaSuccess) { return r; }
}
return cudaSuccess;
}
static void ggml_cuda_op_repeat(
@ -6978,7 +7004,7 @@ static void ggml_cuda_op_mul_mat(
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t nrows0 = ggml_nrows(src0);
// const int64_t nrows0 = ggml_nrows(src0);
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
@ -7079,7 +7105,7 @@ static void ggml_cuda_op_mul_mat(
if (src0_on_device && src0_is_contiguous) {
src0_dd[id] = (char *) src0_extra->data_device[id];
} else {
const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
// const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
}
@ -7312,7 +7338,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
}
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
if (!g_cublas_loaded) return false;
if (!g_cublas_loaded) { return false; }
const int64_t ne10 = src1->ne[0];
@ -7390,7 +7416,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
}
__global__ void k_compute_batched_ptrs(
__global__ static void k_compute_batched_ptrs(
const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
const void ** ptrs_src, void ** ptrs_dst,
int ne12, int ne13,
@ -8013,7 +8039,7 @@ void ggml_cuda_free_scratch() {
}
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
if (!g_cublas_loaded) return false;
if (!g_cublas_loaded) { return false; }
ggml_cuda_func_t func;
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@ -8312,14 +8338,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
UNUSED(cgraph);
}
static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
[[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
GGML_ASSERT(!"not implemented");
UNUSED(backend);
UNUSED(plan);
}
static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
[[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
GGML_ASSERT(!"not implemented");
UNUSED(backend);
@ -8335,8 +8361,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
continue;
}
assert(node->backend == GGML_BACKEND_GPU);
for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j] != nullptr) {

View file

@ -56,20 +56,21 @@ class Keys:
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
class Tokenizer:
MODEL = "tokenizer.ggml.model"
LIST = "tokenizer.ggml.tokens"
TOKEN_TYPE = "tokenizer.ggml.token_type"
SCORES = "tokenizer.ggml.scores"
MERGES = "tokenizer.ggml.merges"
BOS_ID = "tokenizer.ggml.bos_token_id"
EOS_ID = "tokenizer.ggml.eos_token_id"
UNK_ID = "tokenizer.ggml.unknown_token_id"
SEP_ID = "tokenizer.ggml.seperator_token_id"
PAD_ID = "tokenizer.ggml.padding_token_id"
ADD_BOS = "tokenizer.ggml.add_bos_token"
ADD_EOS = "tokenizer.ggml.add_eos_token"
HF_JSON = "tokenizer.huggingface.json"
RWKV = "tokenizer.rwkv.world"
MODEL = "tokenizer.ggml.model"
LIST = "tokenizer.ggml.tokens"
TOKEN_TYPE = "tokenizer.ggml.token_type"
SCORES = "tokenizer.ggml.scores"
MERGES = "tokenizer.ggml.merges"
BOS_ID = "tokenizer.ggml.bos_token_id"
EOS_ID = "tokenizer.ggml.eos_token_id"
UNK_ID = "tokenizer.ggml.unknown_token_id"
SEP_ID = "tokenizer.ggml.seperator_token_id"
PAD_ID = "tokenizer.ggml.padding_token_id"
ADD_BOS = "tokenizer.ggml.add_bos_token"
ADD_EOS = "tokenizer.ggml.add_eos_token"
HF_JSON = "tokenizer.huggingface.json"
RWKV = "tokenizer.rwkv.world"
CHAT_TEMPLATE = "tokenizer.chat_template"
#

View file

@ -221,7 +221,7 @@ class GGUFWriter:
if self.endianess == GGUFEndian.BIG:
tensor.byteswap(inplace=True)
if self.use_temp_file and self.temp_file is None:
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024)
fp.seek(0)
self.temp_file = fp
@ -399,6 +399,9 @@ class GGUFWriter:
def add_add_eos_token(self, value: bool) -> None:
self.add_bool(Keys.Tokenizer.ADD_EOS, value)
def add_chat_template(self, value: str) -> None:
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
pack_prefix = ''
if not skip_pack_prefix:

View file

@ -13,6 +13,7 @@ class SpecialVocab:
merges: list[str]
add_special_token: dict[str, bool]
special_token_ids: dict[str, int]
chat_template: str | None
def __init__(
self, path: str | os.PathLike[str], load_merges: bool = False,
@ -24,6 +25,7 @@ class SpecialVocab:
self.n_vocab = n_vocab
self.load_merges = load_merges
self.merges = []
self.chat_template = None
if special_token_types is not None:
self.special_token_types = special_token_types
else:
@ -67,6 +69,10 @@ class SpecialVocab:
if not quiet:
print(f'gguf: Setting add_{typ}_token to {value}')
add_handler(value)
if self.chat_template is not None:
if not quiet:
print(f'gguf: Setting chat_template to {self.chat_template}')
gw.add_chat_template(self.chat_template)
def _load(self, path: Path) -> None:
self._try_load_from_tokenizer_json(path)
@ -132,6 +138,14 @@ class SpecialVocab:
return True
with open(tokenizer_config_file, encoding = 'utf-8') as f:
tokenizer_config = json.load(f)
chat_template = tokenizer_config.get('chat_template')
if chat_template is None or isinstance(chat_template, str):
self.chat_template = chat_template
else:
print(
f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring',
file = sys.stderr
)
for typ in self.special_token_types:
add_entry = tokenizer_config.get(f'add_{typ}_token')
if isinstance(add_entry, bool):

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
version = "0.5.3"
version = "0.6.0"
description = "Read and write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [

View file

@ -1882,6 +1882,7 @@ struct llama_model_loader {
if (value.size() > MAX_VALUE_LEN) {
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
}
replace_all(value, "\n", "\\n");
LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
}