commit 56a5fa7a60

Merge branch 'master' into concedo_experimental

# Conflicts:
#   Makefile
#   tests/test-tokenizer-0-falcon.py
#   tests/test-tokenizer-0-llama.py

21 changed files with 275 additions and 422 deletions
.github/workflows/python-lint.yml (vendored, new file, 20 lines)

@@ -0,0 +1,20 @@
name: flake8 Lint

on: [push, pull_request]

jobs:
  flake8-lint:
    runs-on: ubuntu-latest
    name: Lint
    steps:
      - name: Check out source repository
        uses: actions/checkout@v3
      - name: Set up Python environment
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"
      - name: flake8 Lint
        uses: py-actions/flake8@v2
        with:
          ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704"
          exclude: "examples/*,examples/*/**,*/**/__init__.py"
.gitignore (vendored, 1 change)

@@ -53,6 +53,7 @@ models-mnt
/speculative
/parallel
/train-text-from-scratch
/tokenize
/vdot
/common/build-info.cpp
arm_neon.h
@@ -492,6 +492,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            params.interactive_first = true;
        } else if (arg == "-ins" || arg == "--instruct") {
            params.instruct = true;
        } else if (arg == "-cml" || arg == "--chatml") {
            params.chatml = true;
        } else if (arg == "--infill") {
            params.infill = true;
        } else if (arg == "--multiline-input") {
@@ -731,6 +733,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -i, --interactive     run in interactive mode\n");
    printf("  --interactive-first   run in interactive mode and wait for input right away\n");
    printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
    printf("  -cml, --chatml        run in chatml mode (use with ChatML-compatible models)\n");
    printf("  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
    printf("  -r PROMPT, --reverse-prompt PROMPT\n");
    printf("                        halt generation at PROMPT, return control in interactive mode\n");
@@ -932,7 +935,7 @@ void llama_batch_add(
                 const std::vector<llama_seq_id> & seq_ids,
                                           bool   logits) {
    batch.token   [batch.n_tokens] = id;
    batch.pos     [batch.n_tokens] = pos,
    batch.pos     [batch.n_tokens] = pos;
    batch.n_seq_id[batch.n_tokens] = seq_ids.size();
    for (size_t i = 0; i < seq_ids.size(); ++i) {
        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
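For reference, the helper corrected above (a stray comma became a semicolon) is normally used to stage prompt tokens before a decode call. The following is a minimal sketch only: llama_batch_init, llama_batch_clear, llama_batch_add, llama_decode and llama_batch_free are the existing llama.cpp/common APIs shown or implied by the diff, while the wrapper function, its name and the error handling are assumptions made for illustration.

    // Sketch: stage a tokenized prompt in a llama_batch using the helper fixed above.
    // Everything outside the llama_* calls is illustrative.
    static bool decode_prompt(llama_context * ctx, const std::vector<llama_token> & tokens) {
        llama_batch batch = llama_batch_init((int32_t) tokens.size(), 0, 1);
        llama_batch_clear(batch);
        for (size_t i = 0; i < tokens.size(); ++i) {
            // all tokens go to sequence 0; request logits only for the last one
            llama_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, i + 1 == tokens.size());
        }
        const bool ok = llama_decode(ctx, batch) == 0;
        llama_batch_free(batch);
        return ok;
    }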
@@ -110,6 +110,7 @@ struct gpt_params {
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode
    bool chatml            = false; // chatml mode (used for models trained on chatml syntax)
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it
@ -1,315 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# HF baichuan --> gguf conversion
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any
|
||||
import numpy as np
|
||||
import torch
|
||||
from sentencepiece import SentencePieceProcessor # type: ignore[import]
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||
import gguf
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import TypeAlias
|
||||
|
||||
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
|
||||
|
||||
# reverse HF permute back to original pth layout
|
||||
|
||||
|
||||
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
|
||||
if n_kv_head is not None and n_head != n_kv_head:
|
||||
n_head //= n_kv_head
|
||||
|
||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(weights.shape))
|
||||
|
||||
def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
|
||||
r = weights.shape[0] // 3
|
||||
return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
|
||||
|
||||
def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
|
||||
r = weights.shape[0] // 3
|
||||
return weights[r * n_part : r * n_part + r, ...]
|
||||
|
||||
def count_model_parts(dir_model: str) -> int:
|
||||
num_parts = 0
|
||||
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
|
||||
return num_parts
|
||||
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
|
||||
parser.add_argument(
|
||||
"--vocab-only", action="store_true",
|
||||
help="extract only the vocab",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outfile", type=Path,
|
||||
help="path to write to; default: based on input",
|
||||
)
|
||||
parser.add_argument(
|
||||
"model", type=Path,
|
||||
help="directory containing model file, or model file itself (*.bin)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
||||
help="output format - use 0 for float32, 1 for float16",
|
||||
)
|
||||
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
ftype = args.ftype
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
endianess = gguf.GGUFEndian.LITTLE
|
||||
if args.bigendian:
|
||||
endianess = gguf.GGUFEndian.BIG
|
||||
endianess_str = "Big Endian" if args.bigendian else "Little Endian"
|
||||
print(f"gguf: Conversion Endianess {endianess}")
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
||||
|
||||
print("gguf: loading model "+dir_model.name)
|
||||
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
print("hello print: ",hparams["architectures"][0])
|
||||
if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit()
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
print(f"num_parts:{num_parts}\n")
|
||||
ARCH=gguf.MODEL_ARCH.BAICHUAN
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
head_count = hparams["num_attention_heads"]
|
||||
|
||||
if "num_key_value_heads" in hparams:
|
||||
head_count_kv = hparams["num_key_value_heads"]
|
||||
else:
|
||||
head_count_kv = head_count
|
||||
|
||||
if "_name_or_path" in hparams:
|
||||
hf_repo = hparams["_name_or_path"]
|
||||
else:
|
||||
hf_repo = ""
|
||||
|
||||
if "max_sequence_length" in hparams:
|
||||
ctx_length = hparams["max_sequence_length"]
|
||||
elif "max_position_embeddings" in hparams:
|
||||
ctx_length = hparams["max_position_embeddings"]
|
||||
elif "model_max_length" in hparams:
|
||||
ctx_length = hparams["model_max_length"]
|
||||
else:
|
||||
print("gguf: can not find ctx length parameter.")
|
||||
|
||||
sys.exit()
|
||||
|
||||
|
||||
gguf_writer.add_name(dir_model.name)
|
||||
gguf_writer.add_source_hf_repo(hf_repo)
|
||||
gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
gguf_writer.add_context_length(ctx_length)
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
|
||||
gguf_writer.add_head_count(head_count)
|
||||
gguf_writer.add_head_count_kv(head_count_kv)
|
||||
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
||||
|
||||
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
|
||||
if "type" in hparams["rope_scaling"]:
|
||||
if hparams["rope_scaling"]["type"] == "linear":
|
||||
gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
|
||||
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytes] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
tokenizer_model_file = dir_model / 'tokenizer.model'
|
||||
if not tokenizer_model_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# vocab type sentencepiece
|
||||
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
|
||||
|
||||
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
|
||||
vocab_size = hparams.get('vocab_size')
|
||||
if vocab_size is None:
|
||||
vocab_size = tokenizer.vocab_size()
|
||||
|
||||
for i in range(vocab_size):
|
||||
text: bytes
|
||||
score: float
|
||||
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.get_score(i)
|
||||
|
||||
toktype = 1 # defualt to normal token type
|
||||
if tokenizer.is_unknown(i):
|
||||
toktype = 2
|
||||
if tokenizer.is_control(i):
|
||||
toktype = 3
|
||||
|
||||
# toktype = 4 is user-defined = tokens from added_tokens.json
|
||||
|
||||
if tokenizer.is_unused(i):
|
||||
toktype = 5
|
||||
if tokenizer.is_byte(i):
|
||||
toktype = 6
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
added_tokens_file = dir_model / 'added_tokens.json'
|
||||
if added_tokens_file.is_file():
|
||||
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
||||
addtokens_json = json.load(f)
|
||||
|
||||
print("gguf: get added tokens")
|
||||
|
||||
for key in addtokens_json:
|
||||
tokens.append( key.encode("utf-8") )
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(4) # user-defined token type
|
||||
|
||||
|
||||
gguf_writer.add_tokenizer_model("llama")
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens))
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
|
||||
|
||||
for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
||||
|
||||
tmp=model_part
|
||||
for i in range(block_count):
|
||||
if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
|
||||
print(f"Unpacking and permuting layer {i}")
|
||||
tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
|
||||
tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
|
||||
tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
|
||||
del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
# we don't need these
|
||||
if name.endswith(".rotary_emb.inv_freq"):
|
||||
continue
|
||||
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
if not args.vocab_only:
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||
print("")
|
|
@@ -827,13 +827,14 @@ class StableLMModel(Model):
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"]*(hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
        self.gguf_writer.add_layer_norm_eps(1e-5)


###### CONVERSION LOGIC ######


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file")
    parser.add_argument(
@ -14,11 +14,13 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
|||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||
import gguf
|
||||
|
||||
|
||||
class GGMLFormat(IntEnum):
|
||||
GGML = 0
|
||||
GGMF = 1
|
||||
GGJT = 2
|
||||
|
||||
|
||||
class GGMLFType(IntEnum):
|
||||
ALL_F32 = 0
|
||||
MOSTLY_F16 = 1
|
||||
|
@ -38,6 +40,7 @@ class GGMLFType(IntEnum):
|
|||
MOSTLY_Q5_K_M = 17
|
||||
MOSTLY_Q6_K = 18
|
||||
|
||||
|
||||
class Hyperparameters:
|
||||
def __init__(self):
|
||||
self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
|
||||
|
@ -69,6 +72,7 @@ class Hyperparameters:
|
|||
def __str__(self):
|
||||
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
|
||||
|
||||
|
||||
class Vocab:
|
||||
def __init__(self, load_scores = True):
|
||||
self.items = []
|
||||
|
@ -90,6 +94,7 @@ class Vocab:
|
|||
self.items.append((item_text, item_score))
|
||||
return offset - orig_offset
|
||||
|
||||
|
||||
class Tensor:
|
||||
def __init__(self, use_padding = True):
|
||||
self.name = None
|
||||
|
@ -123,6 +128,7 @@ class Tensor:
|
|||
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
|
||||
return offset - orig_offset
|
||||
|
||||
|
||||
class GGMLModel:
|
||||
def __init__(self):
|
||||
self.hyperparameters = None
|
||||
|
@ -159,8 +165,8 @@ class GGMLModel:
|
|||
if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
|
||||
err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
|
||||
elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
|
||||
if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
|
||||
GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
|
||||
if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
|
||||
GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
|
||||
err = 'Q4 and Q8 quantizations changed in GGJTv3.'
|
||||
if len(err) > 0:
|
||||
raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
|
||||
|
@ -187,6 +193,7 @@ class GGMLModel:
|
|||
hp.set_n_ff(self)
|
||||
return offset
|
||||
|
||||
|
||||
class GGMLToGGUF:
|
||||
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
|
||||
hp = ggml_model.hyperparameters
|
||||
|
@ -217,7 +224,7 @@ class GGMLToGGUF:
|
|||
gguf_writer = gguf.GGUFWriter(
|
||||
self.cfg.output,
|
||||
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
|
||||
use_temp_file = False )
|
||||
use_temp_file = False)
|
||||
self.add_params(gguf_writer)
|
||||
self.add_vocab(gguf_writer)
|
||||
if self.special_vocab is not None:
|
||||
|
@ -341,7 +348,8 @@ class GGMLToGGUF:
|
|||
mapped_name,
|
||||
data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
|
||||
raw_shape = tempdims,
|
||||
raw_dtype = tensor.dtype )
|
||||
raw_dtype = tensor.dtype)
|
||||
|
||||
|
||||
def handle_metadata(cfg, hp):
|
||||
import convert
|
||||
|
@ -365,38 +373,40 @@ def handle_metadata(cfg, hp):
|
|||
raise ValueError('Unable to load metadata')
|
||||
vocab = convert.load_vocab(
|
||||
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
|
||||
cfg.vocabtype )
|
||||
cfg.vocabtype)
|
||||
# FIXME: Respect cfg.vocab_dir?
|
||||
svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
|
||||
load_merges = cfg.vocabtype == 'bpe',
|
||||
n_vocab = vocab.vocab_size)
|
||||
load_merges = cfg.vocabtype == 'bpe',
|
||||
n_vocab = vocab.vocab_size)
|
||||
convert.check_vocab_size(params, vocab)
|
||||
return (params, vocab, svocab)
|
||||
|
||||
|
||||
def handle_args():
|
||||
parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
|
||||
parser.add_argument('--input', '-i', type = Path, required = True,
|
||||
help = 'Input GGMLv3 filename')
|
||||
help = 'Input GGMLv3 filename')
|
||||
parser.add_argument('--output', '-o', type = Path, required = True,
|
||||
help ='Output GGUF filename')
|
||||
help ='Output GGUF filename')
|
||||
parser.add_argument('--name',
|
||||
help = 'Set model name')
|
||||
help = 'Set model name')
|
||||
parser.add_argument('--desc',
|
||||
help = 'Set model description')
|
||||
help = 'Set model description')
|
||||
parser.add_argument('--gqa', type = int, default = 1,
|
||||
help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
|
||||
help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
|
||||
parser.add_argument('--eps', default = '5.0e-06',
|
||||
help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
|
||||
help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
|
||||
parser.add_argument('--context-length', '-c', type=int, default = 2048,
|
||||
help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
|
||||
help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
|
||||
parser.add_argument('--model-metadata-dir', '-m', type = Path,
|
||||
help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
|
||||
help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
|
||||
parser.add_argument("--vocab-dir", type=Path,
|
||||
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
|
||||
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
|
||||
parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
|
||||
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
|
||||
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
cfg = handle_args()
|
||||
print(f'* Using config: {cfg}')
|
||||
|
@ -406,7 +416,7 @@ def main():
|
|||
data = np.memmap(cfg.input, mode = 'r')
|
||||
model = GGMLModel()
|
||||
print('* Scanning GGML input file')
|
||||
offset = model.load(data, 0)
|
||||
offset = model.load(data, 0) # noqa
|
||||
print(f'* GGML model hyperparameters: {model.hyperparameters}')
|
||||
vocab_override = None
|
||||
params_override = None
|
||||
|
@ -421,12 +431,15 @@ def main():
|
|||
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
|
||||
if model.file_format == GGMLFormat.GGML:
|
||||
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
|
||||
converter = GGMLToGGUF(model, data, cfg,
|
||||
converter = GGMLToGGUF(
|
||||
model, data, cfg,
|
||||
params_override = params_override,
|
||||
vocab_override = vocab_override,
|
||||
special_vocab = special_vocab )
|
||||
special_vocab = special_vocab
|
||||
)
|
||||
converter.save()
|
||||
print(f'* Successful completion. Output saved to: {cfg.output}')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
|
@@ -9,6 +9,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf


def _flatten_dict(dct, tensors, prefix=None):
    assert isinstance(dct, dict)
    for key in dct.keys():

@@ -21,6 +22,7 @@ def _flatten_dict(dct, tensors, prefix=None):
            raise ValueError(type(dct[key]))
    return None


def _get_sentencepiece_tokenizer_info(dir_model: Path):
    tokenizer_path = dir_model / 'adept_vocab.model'
    print('gguf: getting sentencepiece tokenizer from', tokenizer_path)

@@ -54,6 +56,7 @@ def _get_sentencepiece_tokenizer_info(dir_model: Path):
        pass
    return tokens, scores, toktypes


def main():
    parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")

@@ -125,6 +128,5 @@ def main():
    print("")



if __name__ == '__main__':
    main()
convert.py (60 changes, Executable file → Normal file)
|
@ -46,6 +46,7 @@ DEFAULT_CONCURRENCY = 8
|
|||
# data types
|
||||
#
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class DataType:
|
||||
name: str
|
||||
|
@ -55,15 +56,18 @@ class DataType:
|
|||
def elements_to_bytes(self, n_elements: int) -> int:
|
||||
return n_elements * self.dtype.itemsize
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class UnquantizedDataType(DataType):
|
||||
pass
|
||||
|
||||
|
||||
DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
|
||||
DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
|
||||
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
|
||||
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class QuantizedDataType(DataType):
|
||||
block_size: int
|
||||
|
@ -77,6 +81,7 @@ class QuantizedDataType(DataType):
|
|||
assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
|
||||
return self.quantized_dtype.itemsize * (n_elements // self.block_size)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Q8_0QuantizedDataType(QuantizedDataType):
|
||||
# Mini Q8_0 quantization in Python!
|
||||
|
@ -86,6 +91,7 @@ class Q8_0QuantizedDataType(QuantizedDataType):
|
|||
n_blocks = arr.size // self.block_size
|
||||
blocks = arr.reshape((n_blocks, self.block_size))
|
||||
# Much faster implementation of block quantization contributed by @Cebtenzzre
|
||||
|
||||
def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
|
||||
d = abs(blocks).max(axis = 1) / np.float32(127)
|
||||
with np.errstate(divide = 'ignore'):
|
||||
|
@ -94,10 +100,11 @@ class Q8_0QuantizedDataType(QuantizedDataType):
|
|||
yield from zip(d, qs)
|
||||
return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
|
||||
|
||||
|
||||
DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
|
||||
dtype = np.dtype(np.float32), valid_conversions = [],
|
||||
ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
|
||||
quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
|
||||
dtype = np.dtype(np.float32), valid_conversions = [],
|
||||
ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
|
||||
quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
|
||||
|
||||
# Quantized types skipped here because they may also map to np.float32
|
||||
NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
|
||||
|
@ -116,6 +123,8 @@ SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
|
|||
# TODO: match this with `llama_ftype`
|
||||
# TODO: rename to LLAMAFileType
|
||||
# TODO: move to `gguf.py`
|
||||
|
||||
|
||||
class GGMLFileType(enum.IntEnum):
|
||||
AllF32 = 0
|
||||
MostlyF16 = 1 # except 1d tensors
|
||||
|
@ -128,6 +137,7 @@ class GGMLFileType(enum.IntEnum):
|
|||
# 1D tensors are always F32.
|
||||
return dt if len(tensor.shape) > 1 else DT_F32
|
||||
|
||||
|
||||
GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
|
||||
GGMLFileType.AllF32 : DT_F32,
|
||||
GGMLFileType.MostlyF16 : DT_F16,
|
||||
|
@ -138,6 +148,7 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
|
|||
# hparams loading
|
||||
#
|
||||
|
||||
|
||||
@dataclass
|
||||
class Params:
|
||||
n_vocab: int
|
||||
|
@ -167,11 +178,11 @@ class Params:
|
|||
|
||||
# try transformer naming first
|
||||
if "model.layers.0.self_attn.q_proj.weight" in model:
|
||||
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
|
||||
n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
|
||||
elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
|
||||
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
|
||||
n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
|
||||
else:
|
||||
n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
|
||||
n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
|
||||
|
||||
if n_layer < 1:
|
||||
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
|
||||
|
@ -308,7 +319,7 @@ class BpeVocab:
|
|||
(item['content'], item['id'])
|
||||
for item in tokenizer_json.get('added_tokens', [])
|
||||
# Added tokens here can be duplicates of the main vocabulary.
|
||||
if item['content'] not in self.bpe_tokenizer )
|
||||
if item['content'] not in self.bpe_tokenizer)
|
||||
|
||||
vocab_size: int = len(self.bpe_tokenizer)
|
||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
||||
|
@ -326,7 +337,6 @@ class BpeVocab:
|
|||
|
||||
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
tokenizer = self.bpe_tokenizer
|
||||
from transformers.models.gpt2 import tokenization_gpt2
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
|
||||
|
||||
for i, _ in enumerate(tokenizer):
|
||||
|
@ -406,6 +416,7 @@ class SentencePieceVocab:
|
|||
def __repr__(self) -> str:
|
||||
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||
|
||||
|
||||
Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
|
||||
|
||||
#
|
||||
|
@ -413,13 +424,14 @@ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
|
|||
# TODO: reuse (probably move to gguf.py?)
|
||||
#
|
||||
|
||||
|
||||
def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
|
||||
#print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
|
||||
# print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
|
||||
if n_head_kv is not None and n_head != n_head_kv:
|
||||
n_head = n_head_kv
|
||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(weights.shape))
|
||||
.swapaxes(1, 2)
|
||||
.reshape(weights.shape))
|
||||
|
||||
|
||||
class Tensor(metaclass=ABCMeta):
|
||||
|
@ -500,7 +512,7 @@ class LazyTensor:
|
|||
ret = self._load()
|
||||
# Should be okay if it maps to the same numpy type?
|
||||
assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
|
||||
(self.data_type, ret.data_type, self.description)
|
||||
(self.data_type, ret.data_type, self.description)
|
||||
return ret
|
||||
|
||||
def astype(self, data_type: DataType) -> LazyTensor:
|
||||
|
@ -588,6 +600,7 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTe
|
|||
return lazy_tensor.load().permute(n_head, n_head_kv)
|
||||
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
|
||||
|
||||
|
||||
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
|
||||
def load() -> Tensor:
|
||||
return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
|
||||
|
@ -595,6 +608,7 @@ def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_
|
|||
s[0] = s[0] // 3
|
||||
return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
|
||||
|
||||
|
||||
def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
|
||||
def load() -> Tensor:
|
||||
return lazy_tensor.load().part(n_part)
|
||||
|
@ -744,6 +758,7 @@ def lazy_load_file(path: Path) -> ModelPlus:
|
|||
In = TypeVar('In')
|
||||
Out = TypeVar('Out')
|
||||
|
||||
|
||||
def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
|
||||
'''Parallel map, but with backpressure. If the caller doesn't call `next`
|
||||
fast enough, this will stop calling `func` at some point rather than
|
||||
|
@ -778,6 +793,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
|
|||
break
|
||||
yield result
|
||||
|
||||
|
||||
def check_vocab_size(params: Params, vocab: Vocab) -> None:
|
||||
if params.n_vocab != vocab.vocab_size:
|
||||
assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
|
||||
|
@ -796,7 +812,7 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:
|
|||
|
||||
|
||||
class OutputFile:
|
||||
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
|
||||
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
|
||||
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
|
||||
|
||||
def add_meta_arch(self, params: Params) -> None:
|
||||
|
@ -876,7 +892,7 @@ class OutputFile:
|
|||
self.gguf.close()
|
||||
|
||||
@staticmethod
|
||||
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
|
||||
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
|
||||
check_vocab_size(params, vocab)
|
||||
|
||||
of = OutputFile(fname_out, endianess=endianess)
|
||||
|
@ -938,8 +954,9 @@ class OutputFile:
|
|||
|
||||
of.close()
|
||||
|
||||
|
||||
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
|
||||
wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
|
||||
wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type
|
||||
|
||||
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
|
||||
return GGMLFileType.AllF32
|
||||
|
@ -952,10 +969,12 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
|
|||
|
||||
raise Exception(f"Unexpected combination of types: {name_to_type}")
|
||||
|
||||
|
||||
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
|
||||
return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
|
||||
for (name, tensor) in model.items()}
|
||||
|
||||
|
||||
def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
|
||||
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
|
||||
should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
|
||||
|
@ -968,7 +987,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
|
|||
print(f"Permuting layer {i}")
|
||||
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
|
||||
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
|
||||
#tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
|
||||
# tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
|
||||
elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
|
||||
print(f"Unpacking and permuting layer {i}")
|
||||
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
|
||||
|
@ -993,6 +1012,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
|
|||
|
||||
return out
|
||||
|
||||
|
||||
def nth_multifile_path(path: Path, n: int) -> Path | None:
|
||||
'''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
|
||||
the nth path in the model.
|
||||
|
@ -1174,8 +1194,8 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
# FIXME: Try to respect vocab_dir somehow?
|
||||
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
|
||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
|
||||
load_merges = args.vocabtype == 'bpe',
|
||||
n_vocab = vocab.vocab_size)
|
||||
load_merges = args.vocabtype == 'bpe',
|
||||
n_vocab = vocab.vocab_size)
|
||||
outfile = args.outfile
|
||||
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
|
||||
print(f"Wrote {outfile}")
|
||||
|
@ -1188,8 +1208,8 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
vocab = load_vocab(vocab_dir, args.vocabtype)
|
||||
# FIXME: Try to respect vocab_dir somehow?
|
||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
|
||||
load_merges = args.vocabtype == 'bpe',
|
||||
n_vocab = vocab.vocab_size)
|
||||
load_merges = args.vocabtype == 'bpe',
|
||||
n_vocab = vocab.vocab_size)
|
||||
|
||||
model = model_plus.model
|
||||
model = convert_model_names(model, params)
|
||||
|
|
|
@@ -147,6 +147,13 @@ int main(int argc, char ** argv) {

        return 0;
    }
    if (params.chatml) {
        printf("\n************\n");
        printf("%s: please use the 'main' tool for chatml mode\n", __func__);
        printf("************\n\n");

        return 0;
    }
    if (!params.antiprompt.empty()) {
        printf("\n************\n");
        printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
@@ -235,8 +235,11 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> embd_inp;

    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
    if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
        LOG("tokenize the prompt\n");
        if (params.chatml) {
            params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
        }
        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
    } else {
        LOG("use session tokens\n");

@@ -314,7 +317,7 @@ int main(int argc, char ** argv) {
    }

    // number of tokens to keep when resetting context
    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
        params.n_keep = (int)embd_inp.size();
    }

@@ -325,11 +328,23 @@ int main(int argc, char ** argv) {
    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());

    // chatml prefix & suffix
    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true);
    const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);

    LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
    LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());

    // in instruct mode, we inject a prefix and a suffix to each input by the user
    if (params.instruct) {
        params.interactive_first = true;
        params.antiprompt.push_back("### Instruction:\n\n");
    }
    // similar for chatml mode
    else if (params.chatml) {
        params.interactive_first = true;
        params.antiprompt.push_back("<|im_start|>user\n");
    }

    // enable interactive mode if interactive start is specified
    if (params.interactive_first) {

@@ -706,7 +721,7 @@ int main(int argc, char ** argv) {

                is_interacting = true;
                printf("\n");
            } else if (params.instruct) {
            } else if (params.instruct || params.chatml) {
                is_interacting = true;
            }
        }

@@ -714,7 +729,7 @@ int main(int argc, char ** argv) {
        if (n_past > 0 && is_interacting) {
            LOG("waiting for user input\n");

            if (params.instruct) {
            if (params.instruct || params.chatml) {
                printf("\n> ");
            }

@@ -761,6 +776,12 @@ int main(int argc, char ** argv) {
                    n_consumed = embd_inp.size();
                    embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
                }
                // chatml mode: insert user chat prefix
                if (params.chatml && !is_antiprompt) {
                    LOG("inserting chatml prefix\n");
                    n_consumed = embd_inp.size();
                    embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
                }
                if (params.escape) {
                    process_escapes(buffer);
                }

@@ -779,6 +800,11 @@ int main(int argc, char ** argv) {
                    LOG("inserting instruction suffix\n");
                    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
                }
                // chatml mode: insert assistant chat suffix
                if (params.chatml) {
                    LOG("inserting chatml suffix\n");
                    embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end());
                }

                for (size_t i = original_size; i < embd_inp.size(); ++i) {
                    const llama_token token = embd_inp[i];

@@ -804,7 +830,7 @@ int main(int argc, char ** argv) {
        }

        // end of text token
        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) {
        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
            LOG_TEE(" [end of text]\n");
            break;
        }
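Taken together, the chatml branches above wrap the conversation in ChatML markers: the system prompt is wrapped once at startup, then a user prefix and an assistant suffix are inserted around each interactive turn. A minimal sketch of the resulting template, using only the string literals from the hunks above (the helper itself and its name are illustrative, not part of the change):

    #include <string>

    // Sketch of the ChatML framing produced by the new -cml/--chatml code path.
    // Only the <|im_start|>/<|im_end|> literals come from the diff; the helper is illustrative.
    static std::string chatml_wrap(const std::string & system_prompt, const std::string & user_msg) {
        std::string out;
        out += "<|im_start|>system\n" + system_prompt + "<|im_end|>";  // initial prompt wrapping
        out += "\n<|im_start|>user\n" + user_msg;                      // cml_pfx + user input
        out += "<|im_end|>\n<|im_start|>assistant\n";                  // cml_sfx, model replies here
        return out;
    }

The same "<|im_start|>user\n" string is also pushed as an antiprompt, so generation halts before the model starts writing the next user turn.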
@@ -94,6 +94,10 @@ export async function* llama(prompt, params = {}, config = {}) {
            break;
          }
        }
        if (result.error) {
          result.error = JSON.parse(result.error);
          console.error(`llama.cpp error: ${result.error.content}`);
        }
      }
    }
  }
@@ -2369,6 +2369,17 @@ int main(int argc, char **argv)
                        break;
                    }
                } else {
                    const std::string str =
                        "error: " +
                        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
                        "\n\n";
                    LOG_VERBOSE("data stream", {
                        { "to_send", str }
                    });
                    if (!sink.write(str.c_str(), str.size()))
                    {
                        return false;
                    }
                    break;
                }
            }
@@ -96,9 +96,22 @@ int main(int argc, char ** argv) {
        }
    }

    // tokenize the prompt

    // Tokenize the prompt
    const bool add_bos_tgt = llama_should_add_bos_token(model_tgt);
    LOG("add_bos tgt: %d\n", add_bos_tgt);

    const bool add_bos_dft = llama_should_add_bos_token(model_dft);
    LOG("add_bos dft: %d\n", add_bos_dft);

    if (add_bos_tgt != add_bos_dft) {
        fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__);
        fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt);
        return 1;
    }

    std::vector<llama_token> inp;
    inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
    inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);

    const int max_context_size     = llama_n_ctx(ctx_tgt);
    const int max_tokens_list_size = max_context_size - 4;
@@ -26,7 +26,7 @@ int main(int argc, char ** argv) {
    llama_context_params ctx_params = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

    const bool add_bos = true;
    const bool add_bos = llama_should_add_bos_token(model);

    std::vector<llama_token> tokens;
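The tokenize and speculative hunks apply the same pattern: query the model for its BOS convention instead of hard-coding it. A hedged sketch, where llama_should_add_bos_token and the common llama_tokenize overload are the helpers already used in the hunks above and the wrapper function is an assumption for illustration:

    // Sketch: tokenize a prompt, letting the model decide whether a BOS token is prepended.
    // The wrapper function is illustrative only.
    static std::vector<llama_token> tokenize_prompt(llama_context * ctx, const llama_model * model, const std::string & prompt) {
        const bool add_bos = llama_should_add_bos_token(model);
        return ::llama_tokenize(ctx, prompt, add_bos, /*special*/ true);
    }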
ggml-cuda.cu (107 changes)
|
@ -235,7 +235,7 @@ typedef float2 dfloat2;
|
|||
#endif //GGML_CUDA_F16
|
||||
|
||||
static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
|
||||
const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
|
||||
const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
|
||||
|
||||
int x32 = 0;
|
||||
x32 |= x16[0] << 0;
|
||||
|
@ -245,7 +245,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
|
|||
}
|
||||
|
||||
static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
|
||||
const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
|
||||
const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
|
||||
|
||||
int x32 = 0;
|
||||
x32 |= x16[0] << 0;
|
||||
|
@ -255,11 +255,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
|
|||
}
|
||||
|
||||
static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
|
||||
return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
|
||||
return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
|
||||
return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
|
||||
return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
|
@ -469,7 +469,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
|
|||
#define MUL_MAT_SRC1_COL_STRIDE 128
|
||||
|
||||
#define MAX_STREAMS 8
|
||||
static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
|
||||
static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
|
||||
|
||||
struct ggml_tensor_extra_gpu {
|
||||
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
|
||||
|
@ -2249,6 +2249,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
|
|||
}
|
||||
|
||||
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
||||
(void)x_qh; (void)x_sc;
|
||||
|
||||
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
|
||||
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
|
||||
|
@ -2260,7 +2261,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
|
|||
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
|
||||
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
||||
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
||||
|
||||
(void)x_qh; (void)x_sc;
|
||||
GGML_CUDA_ASSUME(i_offset >= 0);
|
||||
GGML_CUDA_ASSUME(i_offset < nwarps);
|
||||
GGML_CUDA_ASSUME(k >= 0);
|
||||
|
@ -2269,7 +2270,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
|||
const int kbx = k / QI4_0;
|
||||
const int kqsx = k % QI4_0;
|
||||
|
||||
const block_q4_0 * bx0 = (block_q4_0 *) vx;
|
||||
const block_q4_0 * bx0 = (const block_q4_0 *) vx;
|
||||
|
||||
float * x_dmf = (float *) x_dm;
|
||||
|
||||
|
@ -2307,9 +2308,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
|||
static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
|
||||
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
||||
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
||||
(void)x_qh; (void)x_sc;
|
||||
|
||||
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
||||
const float * x_dmf = (float *) x_dm;
|
||||
const float * x_dmf = (const float *) x_dm;
|
||||
|
||||
int u[2*VDR_Q4_0_Q8_1_MMQ];
|
||||
|
||||
|
@ -2343,6 +2345,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
|
|||
}
|
||||
|
||||
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
||||
(void)x_qh; (void)x_sc;
|
||||
|
||||
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
|
||||
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
|
||||
|
@ -2354,6 +2357,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
|
|||
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
|
||||
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
||||
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
||||
(void)x_qh; (void)x_sc;
|
||||
|
||||
GGML_CUDA_ASSUME(i_offset >= 0);
|
||||
GGML_CUDA_ASSUME(i_offset < nwarps);
|
||||
|
@ -2363,7 +2367,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
|||
const int kbx = k / QI4_1;
|
||||
const int kqsx = k % QI4_1;
|
||||
|
||||
const block_q4_1 * bx0 = (block_q4_1 *) vx;
|
||||
const block_q4_1 * bx0 = (const block_q4_1 *) vx;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
||||
|
@ -2398,6 +2402,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
|||
static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
|
||||
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
||||
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
||||
(void)x_qh; (void)x_sc;
|
||||
|
||||
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
||||
|
||||
|
@ -2435,6 +2440,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
|
|||
}
|
||||
|
||||
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
||||
(void)x_qh; (void)x_sc;
|
||||
|
||||
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
||||
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
|
||||
|
@ -2446,6 +2452,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
|
|||
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
|
||||
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
||||
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
||||
(void)x_qh; (void)x_sc;
|
||||
|
||||
GGML_CUDA_ASSUME(i_offset >= 0);
|
||||
GGML_CUDA_ASSUME(i_offset < nwarps);
|
||||
|
@ -2455,7 +2462,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
|||
const int kbx = k / QI5_0;
|
||||
const int kqsx = k % QI5_0;
|
||||
|
||||
const block_q5_0 * bx0 = (block_q5_0 *) vx;
|
||||
const block_q5_0 * bx0 = (const block_q5_0 *) vx;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
||||
|
@ -2510,6 +2517,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
|||
static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
|
||||
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
||||
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
||||
(void)x_qh; (void)x_sc;
|
||||
|
||||
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
||||
const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
|
||||
|
@ -2549,6 +2557,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
|
|||
}
|
||||
|
||||
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
||||
(void)x_qh; (void)x_sc;
|
||||
|
||||
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
||||
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
|
||||
|
@ -2560,6 +2569,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
|
|||
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
|
||||
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
||||
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
||||
(void)x_qh; (void)x_sc;
|
||||
|
||||
GGML_CUDA_ASSUME(i_offset >= 0);
|
||||
GGML_CUDA_ASSUME(i_offset < nwarps);
|
||||
|
@ -2569,7 +2579,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
|||
const int kbx = k / QI5_1;
|
||||
const int kqsx = k % QI5_1;
|
||||
|
||||
const block_q5_1 * bx0 = (block_q5_1 *) vx;
|
||||
const block_q5_1 * bx0 = (const block_q5_1 *) vx;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
||||
|
@ -2621,6 +2631,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
|||
static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
|
||||
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
||||
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
||||
(void)x_qh; (void)x_sc;
|
||||
|
||||
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
||||
const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
|
||||
|
@@ -2655,6 +2666,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
@@ -2666,6 +2678,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2676,7 +2689,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kqsx = k % QI8_0;
     float * x_dmf = (float *) x_dm;
 
-    const block_q8_0 * bx0 = (block_q8_0 *) vx;
+    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2711,6 +2724,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
@@ -2744,6 +2758,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
@@ -2757,6 +2772,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2766,7 +2782,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI2_K;
     const int kqsx = k % QI2_K;
 
-    const block_q2_K * bx0 = (block_q2_K *) vx;
+    const block_q2_K * bx0 = (const block_q2_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2814,6 +2830,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const int kbx = k / QI2_K;
     const int ky = (k % QI2_K) * QR2_K;
@@ -2887,7 +2904,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI3_K;
     const int kqsx = k % QI3_K;
 
-    const block_q3_K * bx0 = (block_q3_K *) vx;
+    const block_q3_K * bx0 = (const block_q3_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2968,7 +2985,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
 
-    const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
 
     int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
 
@@ -3083,6 +3100,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@@ -3096,6 +3114,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3105,7 +3124,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_K; // == 0 if QK_K == 256
     const int kqsx = k % QI4_K; // == k if QK_K == 256
 
-    const block_q4_K * bx0 = (block_q4_K *) vx;
+    const block_q4_K * bx0 = (const block_q4_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3150,7 +3169,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
 
-        const int * scales = (int *) bxi->scales;
+        const int * scales = (const int *) bxi->scales;
 
         const int ksc = k % (WARP_SIZE/8);
 
@@ -3165,6 +3184,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
 
@@ -3264,6 +3284,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@@ -3277,6 +3298,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3286,7 +3308,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_K; // == 0 if QK_K == 256
     const int kqsx = k % QI5_K; // == k if QK_K == 256
 
-    const block_q5_K * bx0 = (block_q5_K *) vx;
+    const block_q5_K * bx0 = (const block_q5_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3342,7 +3364,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
 
-        const int * scales = (int *) bxi->scales;
+        const int * scales = (const int *) bxi->scales;
 
         const int ksc = k % (WARP_SIZE/8);
 
@@ -3357,6 +3379,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
 
@@ -3393,6 +3416,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@@ -3406,6 +3430,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3415,7 +3440,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI6_K; // == 0 if QK_K == 256
     const int kqsx = k % QI6_K; // == k if QK_K == 256
 
-    const block_q6_K * bx0 = (block_q6_K *) vx;
+    const block_q6_K * bx0 = (const block_q6_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3477,6 +3502,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
@@ -3519,7 +3545,7 @@ static __device__ __forceinline__ void mul_mat_q(
     __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
     __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
 
-    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
 
     for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
 
@@ -6012,18 +6038,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
         return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
-    } else if (nb0 == ts) {
-        return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
-    } else {
-        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
-            const void * rx = (const void *) ((const char *) x + i1*nb1);
-            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
-            // pretend the row is a matrix with cols=1
-            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
-            if (r != cudaSuccess) return r;
-        }
-        return cudaSuccess;
     }
+    if (nb0 == ts) {
+        return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
+    }
+    for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+        const void * rx = (const void *) ((const char *) x + i1*nb1);
+        void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+        // pretend the row is a matrix with cols=1
+        cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+        if (r != cudaSuccess) { return r; }
+    }
+    return cudaSuccess;
 }
 
 static void ggml_cuda_op_repeat(
@@ -6978,7 +7004,7 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-    const int64_t nrows0 = ggml_nrows(src0);
+    // const int64_t nrows0 = ggml_nrows(src0);
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
@@ -7079,7 +7105,7 @@ static void ggml_cuda_op_mul_mat(
         if (src0_on_device && src0_is_contiguous) {
             src0_dd[id] = (char *) src0_extra->data_device[id];
         } else {
-            const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+            // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
             src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
         }
 
@@ -7312,7 +7338,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }
 
     const int64_t ne10 = src1->ne[0];
 
@@ -7390,7 +7416,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-__global__ void k_compute_batched_ptrs(
+__global__ static void k_compute_batched_ptrs(
     const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
     const void ** ptrs_src, void ** ptrs_dst,
     int ne12, int ne13,
@@ -8013,7 +8039,7 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }
 
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8312,14 +8338,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
     UNUSED(cgraph);
 }
 
-static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
     UNUSED(plan);
 }
 
-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
@@ -8335,8 +8361,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
             continue;
+        }
         assert(node->backend == GGML_BACKEND_GPU);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
@@ -56,20 +56,21 @@ class Keys:
         SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
 
     class Tokenizer:
-        MODEL      = "tokenizer.ggml.model"
-        LIST       = "tokenizer.ggml.tokens"
-        TOKEN_TYPE = "tokenizer.ggml.token_type"
-        SCORES     = "tokenizer.ggml.scores"
-        MERGES     = "tokenizer.ggml.merges"
-        BOS_ID     = "tokenizer.ggml.bos_token_id"
-        EOS_ID     = "tokenizer.ggml.eos_token_id"
-        UNK_ID     = "tokenizer.ggml.unknown_token_id"
-        SEP_ID     = "tokenizer.ggml.seperator_token_id"
-        PAD_ID     = "tokenizer.ggml.padding_token_id"
-        ADD_BOS    = "tokenizer.ggml.add_bos_token"
-        ADD_EOS    = "tokenizer.ggml.add_eos_token"
-        HF_JSON    = "tokenizer.huggingface.json"
-        RWKV       = "tokenizer.rwkv.world"
+        MODEL         = "tokenizer.ggml.model"
+        LIST          = "tokenizer.ggml.tokens"
+        TOKEN_TYPE    = "tokenizer.ggml.token_type"
+        SCORES        = "tokenizer.ggml.scores"
+        MERGES        = "tokenizer.ggml.merges"
+        BOS_ID        = "tokenizer.ggml.bos_token_id"
+        EOS_ID        = "tokenizer.ggml.eos_token_id"
+        UNK_ID        = "tokenizer.ggml.unknown_token_id"
+        SEP_ID        = "tokenizer.ggml.seperator_token_id"
+        PAD_ID        = "tokenizer.ggml.padding_token_id"
+        ADD_BOS       = "tokenizer.ggml.add_bos_token"
+        ADD_EOS       = "tokenizer.ggml.add_eos_token"
+        HF_JSON       = "tokenizer.huggingface.json"
+        RWKV          = "tokenizer.rwkv.world"
+        CHAT_TEMPLATE = "tokenizer.chat_template"
 
 
 #
@@ -221,7 +221,7 @@ class GGUFWriter:
         if self.endianess == GGUFEndian.BIG:
             tensor.byteswap(inplace=True)
         if self.use_temp_file and self.temp_file is None:
-            fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
+            fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024)
             fp.seek(0)
             self.temp_file = fp
 
@@ -399,6 +399,9 @@ class GGUFWriter:
     def add_add_eos_token(self, value: bool) -> None:
        self.add_bool(Keys.Tokenizer.ADD_EOS, value)
 
+    def add_chat_template(self, value: str) -> None:
+        self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
+
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
         pack_prefix = ''
         if not skip_pack_prefix:
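
Note: together with the CHAT_TEMPLATE key added to Keys.Tokenizer above, GGUFWriter gains an add_chat_template() helper that stores a template string under "tokenizer.chat_template". A minimal usage sketch in Python follows; the output path, architecture, and template text are made-up examples, only the key and the writer method come from this diff:

    # Sketch: write a chat template into a GGUF file via the new helper.
    from gguf import GGUFWriter  # gguf-py package from this repository

    writer = GGUFWriter("model.gguf", arch="llama")  # hypothetical output file and arch
    writer.add_chat_template(
        "{% for message in messages %}"              # made-up ChatML-style Jinja template
        "<|im_start|>{{ message.role }}\n{{ message.content }}<|im_end|>\n"
        "{% endfor %}"
    )
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.close()
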
@@ -13,6 +13,7 @@ class SpecialVocab:
     merges: list[str]
     add_special_token: dict[str, bool]
     special_token_ids: dict[str, int]
+    chat_template: str | None
 
     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
@@ -24,6 +25,7 @@ class SpecialVocab:
         self.n_vocab = n_vocab
         self.load_merges = load_merges
         self.merges = []
+        self.chat_template = None
         if special_token_types is not None:
             self.special_token_types = special_token_types
         else:
@@ -67,6 +69,10 @@ class SpecialVocab:
             if not quiet:
                 print(f'gguf: Setting add_{typ}_token to {value}')
             add_handler(value)
+        if self.chat_template is not None:
+            if not quiet:
+                print(f'gguf: Setting chat_template to {self.chat_template}')
+            gw.add_chat_template(self.chat_template)
 
     def _load(self, path: Path) -> None:
         self._try_load_from_tokenizer_json(path)
@@ -132,6 +138,14 @@ class SpecialVocab:
             return True
         with open(tokenizer_config_file, encoding = 'utf-8') as f:
             tokenizer_config = json.load(f)
+        chat_template = tokenizer_config.get('chat_template')
+        if chat_template is None or isinstance(chat_template, str):
+            self.chat_template = chat_template
+        else:
+            print(
+                f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring',
+                file = sys.stderr
+            )
         for typ in self.special_token_types:
             add_entry = tokenizer_config.get(f'add_{typ}_token')
             if isinstance(add_entry, bool):
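
Note: the SpecialVocab changes above route the chat template through the regular conversion path: the tokenizer_config.json loader now records its chat_template field (warning on non-string values), and add_to_gguf() forwards it via gw.add_chat_template(). A rough sketch of how a conversion script might rely on this; the model directory and output name are hypothetical, while the classes and methods are the ones shown in the diff:

    # Sketch: let SpecialVocab pick up chat_template from a HF model directory.
    from gguf import GGUFWriter, SpecialVocab

    writer = GGUFWriter("model.gguf", arch="llama")                     # hypothetical output
    special_vocab = SpecialVocab("path/to/hf-model", load_merges=True)  # dir containing tokenizer_config.json
    special_vocab.add_to_gguf(writer)  # also emits tokenizer.chat_template when one was found
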
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.5.3"
+version = "0.6.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
@@ -1882,6 +1882,7 @@ struct llama_model_loader {
             if (value.size() > MAX_VALUE_LEN) {
                 value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
             }
+            replace_all(value, "\n", "\\n");
 
             LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
         }