Merge branch 'master' into concedo_experimental

# Conflicts:
#	Makefile
#	tests/test-tokenizer-0-falcon.py
#	tests/test-tokenizer-0-llama.py

commit 56a5fa7a60: 21 changed files with 275 additions and 422 deletions
20  .github/workflows/python-lint.yml  (vendored, new file)

@@ -0,0 +1,20 @@
name: flake8 Lint

on: [push, pull_request]

jobs:
  flake8-lint:
    runs-on: ubuntu-latest
    name: Lint
    steps:
      - name: Check out source repository
        uses: actions/checkout@v3
      - name: Set up Python environment
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"
      - name: flake8 Lint
        uses: py-actions/flake8@v2
        with:
          ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704"
          exclude: "examples/*,examples/*/**,*/**/__init__.py"
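To reproduce this CI check locally before pushing, here is a minimal sketch using flake8's legacy Python API with the same ignore and exclude lists; the file names passed to check_files are placeholders, and invoking the flake8 CLI directly with matching --ignore/--exclude flags works just as well.

# Minimal local sketch of the workflow's flake8 check (assumes `pip install flake8`).
from flake8.api import legacy as flake8

style_guide = flake8.get_style_guide(
    ignore=["E203", "E211", "E221", "E225", "E231", "E241",
            "E251", "E261", "E266", "E501", "E701", "E704"],
    exclude=["examples/*", "examples/*/**", "*/**/__init__.py"],
)
# File list is illustrative only; point it at whichever scripts you touched.
report = style_guide.check_files(["convert.py", "convert-llama-ggml-to-gguf.py"])
print("violations:", report.total_errors)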
1  .gitignore  (vendored)

@@ -53,6 +53,7 @@ models-mnt
 /speculative
 /parallel
 /train-text-from-scratch
+/tokenize
 /vdot
 /common/build-info.cpp
 arm_neon.h
@@ -492,6 +492,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             params.interactive_first = true;
         } else if (arg == "-ins" || arg == "--instruct") {
             params.instruct = true;
+        } else if (arg == "-cml" || arg == "--chatml") {
+            params.chatml = true;
         } else if (arg == "--infill") {
             params.infill = true;
         } else if (arg == "--multiline-input") {

@@ -731,6 +733,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -i, --interactive     run in interactive mode\n");
     printf("  --interactive-first   run in interactive mode and wait for input right away\n");
     printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
+    printf("  -cml, --chatml        run in chatml mode (use with ChatML-compatible models)\n");
     printf("  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
     printf("  -r PROMPT, --reverse-prompt PROMPT\n");
     printf("                        halt generation at PROMPT, return control in interactive mode\n");

@@ -932,7 +935,7 @@ void llama_batch_add(
         const std::vector<llama_seq_id> & seq_ids,
         bool logits) {
     batch.token   [batch.n_tokens] = id;
-    batch.pos     [batch.n_tokens] = pos,
+    batch.pos     [batch.n_tokens] = pos;
     batch.n_seq_id[batch.n_tokens] = seq_ids.size();
     for (size_t i = 0; i < seq_ids.size(); ++i) {
         batch.seq_id[batch.n_tokens][i] = seq_ids[i];

@@ -110,6 +110,7 @@ struct gpt_params {
     bool random_prompt    = false; // do not randomize prompt if none provided
     bool use_color        = false; // use color to distinguish generations and inputs
     bool interactive      = false; // interactive mode
+    bool chatml           = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro  = false; // open the prompt cache read-only and do not update it
@@ -1,315 +0,0 @@  (whole file removed; the deleted HF Baichuan -> GGUF conversion script follows)

#!/usr/bin/env python3
# HF baichuan --> gguf conversion

from __future__ import annotations

import argparse
import json
import os
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor  # type: ignore[import]

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf


if TYPE_CHECKING:
    from typing import TypeAlias

NDArray: TypeAlias = 'np.ndarray[Any, Any]'

# reverse HF permute back to original pth layout


def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
    if n_kv_head is not None and n_head != n_kv_head:
        n_head //= n_kv_head

    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))

def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
    r = weights.shape[0] // 3
    return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))

def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
    r = weights.shape[0] // 3
    return weights[r * n_part : r * n_part + r, ...]

def count_model_parts(dir_model: str) -> int:
    num_parts = 0

    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")

    return num_parts


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file, or model file itself (*.bin)",
    )
    parser.add_argument(
        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
        help="output format - use 0 for float32, 1 for float16",
    )
    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
    return parser.parse_args()

args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

endianess = gguf.GGUFEndian.LITTLE
if args.bigendian:
    endianess = gguf.GGUFEndian.BIG
endianess_str = "Big Endian" if args.bigendian else "Little Endian"
print(f"gguf: Conversion Endianess {endianess}")
# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model "+dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)
print("hello print: ",hparams["architectures"][0])
if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit()

# get number of model parts
num_parts = count_model_parts(dir_model)
print(f"num_parts:{num_parts}\n")
ARCH=gguf.MODEL_ARCH.BAICHUAN
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

print("gguf: get model metadata")

block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]

if "num_key_value_heads" in hparams:
    head_count_kv = hparams["num_key_value_heads"]
else:
    head_count_kv = head_count

if "_name_or_path" in hparams:
    hf_repo = hparams["_name_or_path"]
else:
    hf_repo = ""

if "max_sequence_length" in hparams:
    ctx_length = hparams["max_sequence_length"]
elif "max_position_embeddings" in hparams:
    ctx_length = hparams["max_position_embeddings"]
elif "model_max_length" in hparams:
    ctx_length = hparams["model_max_length"]
else:
    print("gguf: can not find ctx length parameter.")
    sys.exit()


gguf_writer.add_name(dir_model.name)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])

if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
    if "type" in hparams["rope_scaling"]:
        if hparams["rope_scaling"]["type"] == "linear":
            gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
            gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])


# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []

tokenizer_model_file = dir_model / 'tokenizer.model'
if not tokenizer_model_file.is_file():
    print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
    sys.exit(1)

# vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab, scores and token types")

tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
vocab_size = hparams.get('vocab_size')
if vocab_size is None:
    vocab_size = tokenizer.vocab_size()

for i in range(vocab_size):
    text: bytes
    score: float

    piece = tokenizer.id_to_piece(i)
    text = piece.encode("utf-8")
    score = tokenizer.get_score(i)

    toktype = 1  # defualt to normal token type
    if tokenizer.is_unknown(i):
        toktype = 2
    if tokenizer.is_control(i):
        toktype = 3

    # toktype = 4 is user-defined = tokens from added_tokens.json

    if tokenizer.is_unused(i):
        toktype = 5
    if tokenizer.is_byte(i):
        toktype = 6

    tokens.append(text)
    scores.append(score)
    toktypes.append(toktype)

added_tokens_file = dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
    with open(added_tokens_file, "r", encoding="utf-8") as f:
        addtokens_json = json.load(f)

        print("gguf: get added tokens")

        for key in addtokens_json:
            tokens.append( key.encode("utf-8") )
            scores.append(-1000.0)
            toktypes.append(4)  # user-defined token type


gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens))
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )


for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")

    tmp=model_part
    for i in range(block_count):
        if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
            print(f"Unpacking and permuting layer {i}")
            tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
            tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
            tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
            del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]

    for name in model_part.keys():
        data = model_part[name]
        # we don't need these
        if name.endswith(".rotary_emb.inv_freq"):
            continue

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
        gguf_writer.add_tensor(new_name, data)


print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")
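The heart of the removed script is the W_pack handling: Baichuan checkpoints store the fused QKV projection as a single W_pack tensor, which the converter slices into thirds and re-permutes (Q and K only) back to the original pth layout. Below is a self-contained numpy sketch of that slicing, using a deliberately tiny made-up shape; it mirrors the deleted helpers but is illustrative only.

import numpy as np

def reverse_hf_permute(weights, n_head, n_kv_head=None):
    # Undo the HF interleaving of the rotary halves (same math as the deleted script).
    if n_kv_head is not None and n_head != n_kv_head:
        n_head //= n_kv_head
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

# Hypothetical tiny example: 4 heads, hidden size 8 (head_dim 2).
n_head, hidden = 4, 8
w_pack = np.arange(3 * hidden * hidden, dtype=np.float32).reshape(3 * hidden, hidden)

r = w_pack.shape[0] // 3
q = reverse_hf_permute(w_pack[0 * r:1 * r], n_head, n_head)  # query part, re-permuted
k = reverse_hf_permute(w_pack[1 * r:2 * r], n_head, n_head)  # key part, re-permuted
v = w_pack[2 * r:3 * r]                                      # value part, copied as-is
print(q.shape, k.shape, v.shape)  # (8, 8) (8, 8) (8, 8)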
@@ -827,13 +827,14 @@ class StableLMModel(Model):
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"]*(hparams["hidden_size"] // hparams["num_attention_heads"])))
+        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
         self.gguf_writer.add_head_count(hparams["num_attention_heads"])
         self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
         self.gguf_writer.add_layer_norm_eps(1e-5)

 ###### CONVERSION LOGIC ######

+
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file")
     parser.add_argument(
@@ -14,11 +14,13 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf

+
 class GGMLFormat(IntEnum):
     GGML = 0
     GGMF = 1
     GGJT = 2

+
 class GGMLFType(IntEnum):
     ALL_F32 = 0
     MOSTLY_F16 = 1

@@ -38,6 +40,7 @@ class GGMLFType(IntEnum):
     MOSTLY_Q5_K_M = 17
     MOSTLY_Q6_K = 18

+
 class Hyperparameters:
     def __init__(self):
         self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0

@@ -69,6 +72,7 @@ class Hyperparameters:
     def __str__(self):
         return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'

+
 class Vocab:
     def __init__(self, load_scores = True):
         self.items = []

@@ -90,6 +94,7 @@ class Vocab:
             self.items.append((item_text, item_score))
         return offset - orig_offset

+
 class Tensor:
     def __init__(self, use_padding = True):
         self.name = None

@@ -123,6 +128,7 @@ class Tensor:
         # print(n_dims, name_len, dtype, self.dims, self.name, pad)
         return offset - orig_offset

+
 class GGMLModel:
     def __init__(self):
         self.hyperparameters = None

@@ -159,8 +165,8 @@ class GGMLModel:
             if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
                 err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
         elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
-            if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
-                          GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
+            if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
+                         GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
                 err = 'Q4 and Q8 quantizations changed in GGJTv3.'
         if len(err) > 0:
             raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')

@@ -187,6 +193,7 @@ class GGMLModel:
         hp.set_n_ff(self)
         return offset

+
 class GGMLToGGUF:
     def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
         hp = ggml_model.hyperparameters

@@ -217,7 +224,7 @@ class GGMLToGGUF:
         gguf_writer = gguf.GGUFWriter(
             self.cfg.output,
             gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
-            use_temp_file = False )
+            use_temp_file = False)
         self.add_params(gguf_writer)
         self.add_vocab(gguf_writer)
         if self.special_vocab is not None:

@@ -341,7 +348,8 @@ class GGMLToGGUF:
             mapped_name,
             data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
             raw_shape = tempdims,
-            raw_dtype = tensor.dtype )
+            raw_dtype = tensor.dtype)
+

 def handle_metadata(cfg, hp):
     import convert

@@ -365,38 +373,40 @@ def handle_metadata(cfg, hp):
         raise ValueError('Unable to load metadata')
     vocab = convert.load_vocab(
         cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
-        cfg.vocabtype )
+        cfg.vocabtype)
     # FIXME: Respect cfg.vocab_dir?
     svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
         load_merges = cfg.vocabtype == 'bpe',
        n_vocab = vocab.vocab_size)
     convert.check_vocab_size(params, vocab)
     return (params, vocab, svocab)

+
 def handle_args():
     parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
     parser.add_argument('--input', '-i', type = Path, required = True,
         help = 'Input GGMLv3 filename')
     parser.add_argument('--output', '-o', type = Path, required = True,
         help ='Output GGUF filename')
     parser.add_argument('--name',
         help = 'Set model name')
     parser.add_argument('--desc',
         help = 'Set model description')
     parser.add_argument('--gqa', type = int, default = 1,
         help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
     parser.add_argument('--eps', default = '5.0e-06',
         help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
     parser.add_argument('--context-length', '-c', type=int, default = 2048,
         help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
     parser.add_argument('--model-metadata-dir', '-m', type = Path,
         help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
     parser.add_argument("--vocab-dir", type=Path,
         help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
     parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
         help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
     return parser.parse_args()

+
 def main():
     cfg = handle_args()
     print(f'* Using config: {cfg}')

@@ -406,7 +416,7 @@ def main():
     data = np.memmap(cfg.input, mode = 'r')
     model = GGMLModel()
     print('* Scanning GGML input file')
-    offset = model.load(data, 0)
+    offset = model.load(data, 0)  # noqa
     print(f'* GGML model hyperparameters: {model.hyperparameters}')
     vocab_override = None
     params_override = None

@@ -421,12 +431,15 @@ def main():
         print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
     if model.file_format == GGMLFormat.GGML:
         print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
-    converter = GGMLToGGUF(model, data, cfg,
+    converter = GGMLToGGUF(
+        model, data, cfg,
         params_override = params_override,
         vocab_override = vocab_override,
-        special_vocab = special_vocab )
+        special_vocab = special_vocab
+    )
     converter.save()
     print(f'* Successful completion. Output saved to: {cfg.output}')

+
 if __name__ == '__main__':
     main()
@@ -9,6 +9,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf

+
 def _flatten_dict(dct, tensors, prefix=None):
     assert isinstance(dct, dict)
     for key in dct.keys():

@@ -21,6 +22,7 @@ def _flatten_dict(dct, tensors, prefix=None):
             raise ValueError(type(dct[key]))
     return None

+
 def _get_sentencepiece_tokenizer_info(dir_model: Path):
     tokenizer_path = dir_model / 'adept_vocab.model'
     print('gguf: getting sentencepiece tokenizer from', tokenizer_path)

@@ -54,6 +56,7 @@ def _get_sentencepiece_tokenizer_info(dir_model: Path):
         pass
     return tokens, scores, toktypes

+
 def main():
     parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")

@@ -125,6 +128,5 @@ def main():
     print("")

-

 if __name__ == '__main__':
     main()
60  convert.py  (Executable file → Normal file)

@@ -46,6 +46,7 @@ DEFAULT_CONCURRENCY = 8
 # data types
 #

+
 @dataclass(frozen=True)
 class DataType:
     name: str

@@ -55,15 +56,18 @@ class DataType:
     def elements_to_bytes(self, n_elements: int) -> int:
         return n_elements * self.dtype.itemsize

+
 @dataclass(frozen=True)
 class UnquantizedDataType(DataType):
     pass

+
 DT_F16  = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
 DT_F32  = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
 DT_I32  = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
 DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])

+
 @dataclass(frozen=True)
 class QuantizedDataType(DataType):
     block_size: int

@@ -77,6 +81,7 @@ class QuantizedDataType(DataType):
         assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
         return self.quantized_dtype.itemsize * (n_elements // self.block_size)

+
 @dataclass(frozen=True)
 class Q8_0QuantizedDataType(QuantizedDataType):
     # Mini Q8_0 quantization in Python!

@@ -86,6 +91,7 @@ class Q8_0QuantizedDataType(QuantizedDataType):
         n_blocks = arr.size // self.block_size
         blocks = arr.reshape((n_blocks, self.block_size))
         # Much faster implementation of block quantization contributed by @Cebtenzzre
+
         def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
             d = abs(blocks).max(axis = 1) / np.float32(127)
             with np.errstate(divide = 'ignore'):

@@ -94,10 +100,11 @@ class Q8_0QuantizedDataType(QuantizedDataType):
             yield from zip(d, qs)
         return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)

+
 DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
     dtype = np.dtype(np.float32), valid_conversions = [],
     ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
     quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))

 # Quantized types skipped here because they may also map to np.float32
 NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
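For reference, the Q8_0 path above stores each 32-element block as an f16 scale d = max(|x|) / 127 plus 32 int8 quants. Here is a standalone numpy sketch of the same idea; it is illustrative only and not the exact convert.py code path.

import numpy as np

def quantize_q8_0(arr, block_size=32):
    # arr length must be a multiple of block_size (the same invariant convert.py asserts).
    blocks = arr.astype(np.float32).reshape(-1, block_size)
    d = np.abs(blocks).max(axis=1) / np.float32(127)           # per-block scale
    with np.errstate(divide="ignore", invalid="ignore"):
        qs = np.where(d[:, None] == 0, 0, blocks / d[:, None])  # guard all-zero blocks
    qs = np.round(qs).clip(-127, 127).astype(np.int8)
    out_dtype = np.dtype([("d", "<f2"), ("qs", "i1", (block_size,))])
    return np.fromiter(zip(d, qs), count=len(blocks), dtype=out_dtype)

q = quantize_q8_0(np.random.randn(64).astype(np.float32))
print(q["d"].shape, q["qs"].shape)  # (2,) (2, 32)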
@@ -116,6 +123,8 @@ SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
 # TODO: match this with `llama_ftype`
 # TODO: rename to LLAMAFileType
 # TODO: move to `gguf.py`
+
+
 class GGMLFileType(enum.IntEnum):
     AllF32 = 0
     MostlyF16 = 1  # except 1d tensors

@@ -128,6 +137,7 @@ class GGMLFileType(enum.IntEnum):
         # 1D tensors are always F32.
         return dt if len(tensor.shape) > 1 else DT_F32

+
 GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
     GGMLFileType.AllF32    : DT_F32,
     GGMLFileType.MostlyF16 : DT_F16,

@@ -138,6 +148,7 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
 # hparams loading
 #

+
 @dataclass
 class Params:
     n_vocab: int

@@ -167,11 +178,11 @@ class Params:

         # try transformer naming first
         if "model.layers.0.self_attn.q_proj.weight" in model:
-            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
         elif "model.layers.0.self_attn.W_pack.weight" in model:   # next: try baichuan naming
-            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
         else:
-            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)

         if n_layer < 1:
             raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"

@@ -308,7 +319,7 @@ class BpeVocab:
             (item['content'], item['id'])
             for item in tokenizer_json.get('added_tokens', [])
             # Added tokens here can be duplicates of the main vocabulary.
-            if item['content'] not in self.bpe_tokenizer )
+            if item['content'] not in self.bpe_tokenizer)

         vocab_size: int = len(self.bpe_tokenizer)
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))

@@ -326,7 +337,6 @@ class BpeVocab:

     def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.bpe_tokenizer
-        from transformers.models.gpt2 import tokenization_gpt2
         reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}

         for i, _ in enumerate(tokenizer):

@@ -406,6 +416,7 @@ class SentencePieceVocab:
     def __repr__(self) -> str:
         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"

+
 Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'

 #

@@ -413,13 +424,14 @@ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
 # TODO: reuse (probably move to gguf.py?)
 #

+
 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
-    #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
+    # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
     if n_head_kv is not None and n_head != n_head_kv:
         n_head = n_head_kv
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
             .swapaxes(1, 2)
             .reshape(weights.shape))

 class Tensor(metaclass=ABCMeta):

@@ -500,7 +512,7 @@ class LazyTensor:
         ret = self._load()
         # Should be okay if it maps to the same numpy type?
         assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
             (self.data_type, ret.data_type, self.description)
         return ret

     def astype(self, data_type: DataType) -> LazyTensor:

@@ -588,6 +600,7 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
         return lazy_tensor.load().permute(n_head, n_head_kv)
     return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)

+
 def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
     def load() -> Tensor:
         return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)

@@ -595,6 +608,7 @@ def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
     s[0] = s[0] // 3
     return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)

+
 def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
     def load() -> Tensor:
         return lazy_tensor.load().part(n_part)

@@ -744,6 +758,7 @@ def lazy_load_file(path: Path) -> ModelPlus:
 In = TypeVar('In')
 Out = TypeVar('Out')

+
 def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
     '''Parallel map, but with backpressure. If the caller doesn't call `next`
     fast enough, this will stop calling `func` at some point rather than

@@ -778,6 +793,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
             break
         yield result

+
 def check_vocab_size(params: Params, vocab: Vocab) -> None:
     if params.n_vocab != vocab.vocab_size:
         assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)

@@ -796,7 +812,7 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:


 class OutputFile:
-    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
         self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

     def add_meta_arch(self, params: Params) -> None:

@@ -876,7 +892,7 @@ class OutputFile:
         self.gguf.close()

     @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
+    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
         check_vocab_size(params, vocab)

         of = OutputFile(fname_out, endianess=endianess)

@@ -938,8 +954,9 @@ class OutputFile:

         of.close()

+
 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
-    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
+    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type

     if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
         return GGMLFileType.AllF32

@@ -952,10 +969,12 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:

     raise Exception(f"Unexpected combination of types: {name_to_type}")

+
 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
     return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
             for (name, tensor) in model.items()}

+
 def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
     tmap = gguf.TensorNameMap(ARCH, params.n_layer)
     should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))

@@ -968,7 +987,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
             print(f"Permuting layer {i}")
             tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
             tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
-            #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
+            # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
             print(f"Unpacking and permuting layer {i}")
             tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)

@@ -993,6 +1012,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:

     return out

+
 def nth_multifile_path(path: Path, n: int) -> Path | None:
     '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
     the nth path in the model.

@@ -1174,8 +1194,8 @@ def main(args_in: list[str] | None = None) -> None:
         # FIXME: Try to respect vocab_dir somehow?
         vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
         special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
             load_merges = args.vocabtype == 'bpe',
             n_vocab = vocab.vocab_size)
         outfile = args.outfile
         OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
         print(f"Wrote {outfile}")

@@ -1188,8 +1208,8 @@ def main(args_in: list[str] | None = None) -> None:
         vocab = load_vocab(vocab_dir, args.vocabtype)
     # FIXME: Try to respect vocab_dir somehow?
     special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
         load_merges = args.vocabtype == 'bpe',
         n_vocab = vocab.vocab_size)

     model = model_plus.model
     model = convert_model_names(model, params)
@@ -147,6 +147,13 @@ int main(int argc, char ** argv) {

         return 0;
     }
+    if (params.chatml) {
+        printf("\n************\n");
+        printf("%s: please use the 'main' tool for chatml mode\n", __func__);
+        printf("************\n\n");
+
+        return 0;
+    }
     if (!params.antiprompt.empty()) {
         printf("\n************\n");
         printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
@ -235,8 +235,11 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
std::vector<llama_token> embd_inp;
|
std::vector<llama_token> embd_inp;
|
||||||
|
|
||||||
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
|
||||||
LOG("tokenize the prompt\n");
|
LOG("tokenize the prompt\n");
|
||||||
|
if (params.chatml) {
|
||||||
|
params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
|
||||||
|
}
|
||||||
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||||
} else {
|
} else {
|
||||||
LOG("use session tokens\n");
|
LOG("use session tokens\n");
|
||||||
|
@ -314,7 +317,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// number of tokens to keep when resetting context
|
// number of tokens to keep when resetting context
|
||||||
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
|
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
|
||||||
params.n_keep = (int)embd_inp.size();
|
params.n_keep = (int)embd_inp.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -325,11 +328,23 @@ int main(int argc, char ** argv) {
     LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
     LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
 
+    // chatml prefix & suffix
+    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true);
+    const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
+
+    LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
+    LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
+
     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
         params.interactive_first = true;
         params.antiprompt.push_back("### Instruction:\n\n");
     }
+    // similar for chatml mode
+    else if (params.chatml) {
+        params.interactive_first = true;
+        params.antiprompt.push_back("<|im_start|>user\n");
+    }
 
     // enable interactive mode if interactive start is specified
     if (params.interactive_first) {
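For reference, the chatml prefix and suffix tokenized above follow the standard ChatML turn layout. The following standalone C++ sketch is illustrative only (the chatml_wrap helper is not part of the patch); it shows the plain-text prompt those system/user/assistant markers produce before tokenization:

// illustrative sketch: assemble a ChatML-formatted prompt from system and user text
#include <iostream>
#include <string>

static std::string chatml_wrap(const std::string & system_msg, const std::string & user_msg) {
    return "<|im_start|>system\n" + system_msg + "<|im_end|>\n" +
           "<|im_start|>user\n"   + user_msg   + "<|im_end|>\n" +
           "<|im_start|>assistant\n";
}

int main() {
    std::cout << chatml_wrap("You are a helpful assistant.", "Hello!");
    return 0;
}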
@@ -706,7 +721,7 @@ int main(int argc, char ** argv) {
 
                 is_interacting = true;
                 printf("\n");
-            } else if (params.instruct) {
+            } else if (params.instruct || params.chatml) {
                 is_interacting = true;
             }
         }
@@ -714,7 +729,7 @@ int main(int argc, char ** argv) {
         if (n_past > 0 && is_interacting) {
             LOG("waiting for user input\n");
 
-            if (params.instruct) {
+            if (params.instruct || params.chatml) {
                 printf("\n> ");
             }
 
@@ -761,6 +776,12 @@ int main(int argc, char ** argv) {
                     n_consumed = embd_inp.size();
                     embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
                 }
+                // chatml mode: insert user chat prefix
+                if (params.chatml && !is_antiprompt) {
+                    LOG("inserting chatml prefix\n");
+                    n_consumed = embd_inp.size();
+                    embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
+                }
                 if (params.escape) {
                     process_escapes(buffer);
                 }
@@ -779,6 +800,11 @@ int main(int argc, char ** argv) {
                     LOG("inserting instruction suffix\n");
                     embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
                 }
+                // chatml mode: insert assistant chat suffix
+                if (params.chatml) {
+                    LOG("inserting chatml suffix\n");
+                    embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end());
+                }
 
                 for (size_t i = original_size; i < embd_inp.size(); ++i) {
                     const llama_token token = embd_inp[i];
@@ -804,7 +830,7 @@ int main(int argc, char ** argv) {
         }
 
         // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) {
+        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
             LOG_TEE(" [end of text]\n");
             break;
         }
@@ -94,6 +94,10 @@ export async function* llama(prompt, params = {}, config = {}) {
             break;
           }
         }
+        if (result.error) {
+          result.error = JSON.parse(result.error);
+          console.error(`llama.cpp error: ${result.error.content}`);
+        }
       }
     }
   }
@@ -2369,6 +2369,17 @@ int main(int argc, char **argv)
                         break;
                     }
                 } else {
+                    const std::string str =
+                        "error: " +
+                        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
+                        "\n\n";
+                    LOG_VERBOSE("data stream", {
+                        { "to_send", str }
+                    });
+                    if (!sink.write(str.c_str(), str.size()))
+                    {
+                        return false;
+                    }
                     break;
                 }
             }
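The added else branch streams a plain "error: ..." chunk back to the client instead of silently dropping a failed result. A minimal, self-contained sketch of how that payload is built with nlohmann::json (the same dump() call and error handler as above; the surrounding server and sink plumbing are omitted and the result value is a stand-in):

// sketch only: format the error payload the server streams on failure
#include <nlohmann/json.hpp>
#include <iostream>
#include <string>

using json = nlohmann::json;

int main() {
    const json result_json = { {"content", "context size exceeded"} }; // stand-in result
    const std::string str =
        "error: " +
        result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
        "\n\n";
    std::cout << str; // in the server this string is written to the HTTP sink
    return 0;
}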
@@ -96,9 +96,22 @@ int main(int argc, char ** argv) {
         }
     }
 
-    // tokenize the prompt
+    // Tokenize the prompt
+    const bool add_bos_tgt = llama_should_add_bos_token(model_tgt);
+    LOG("add_bos tgt: %d\n", add_bos_tgt);
+
+    const bool add_bos_dft = llama_should_add_bos_token(model_dft);
+    LOG("add_bos dft: %d\n", add_bos_dft);
+
+    if (add_bos_tgt != add_bos_dft) {
+        fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__);
+        fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt);
+        return 1;
+    }
+
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
+    inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
 
     const int max_context_size     = llama_n_ctx(ctx_tgt);
     const int max_tokens_list_size = max_context_size - 4;
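The new guard refuses to run speculation when the draft and target models disagree on whether a BOS token is prepended, since the two token streams would then be misaligned from the very first position. A hedged restatement of the check as a small helper (llama_should_add_bos_token is the function used above; the helper name itself is illustrative):

// sketch: both models must agree on BOS handling before sharing one prompt
#include "llama.h"

static bool bos_settings_match(llama_model * model_tgt, llama_model * model_dft) {
    const bool add_bos_tgt = llama_should_add_bos_token(model_tgt);
    const bool add_bos_dft = llama_should_add_bos_token(model_dft);
    return add_bos_tgt == add_bos_dft; // if false, speculative decoding should bail out
}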
@@ -26,7 +26,7 @@ int main(int argc, char ** argv) {
     llama_context_params ctx_params = llama_context_default_params();
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
-    const bool add_bos = true;
+    const bool add_bos = llama_should_add_bos_token(model);
 
     std::vector<llama_token> tokens;
 
107
ggml-cuda.cu
@@ -235,7 +235,7 @@ typedef float2 dfloat2;
 #endif //GGML_CUDA_F16
 
 static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
     int x32 = 0;
     x32 |= x16[0] << 0;
@@ -245,7 +245,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
 }
 
 static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
     int x32 = 0;
     x32 |= x16[0] << 0;
@@ -255,11 +255,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
 }
 
 static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
-    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
-    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 template<typename T>
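The casts in these helpers now keep the const qualifier, so reinterpreting a pointer derived from const data no longer discards qualifiers (which newer compilers flag with -Wcast-qual). A standalone host-side illustration of the same pattern, independent of the CUDA kernels:

// illustration only: keep const when reinterpreting a const byte buffer
#include <cstdint>

static int get_int_from_int8_aligned(const int8_t * x8, int i32) {
    return *((const int *) (x8 + sizeof(int) * i32)); // same shape as the device helper above
}

int main() {
    alignas(int) const int8_t buf[8] = {1, 0, 0, 0, 2, 0, 0, 0};
    return get_int_from_int8_aligned(buf, 0) != 0 ? 0 : 1; // non-zero on any endianness
}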
@@ -469,7 +469,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
 #define MUL_MAT_SRC1_COL_STRIDE 128
 
 #define MAX_STREAMS 8
-static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
 
 struct ggml_tensor_extra_gpu {
     void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
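The extra braces spell out the zero-initializer for a two-dimensional array, which keeps -Wmissing-braces quiet; behaviour is unchanged. A minimal host-only illustration (the array sizes are stand-ins, not the real GGML constants):

// illustration only: fully-braced zero-initialization of a 2D array
constexpr int kMaxDevices = 16; // stand-in for GGML_CUDA_MAX_DEVICES
constexpr int kMaxStreams = 8;  // stand-in for MAX_STREAMS

static void * g_streams[kMaxDevices][kMaxStreams] = { { nullptr } };

int main() {
    return g_streams[0][0] == nullptr ? 0 : 1; // every element is null-initialized
}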
@@ -2249,6 +2249,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
@@ -2260,7 +2261,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
     GGML_CUDA_ASSUME(k >= 0);
@@ -2269,7 +2270,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_0;
     const int kqsx = k % QI4_0;
 
-    const block_q4_0 * bx0 = (block_q4_0 *) vx;
+    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
 
     float * x_dmf = (float *) x_dm;
 
@@ -2307,9 +2308,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const float * x_dmf = (float *) x_dm;
+    const float * x_dmf = (const float *) x_dm;
 
     int u[2*VDR_Q4_0_Q8_1_MMQ];
 
@@ -2343,6 +2345,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
@@ -2354,6 +2357,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2363,7 +2367,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_1;
     const int kqsx = k % QI4_1;
 
-    const block_q4_1 * bx0 = (block_q4_1 *) vx;
+    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2398,6 +2402,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 
@@ -2435,6 +2440,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
@@ -2446,6 +2452,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2455,7 +2462,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_0;
     const int kqsx = k % QI5_0;
 
-    const block_q5_0 * bx0 = (block_q5_0 *) vx;
+    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2510,6 +2517,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
@@ -2549,6 +2557,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
@@ -2560,6 +2569,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2569,7 +2579,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_1;
     const int kqsx = k % QI5_1;
 
-    const block_q5_1 * bx0 = (block_q5_1 *) vx;
+    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2621,6 +2631,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@@ -2655,6 +2666,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
@@ -2666,6 +2678,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2676,7 +2689,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kqsx = k % QI8_0;
     float * x_dmf = (float *) x_dm;
 
-    const block_q8_0 * bx0 = (block_q8_0 *) vx;
+    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2711,6 +2724,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
@@ -2744,6 +2758,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
@@ -2757,6 +2772,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2766,7 +2782,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI2_K;
     const int kqsx = k % QI2_K;
 
-    const block_q2_K * bx0 = (block_q2_K *) vx;
+    const block_q2_K * bx0 = (const block_q2_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2814,6 +2830,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const int kbx = k / QI2_K;
     const int ky = (k % QI2_K) * QR2_K;
@@ -2887,7 +2904,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI3_K;
     const int kqsx = k % QI3_K;
 
-    const block_q3_K * bx0 = (block_q3_K *) vx;
+    const block_q3_K * bx0 = (const block_q3_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2968,7 +2985,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
 
-    const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
 
     int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
 
@@ -3083,6 +3100,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@@ -3096,6 +3114,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3105,7 +3124,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_K; // == 0 if QK_K == 256
     const int kqsx = k % QI4_K; // == k if QK_K == 256
 
-    const block_q4_K * bx0 = (block_q4_K *) vx;
+    const block_q4_K * bx0 = (const block_q4_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3150,7 +3169,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
 
-        const int * scales = (int *) bxi->scales;
+        const int * scales = (const int *) bxi->scales;
 
         const int ksc = k % (WARP_SIZE/8);
 
@@ -3165,6 +3184,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
 
@@ -3264,6 +3284,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@@ -3277,6 +3298,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3286,7 +3308,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_K; // == 0 if QK_K == 256
     const int kqsx = k % QI5_K; // == k if QK_K == 256
 
-    const block_q5_K * bx0 = (block_q5_K *) vx;
+    const block_q5_K * bx0 = (const block_q5_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3342,7 +3364,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
 
-        const int * scales = (int *) bxi->scales;
+        const int * scales = (const int *) bxi->scales;
 
         const int ksc = k % (WARP_SIZE/8);
 
@@ -3357,6 +3379,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
 
@@ -3393,6 +3416,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@@ -3406,6 +3430,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3415,7 +3440,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI6_K; // == 0 if QK_K == 256
     const int kqsx = k % QI6_K; // == k if QK_K == 256
 
-    const block_q6_K * bx0 = (block_q6_K *) vx;
+    const block_q6_K * bx0 = (const block_q6_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3477,6 +3502,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
@@ -3519,7 +3545,7 @@ static __device__ __forceinline__ void mul_mat_q(
     __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
     __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
 
-    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
 
     for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
 
@@ -6012,18 +6038,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
         return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
-    } else if (nb0 == ts) {
-        return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
-    } else {
-        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
-            const void * rx = (const void *) ((const char *) x + i1*nb1);
-            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
-            // pretend the row is a matrix with cols=1
-            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
-            if (r != cudaSuccess) return r;
-        }
-        return cudaSuccess;
     }
+    if (nb0 == ts) {
+        return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
+    }
+    for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+        const void * rx = (const void *) ((const char *) x + i1*nb1);
+        void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+        // pretend the row is a matrix with cols=1
+        cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+        if (r != cudaSuccess) { return r; }
+    }
+    return cudaSuccess;
 }
 
 static void ggml_cuda_op_repeat(
@@ -6978,7 +7004,7 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-    const int64_t nrows0 = ggml_nrows(src0);
+    // const int64_t nrows0 = ggml_nrows(src0);
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
@@ -7079,7 +7105,7 @@ static void ggml_cuda_op_mul_mat(
         if (src0_on_device && src0_is_contiguous) {
             src0_dd[id] = (char *) src0_extra->data_device[id];
         } else {
-            const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+            // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
             src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
         }
 
@@ -7312,7 +7338,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }
 
     const int64_t ne10 = src1->ne[0];
 
@@ -7390,7 +7416,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-__global__ void k_compute_batched_ptrs(
+__global__ static void k_compute_batched_ptrs(
     const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
     const void ** ptrs_src, void ** ptrs_dst,
     int ne12, int ne13,
@@ -8013,7 +8039,7 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }
 
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8312,14 +8338,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
     UNUSED(cgraph);
 }
 
-static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
     UNUSED(plan);
 }
 
-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
@@ -8335,8 +8361,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
             continue;
+        }
         assert(node->backend == GGML_BACKEND_GPU);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
@@ -56,20 +56,21 @@ class Keys:
         SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
 
     class Tokenizer:
         MODEL         = "tokenizer.ggml.model"
         LIST          = "tokenizer.ggml.tokens"
         TOKEN_TYPE    = "tokenizer.ggml.token_type"
         SCORES        = "tokenizer.ggml.scores"
         MERGES        = "tokenizer.ggml.merges"
         BOS_ID        = "tokenizer.ggml.bos_token_id"
         EOS_ID        = "tokenizer.ggml.eos_token_id"
         UNK_ID        = "tokenizer.ggml.unknown_token_id"
         SEP_ID        = "tokenizer.ggml.seperator_token_id"
         PAD_ID        = "tokenizer.ggml.padding_token_id"
         ADD_BOS       = "tokenizer.ggml.add_bos_token"
         ADD_EOS       = "tokenizer.ggml.add_eos_token"
         HF_JSON       = "tokenizer.huggingface.json"
         RWKV          = "tokenizer.rwkv.world"
+        CHAT_TEMPLATE = "tokenizer.chat_template"
 
 
 #
@@ -221,7 +221,7 @@ class GGUFWriter:
         if self.endianess == GGUFEndian.BIG:
             tensor.byteswap(inplace=True)
         if self.use_temp_file and self.temp_file is None:
-            fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
+            fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024)
             fp.seek(0)
             self.temp_file = fp
 
@@ -399,6 +399,9 @@ class GGUFWriter:
     def add_add_eos_token(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_EOS, value)
 
+    def add_chat_template(self, value: str) -> None:
+        self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
+
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
         pack_prefix = ''
         if not skip_pack_prefix:
@@ -13,6 +13,7 @@ class SpecialVocab:
     merges: list[str]
     add_special_token: dict[str, bool]
     special_token_ids: dict[str, int]
+    chat_template: str | None
 
     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
@@ -24,6 +25,7 @@ class SpecialVocab:
         self.n_vocab = n_vocab
         self.load_merges = load_merges
         self.merges = []
+        self.chat_template = None
         if special_token_types is not None:
             self.special_token_types = special_token_types
         else:
@@ -67,6 +69,10 @@ class SpecialVocab:
             if not quiet:
                 print(f'gguf: Setting add_{typ}_token to {value}')
             add_handler(value)
+        if self.chat_template is not None:
+            if not quiet:
+                print(f'gguf: Setting chat_template to {self.chat_template}')
+            gw.add_chat_template(self.chat_template)
 
     def _load(self, path: Path) -> None:
         self._try_load_from_tokenizer_json(path)
@@ -132,6 +138,14 @@ class SpecialVocab:
             return True
         with open(tokenizer_config_file, encoding = 'utf-8') as f:
             tokenizer_config = json.load(f)
+        chat_template = tokenizer_config.get('chat_template')
+        if chat_template is None or isinstance(chat_template, str):
+            self.chat_template = chat_template
+        else:
+            print(
+                f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring',
+                file = sys.stderr
+            )
         for typ in self.special_token_types:
             add_entry = tokenizer_config.get(f'add_{typ}_token')
             if isinstance(add_entry, bool):
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.5.3"
+version = "0.6.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
@@ -1882,6 +1882,7 @@ struct llama_model_loader {
             if (value.size() > MAX_VALUE_LEN) {
                 value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
             }
+            replace_all(value, "\n", "\\n");
 
             LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
         }
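The added replace_all call keeps multi-line metadata values (for example a chat template) on a single log line by escaping embedded newlines. A self-contained sketch of the same transformation; replace_all here is a local stand-in for the helper already defined in llama.cpp:

// sketch: escape newlines in a metadata value before logging it on one line
#include <iostream>
#include <string>

static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

int main() {
    std::string value = "{% for message in messages %}\n...\n{% endfor %}";
    replace_all(value, "\n", "\\n");
    std::cout << value << std::endl; // prints the template with literal \n escapes
    return 0;
}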