Use some tricks to eliminate the necessity for a new format
This commit is contained in:
parent 41a2ed03e7
commit f6d5fe3afc

3 changed files with 43 additions and 46 deletions

convert.py (32 changes)

@@ -142,7 +142,7 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:
 @dataclass
 class Params:
     n_vocab: int
-    n_vocab_sp:int
+    n_vocab_base: int
     n_embd: int
     n_mult: int
     n_head: int

@@ -170,7 +170,7 @@ class Params:
 
         return Params(
             n_vocab = n_vocab,
-            n_vocab_sp= n_vocab,
+            n_vocab_base=n_vocab,
             n_embd = n_embd,
             n_mult = 256,
             n_head = n_head,

@@ -193,7 +193,7 @@ class Params:
 
         return Params(
             n_vocab = n_vocab,
-            n_vocab_sp= n_vocab,
+            n_vocab_base=n_vocab,
             n_embd = n_embd,
             n_mult = n_mult,
             n_head = n_head,

@@ -218,7 +218,7 @@ class Params:
 
         return Params(
             n_vocab = n_vocab,
-            n_vocab_sp= n_vocab,
+            n_vocab_base=n_vocab,
             n_embd = n_embd,
             n_mult = n_mult,
             n_head = n_head,

@@ -283,7 +283,7 @@ class SentencePieceVocab:
         else:
             tokenizer_config = {}
         for key, value in tokenizer_config.items():
-            if not isinstance(value, dict) or not isinstance(value, str):
+            if not isinstance(value, dict) and not isinstance(value, str):
                 continue
             token_id = TOKEN_NAME_TO_ID.get(key, -1)
             if token_id == -1:

@@ -296,15 +296,13 @@ class SentencePieceVocab:
         else:
             special_tokens = {}
         for key, value in special_tokens.items():
-            if not isinstance(value, dict) or not isinstance(value, str):
+            if not isinstance(value, dict) and not isinstance(value, str):
                 continue
             token_id = TOKEN_NAME_TO_ID.get(key, -1)
             if token_id == -1 or token_id in self.special_tokens_map:
                 continue
             self.special_tokens_map[token_id] = value["content"] if isinstance(value, dict) else value
 
-        self.vocab_special_size: int = len(self.added_tokens_list) + len(self.special_tokens_map)
-
     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
         if self.vocabtype == "bpe":

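The `or` to `and` change in this hunk and the previous one fixes a filter that skipped every entry: no value is both a dict and a str, so the old condition was always true and the code after the `continue` never ran. A minimal sketch of the corrected predicate, with hypothetical values:

    # The old test is True for every value (nothing is both a dict and a str),
    # so every entry was skipped; the new test only skips other types.
    for value in ({"content": "<s>"}, "</s>", 42):
        old = not isinstance(value, dict) or not isinstance(value, str)   # always True
        new = not isinstance(value, dict) and not isinstance(value, str)  # True for 42 only
        print(type(value).__name__, old, new)
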
@@ -361,7 +359,7 @@ class GGMLVocab:
         self.tokens = tokens
         self.special_tokens = []
         self.vocab_size = len(tokens)
-        self.vocab_special_size = 0
+        self.vocab_size_base = 0
 
     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         return self.tokens

@@ -1120,17 +1118,21 @@ class OutputFile:
     def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
         self.fout.write(b"ggjt"[::-1]) # magic
         values = [
-            4, # file version
+            1, # file version
             params.n_vocab,
-            params.n_vocab_sp,
             params.n_embd,
             params.n_mult,
             params.n_head,
             params.n_layer,
+<<<<<<< HEAD
             params.n_embd // params.n_head, # rot (obsolete)
             file_type.value,
+=======
+            params.n_vocab_base | 0xF0000000, # reuse obsolete rot value to store vocab_base
+            params.file_type.value,
+>>>>>>> bfccc62 (Use some tricks to eliminate the necessity for a new format)
         ]
-        self.fout.write(struct.pack("i" * len(values), *values))
+        self.fout.write(struct.pack("I" * len(values), *values))
 
     def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
         sname = name.encode('utf-8')

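The incoming side of this hunk (the commit carries its rebase conflict markers verbatim, so both alternatives appear above) stores `n_vocab_base` in the header slot that previously held the obsolete rot value, tagged with `0xF0000000` so a reader can tell the two layouts apart. The tag sets bit 31, which is why the pack format flips from signed `"i"` to unsigned `"I"`. A minimal sketch with a hypothetical vocab size:

    import struct

    n_vocab_base = 32000
    tagged = n_vocab_base | 0xF0000000     # 4026563840, does not fit a signed int32

    try:
        struct.pack("i", tagged)           # the old "i" format now fails
    except struct.error as err:
        print("signed pack rejected:", err)

    slot = struct.pack("I", tagged)        # unsigned "I" round-trips the tagged value
    print(struct.unpack("I", slot)[0] & 0x0FFFFFFF)  # 32000

The remaining header fields are all non-negative, so packing the whole header unsigned is harmless.
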
@@ -1144,13 +1146,11 @@ class OutputFile:
             self.fout.write(struct.pack("i", len(text)))
             self.fout.write(text)
             self.fout.write(struct.pack("f", score))
-        for token_id in vocab.all_special_tokens():
-            self.fout.write(struct.pack("i", token_id))
 
     @staticmethod
     def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of = OutputFile(fname_out)
-        params = Params(n_vocab=vocab.vocab_size, n_vocab_sp=vocab.vocab_special_size, n_embd=0, n_mult=0,
+        params = Params(n_vocab=vocab.vocab_size, n_vocab_base=vocab.vocab_size_base, n_embd=0, n_mult=0,
                         n_head=1, n_layer=0)
         of = OutputFile(fname_out)
         of.write_file_header(params, file_type=GGMLFileType.AllF32)

@@ -1373,7 +1373,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
         vocab = load_vocab(vocab_dir, args.vocabtype)
     params = Params.load(model_plus)
-    params.n_vocab_sp = vocab.vocab_special_size
+    params.n_vocab_base = vocab.vocab_size_base
     model = model_plus.model
     model = do_necessary_conversions(model, params)
     output_type = pick_output_type(model, args.outtype)

llama.cpp (55 changes)

@@ -181,14 +181,13 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab = 32000;
-    uint32_t n_vocab_sp = 0;
+    uint32_t n_vocab_base = 32000;
     uint32_t n_ctx = 512; // this is provided as user input?
     uint32_t n_embd = 4096;
     uint32_t n_mult = 256;
     uint32_t n_head = 32;
     uint32_t n_head_kv = 32;
     uint32_t n_layer = 32;
-    uint32_t n_rot = 64;
 
     // LLaMAv2
     // TODO: load from model data hparams

@@ -499,7 +498,6 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
     LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
     LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
-    LLAMA_FILE_VERSION_GGJT_V4, // improved support for added/special tokens
 };
 
 struct llama_file_loader {

@@ -515,6 +513,7 @@ struct llama_file_loader {
         read_hparams();
         read_vocab();
         read_tensor_metadata(tensors_map);
+        set_vocab_sp();
     }
     void read_magic() {
         uint32_t magic = file.read_u32();

@@ -537,7 +536,6 @@ struct llama_file_loader {
             case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
             case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
             case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
-            case 4: file_version = LLAMA_FILE_VERSION_GGJT_V4; return;
         }
     }
 

@@ -546,18 +544,18 @@ struct llama_file_loader {
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
-        hparams.n_vocab_sp = file_version >= LLAMA_FILE_VERSION_GGJT_V4 ? file.read_u32() : 0;
         hparams.n_embd = file.read_u32();
         hparams.n_mult = file.read_u32();
         hparams.n_head = file.read_u32();
         hparams.n_layer = file.read_u32();
-        hparams.n_rot = file.read_u32();
+        hparams.n_vocab_base = file.read_u32();
+        hparams.n_vocab_base = (hparams.n_vocab_base & 0xF0000000) == 0 ? hparams.n_vocab : (hparams.n_vocab_base & ~0xF0000000); // this bitwise operation is necessary for compatibility with older models
         hparams.ftype = (enum llama_ftype) file.read_u32();
 
         // LLaMAv2
         // TODO: read from header
         hparams.n_head_kv = hparams.n_head;
-    }
+=======
     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);
 

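On the read side, the disambiguation relies on the old slot contents being small: rot was n_embd / n_head (for example 4096 / 32 = 128), so its high nibble was never set, while new files always set it. A hedged sketch of the rule above, in Python for brevity, with hypothetical values:

    def decode_vocab_base(slot: int, n_vocab: int) -> int:
        if (slot & 0xF0000000) == 0:
            return n_vocab            # old file: slot held rot, so assume no added tokens
        return slot & 0x0FFFFFFF      # new file: strip the tag to recover n_vocab_base

    print(decode_vocab_base(128, 32000))                 # 32000 (old file)
    print(decode_vocab_base(32000 | 0xF0000000, 32016))  # 32000 (new file)
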
@@ -574,20 +572,6 @@ struct llama_file_loader {
             tok_score.tok = std::move(word);
             tok_score.score = score;
         }
-
-        vocab.special_token_to_id.reserve(hparams.n_vocab_sp);
-
-        for (uint32_t i = 0; i < hparams.n_vocab_sp; i++) {
-            llama_vocab::id token_id = file.read_u32();
-            const auto & word = vocab.id_to_token[token_id].tok;
-
-            vocab.special_token_trie.add(word);
-            vocab.special_token_to_id[word] = token_id;
-
-            if (vocab.max_special_token_length < word.size()) {
-                vocab.max_special_token_length = word.size();
-            }
-        }
     }
     void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {

@@ -634,6 +618,24 @@ struct llama_file_loader {
             tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
+    void set_vocab_sp() {
+        uint32_t vocab_sp = 3 + hparams.n_vocab - hparams.n_vocab_base;
+        vocab.special_token_to_id.reserve(vocab_sp);
+        for (uint32_t i = 0; i < vocab_sp; i++) {
+            llama_vocab::id token_id = i > 2 ? hparams.n_vocab_base + i : i;
+            const auto & word = vocab.id_to_token[token_id].tok;
+            if (word.empty()) {
+                continue;
+            }
+
+            vocab.special_token_trie.add(word);
+            vocab.special_token_to_id[word] = token_id;
+
+            if (vocab.max_special_token_length < word.size()) {
+                vocab.max_special_token_length = word.size();
+            }
+        }
+    }
 };
 
 struct llama_file_saver {

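set_vocab_sp() is the piece that makes both the extra header field and the serialized special-token table unnecessary: the special tokens are reconstructed from the two vocabulary sizes alone, as the three reserved sentencepiece ids (0, 1, 2) plus the added tokens above the base vocabulary, skipping ids whose text is empty. A hedged Python sketch of that derivation, not a line-for-line port of the loop above; it assumes the added tokens occupy ids n_vocab_base through n_vocab - 1:

    # Hypothetical toy vocabulary: 4 base tokens plus 2 added special tokens.
    n_vocab_base = 4
    id_to_token = ["<unk>", "<s>", "</s>", "a", "<pad>", "<sys>"]
    n_vocab = len(id_to_token)

    vocab_sp = 3 + n_vocab - n_vocab_base               # same count the loader reserves
    special_ids = [0, 1, 2] + list(range(n_vocab_base, n_vocab))
    assert len(special_ids) == vocab_sp
    special_token_to_id = {id_to_token[i]: i for i in special_ids if id_to_token[i]}
    print(special_token_to_id)
    # {'<unk>': 0, '<s>': 1, '</s>': 2, '<pad>': 4, '<sys>': 5}
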
@@ -653,12 +655,11 @@ struct llama_file_saver {
     void write_hparams(enum llama_ftype new_ftype) {
         const llama_hparams & hparams = any_file_loader->hparams;
         file.write_u32(hparams.n_vocab);
-        file.write_u32(hparams.n_vocab_sp);
         file.write_u32(hparams.n_embd);
         file.write_u32(hparams.n_mult);
         file.write_u32(hparams.n_head);
         file.write_u32(hparams.n_layer);
-        file.write_u32(hparams.n_rot);
+        file.write_u32(hparams.n_vocab_base | 0xF0000000); // this bitwise operation is necessary for compatibility with older models
         file.write_u32(new_ftype);
     }
     void write_vocab() {

@@ -672,9 +673,6 @@ struct llama_file_saver {
             file.write_raw(token_score.tok.data(), token_score.tok.size());
             file.write_raw(&token_score.score, sizeof(token_score.score));
         }
-        for (const auto & pair : any_file_loader->vocab.special_token_to_id) {
-            file.write_u32(pair.second);
-        }
     }
     void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
         switch (new_type) {

@@ -1001,8 +999,7 @@ static const char *llama_file_version_name(llama_file_version version) {
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
         case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
         case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
-        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (pre #1931)";
-        case LLAMA_FILE_VERSION_GGJT_V4: return "ggjt v4 (latest)";
+        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
     }
 
     return "unknown";

@@ -1127,7 +1124,7 @@ static void llama_model_load_internal(
     fprintf(stderr, "%s: n_head     = %u\n", __func__, hparams.n_head);
     fprintf(stderr, "%s: n_head_kv  = %u\n", __func__, hparams.n_head_kv);
     fprintf(stderr, "%s: n_layer    = %u\n", __func__, hparams.n_layer);
-    fprintf(stderr, "%s: n_rot      = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+    fprintf(stderr, "%s: n_rot      = %u\n", __func__, hparams.n_embd/hparams.n_head); // a.k.a. n_embd_head, n_head_dim
     fprintf(stderr, "%s: n_gqa      = %u\n", __func__, hparams.n_gqa());
     fprintf(stderr, "%s: rnorm_eps  = %.1e\n", __func__, hparams.f_rms_norm_eps);
     fprintf(stderr, "%s: n_ff       = %u\n", __func__, n_ff);

llama.h (2 changes)

@@ -40,7 +40,7 @@
 #define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
 #define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
 
-#define LLAMA_FILE_VERSION           4
+#define LLAMA_FILE_VERSION           3
 #define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
 #define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
 #define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN