Merge branch 'master' into merge-to-upstream-v2
commit f15ea2c928
12 changed files with 71 additions and 62 deletions

@@ -239,6 +239,10 @@ class Model:
self.gguf_writer.add_expert_used_count(n_experts_used)
logger.info(f"gguf: experts used count = {n_experts_used}")

+ if (head_dim := self.hparams.get("head_dim")) is not None:
+ self.gguf_writer.add_key_length(head_dim)
+ self.gguf_writer.add_value_length(head_dim)
+
self.gguf_writer.add_file_type(self.ftype)
logger.info(f"gguf: file type = {self.ftype}")

@@ -596,6 +600,9 @@ class Model:
if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
# ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
res = "tekken"
+ if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
+ # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
+ res = "smollm"

if res is None:
logger.warning("\n")

@@ -736,7 +743,7 @@ class Model:
added_tokens_json = json.load(f)
for key in added_tokens_json:
token_id = added_tokens_json[key]
- if (token_id >= vocab_size):
+ if token_id >= vocab_size:
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue

@@ -1484,7 +1491,12 @@ class LlamaModel(Model):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
- self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+
+ if "head_dim" in hparams:
+ rope_dim = hparams["head_dim"]
+ else:
+ rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+ self.gguf_writer.add_rope_dimension_count(rope_dim)

if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear":

@@ -1999,7 +2011,7 @@ class Phi3MiniModel(Model):
for key in added_tokens_json:
token_id = added_tokens_json[key]
- if (token_id >= vocab_size):
+ if token_id >= vocab_size:
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue

@@ -2075,7 +2087,7 @@ class Phi3MiniModel(Model):
# write rope scaling for long context (128k) model
rope_scaling = self.find_hparam(['rope_scaling'], True)
- if (rope_scaling is None):
+ if rope_scaling is None:
return

scale = max_pos_embds / orig_max_pos_embds

@@ -2722,7 +2734,7 @@ class JinaBertV2Model(BertModel):
yield name, data

- def set_vocab(self, *args, **kwargs):
+ def set_vocab(self):
tokenizer_class = 'BertTokenizer'
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
tokenizer_class = json.load(f)['tokenizer_class']

@@ -2870,7 +2882,7 @@ class ArcticModel(Model):
added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
for token_id, token_json in added_tokens_decoder.items():
token_id = int(token_id)
- if (token_id >= vocab_size):
+ if token_id >= vocab_size:
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue

@@ -3119,7 +3131,7 @@ class T5Model(Model):
added_tokens_json = json.load(f)
for key in added_tokens_json:
token_id = added_tokens_json[key]
- if (token_id >= vocab_size):
+ if token_id >= vocab_size:
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
@@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum):
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
- chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+ CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

if len(sys.argv) == 2:
token = sys.argv[1]

@@ -93,6 +93,7 @@ models = [
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
+ {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
]

@@ -101,8 +102,8 @@ def download_file_with_auth(url, token, save_path):
response = sess.get(url, headers=headers)
response.raise_for_status()
os.makedirs(os.path.dirname(save_path), exist_ok=True)
- with open(save_path, 'wb') as f:
-     f.write(response.content)
+ with open(save_path, 'wb') as downloaded_file:
+     downloaded_file.write(response.content)
logger.info(f"File {save_path} downloaded successfully")

@@ -161,7 +162,7 @@ for model in models:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded

- chktok = tokenizer.encode(chktxt)
+ chktok = tokenizer.encode(CHK_TXT)
chkhsh = sha256(str(chktok).encode()).hexdigest()

logger.info(f"model: {name}")

@@ -193,7 +194,7 @@ src_func = f"""
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer

- chktxt = {repr(chktxt)}
+ chktxt = {repr(CHK_TXT)}

chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
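Aside (not part of the diff): the comments above describe how the update script fingerprints a model's pre-tokenizer. A minimal sketch of that mechanism, assuming a Hugging Face tokenizer object and the CHK_TXT string defined earlier are in scope; the helper name identify_pre_tokenizer is hypothetical:

from hashlib import sha256

def identify_pre_tokenizer(tokenizer, chk_txt):
    # hash the token ids produced for the check string; the digest acts as a
    # fingerprint of the source model's pre-tokenizer behaviour
    chktok = tokenizer.encode(chk_txt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()
    # the generated get_vocab_base_pre() maps known digests to a name that is
    # written as "tokenizer.ggml.pre" into the GGUF file, e.g.:
    if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
        return "tekken"   # ref: mistralai/Mistral-Nemo-Base-2407
    if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
        return "smollm"   # ref: HuggingFaceTB/SmolLM-135M
    return None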
@@ -289,7 +290,7 @@ tests = [
"333333333",
"Cửa Việt", # llama-bpe fails on this
" discards",
- chktxt,
+ CHK_TXT,
]

# write the tests to ./models/ggml-vocab-{name}.gguf.inp
@@ -132,6 +132,10 @@ class Tensor:
class GGMLModel:
+ file_format: GGMLFormat
+ format_version: int
+
def __init__(self):
self.hyperparameters = None
self.vocab = None

@@ -290,7 +294,7 @@ class GGMLToGGUF:
if self.vocab_override is not None:
vo = self.vocab_override
logger.info('* Adding vocab item(s)')
- for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+ for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
tokens.append(vbytes)
scores.append(score)
toktypes.append(ttype)
@@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
- return env->NewStringUTF("");
+ return nullptr;
}

auto new_token_chars = llama_token_to_piece(context, new_token_id);
@@ -444,7 +444,7 @@ node index.js
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.

- `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
+ `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.

`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
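Aside (not part of the diff): a minimal request sketch showing how `n_predict` and `n_keep` are typically passed to the server's `/completion` endpoint. The host, port, prompt, and use of the requests library below are illustrative assumptions:

import requests

payload = {
    "prompt": "Building a website can be done in 10 simple steps:",
    "n_predict": 64,  # stop after at most 64 generated tokens
    "n_keep": -1,     # keep all prompt tokens when the context overflows
    "stream": False,  # return the whole completion in a single response
}
resp = requests.post("http://127.0.0.1:8080/completion", json=payload)
resp.raise_for_status()
print(resp.json()["content"])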
flake.lock (generated, 6 changed lines)
@@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
- "lastModified": 1720768451,
+ "lastModified": 1721379653,
- "narHash": "sha256-EYekUHJE2gxeo2pM/zM9Wlqw1Uw2XTJXOSAO79ksc4Y=",
+ "narHash": "sha256-8MUgifkJ7lkZs3u99UDZMB4kbOxvMEXQZ31FO3SopZ0=",
"owner": "NixOS",
"repo": "nixpkgs",
- "rev": "7e7c39ea35c5cdd002cd4588b03a3fb9ece6fad9",
+ "rev": "1d9c2c9b3e71b9ee663d11c5d298727dace8d374",
"type": "github"
},
"original": {
@@ -4748,7 +4748,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);

- sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
+ sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
}

#elif defined(__POWER9_VECTOR__)
@@ -92,8 +92,9 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
- LLAMA_VOCAB_PRE_TYPE_CODESHELL = 20,
- LLAMA_VOCAB_PRE_TYPE_TEKKEN = 21,
+ LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
+ LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
+ LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
};

// note: these values should be synchronized with ggml_rope
Binary file not shown.
Binary file not shown.

@@ -3707,7 +3707,7 @@ struct llama_model_loader {
}

if (param_overrides_p != nullptr) {
- for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
+ for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
kv_overrides.insert({std::string(p->key), *p});
}
}

@@ -3875,7 +3875,7 @@ struct llama_model_loader {
ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);

{
- const int kid = gguf_find_key(meta, "general.file_type");
+ const int kid = gguf_find_key(meta, "general.file_type"); // TODO: use LLM_KV
if (kid >= 0) {
ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
}

@@ -5369,6 +5369,7 @@ static void llm_load_vocab(
if (merges_keyidx == -1) {
throw std::runtime_error("cannot find tokenizer merges in model file\n");
}

const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
for (int i = 0; i < n_merges; i++) {
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);

@@ -5407,16 +5408,6 @@ static void llm_load_vocab(
vocab.special_cls_id = -1;
vocab.special_mask_id = -1;

- const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
- if (add_space_prefix_keyidx != -1) {
-     vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
- } // The default value of add_space_prefix is true.
-
- const int remove_extra_whitespaces_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS).c_str());
- if (remove_extra_whitespaces_keyidx != -1) {
-     vocab.tokenizer_remove_extra_whitespaces = gguf_get_val_bool(ctx, remove_extra_whitespaces_keyidx);
- } // The default value of remove_extra_whitespaces is false.
-
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);

@@ -5533,6 +5524,10 @@ static void llm_load_vocab(
vocab.tokenizer_clean_spaces = false;
vocab.tokenizer_ignore_merges = true;
vocab.tokenizer_add_bos = true;
+ } else if (
+     tokenizer_pre == "smollm") {
+     vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+     vocab.tokenizer_clean_spaces = false;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}

@@ -5556,10 +5551,8 @@ static void llm_load_vocab(
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}

- const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
- if (add_space_prefix_keyidx != -1) {
-     vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
- }
+ ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.tokenizer_add_space_prefix, false);
+ ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
}

const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());

@@ -6140,10 +6133,10 @@ static bool llm_load_tensors(
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});

// optional bias tensors
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

@@ -15558,6 +15551,7 @@ struct llm_tokenizer_bpe {
case LLAMA_VOCAB_PRE_TYPE_REFACT:
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
+ case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
regex_exprs = {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",

@@ -18292,8 +18286,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// copy the KV pairs from the input file
gguf_set_kv (ctx_out, ml.meta);
- gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
+ gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
- gguf_set_val_u32(ctx_out, "general.file_type", ftype);
+ gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV

// Remove split metadata
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
@@ -70,21 +70,19 @@ add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
target_link_libraries(test-tokenizer-0 PRIVATE common)
install(TARGETS test-tokenizer-0 RUNTIME)

- llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
- llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
- llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
- llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
- # TODO: enable when fixed
- # https://github.com/ggerganov/llama.cpp/pull/7036
- #llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
- #llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
- #llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
- llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
- llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
- llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
+ llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+ llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+ llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+ llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
+ llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+ llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+ llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+ llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
+ llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+ llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)

# build test-tokenizer-1-bpe target once and add many tests
add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)

@@ -92,16 +90,14 @@ target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
install(TARGETS test-tokenizer-1-bpe RUNTIME)

# TODO: disabled due to slowness
- #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
- #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
- #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+ #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
- #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-stablelm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm.gguf)
+ #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
+ #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
+ #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
- #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
- #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf)

# build test-tokenizer-1-spm target once and add many tests
add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)