diff --git a/convert-llama-7b-pth-to-gguf.py b/convert-llama-7b-pth-to-gguf.py
index 53ab5a3ed..27841939d 100644
--- a/convert-llama-7b-pth-to-gguf.py
+++ b/convert-llama-7b-pth-to-gguf.py
@@ -132,7 +132,7 @@ if Path(dir_model + "/tokenizer.model").is_file():
         toktype = 1  # defualt to normal token type
         if tokenizer.is_unknown(i): toktype = 2
         if tokenizer.is_control(i): toktype = 3
-    
+
         # TODO: How to determinate if a token is user defined?
         # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
         # if tokenizer.is_user_defined(i): toktype = 4
@@ -223,7 +223,7 @@ for part_name in part_names:
             sys.exit()

         n_dims = len(data.shape)
-        data_dtype = data.dtype 
+        data_dtype = data.dtype

         # if f32 desired, convert any float16 to float32
         if ftype == 0 and data.dtype == np.float16:
@@ -261,7 +261,6 @@ for part_name in part_names:

     for name in model_part.keys():
         data = model_part[name]
-
         old_dtype = data.dtype

         # we don't need these
@@ -284,7 +283,7 @@ for part_name in part_names:
             sys.exit()

         n_dims = len(data.shape)
-        data_dtype = data.dtype 
+        data_dtype = data.dtype

         # if f32 desired, convert any float16 to float32
         if ftype == 0 and data.dtype == np.float16:
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 9c15febb5..5c2f64883 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -266,9 +266,6 @@ int main(int argc, char ** argv) {
         params.interactive = true;
     }

-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
     if (params.verbose_prompt) {
         fprintf(stderr, "\n");
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
@@ -778,8 +775,7 @@ int main(int argc, char ** argv) {
             if (grammar != NULL) {
                 llama_grammar_free(grammar);

-                std::vector<const llama_grammar_element *> grammar_rules(
-                    parsed_grammar.c_rules());
+                std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
                 grammar = llama_grammar_init(
                     grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 744f549c5..f628d0642 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -68,10 +68,10 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 }

 // usage:
-// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 void usage(const char * executable) {
-    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
+    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     fprintf(stderr, "\nAllowed quantization types:\n");
@@ -118,8 +118,8 @@ int main(int argc, char ** argv) {
         if (pos != std::string::npos) {
             fpath = fname_inp.substr(0, pos + 1);
         }
-        // export as [inp path]/ggml-model-[ftype].bin
-        fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
+        // export as [inp path]/ggml-model-[ftype].gguf
+        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
         arg_idx++;
     }
     else {
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index d5a81978e..3db61b754 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -26,7 +26,6 @@ int main(int argc, char ** argv) {
     auto lparams = llama_context_default_params();

     lparams.n_ctx     = params.n_ctx;
-    lparams.n_gqa     = params.n_gqa;
     lparams.seed      = params.seed;
     lparams.f16_kv    = params.memory_f16;
     lparams.use_mmap  = params.use_mmap;
diff --git a/llama.cpp b/llama.cpp
index f297e7c24..592e7e48a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4774,23 +4774,26 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }

-int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * str, int length) {
+// does not write null-terminator to str
+int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_n_vocab_from_model(model)) {
         if (llama_is_normal_token(model->vocab, token)) {
             std::string result = model->vocab.id_to_token[token].tok;
-            if(llama_vocab_type(model->vocab) == "spm") {
+            if (llama_vocab_type(model->vocab) == "spm") {
                 result = llama_unescape_whitespace(result);
             }
             if (length < (int) result.length()) {
                 return -result.length();
             }
-            strncpy(str, result.c_str(), result.length());
+            memcpy(buf, result.c_str(), result.length());
             return result.length();
         } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
             if (length < 3) {
                 return -3;
             }
-            strncpy(str, "\xe2\x96\x85", 4);
+            buf[0] = '\xe2';
+            buf[1] = '\x96';
+            buf[2] = '\x85';
             return 3;
         } else if (llama_is_control_token(model->vocab, token)) {
             ;
@@ -4798,8 +4801,7 @@ int llama_token_to_str_with_model(const struct llama_model * model, llama_token
             if (length < 1) {
                 return -1;
             }
-            str[0] = llama_byte_to_char(model->vocab, token);
-            str[1] = 0x00;
+            buf[0] = llama_byte_to_char(model->vocab, token);
             return 1;
         }
     }
@@ -4830,7 +4832,7 @@ int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token,
         if (length < (int) result.length()) {
             return -result.length();
         }
-        strncpy(str, result.c_str(), result.length());
+        memcpy(str, result.c_str(), result.length());
         return result.length();
     }
     return 0;
diff --git a/llama.h b/llama.h
index d81a1b5de..7bae54d6a 100644
--- a/llama.h
+++ b/llama.h
@@ -355,22 +355,23 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

     // Token Id -> String. Uses the vocabulary in the provided context
+    // Does not write null terminator to the buffer
     LLAMA_API int llama_token_to_str(
             const struct llama_context * ctx,
                          llama_token   token,
-                               char * str,
+                               char * buf,
                                 int   length);

     LLAMA_API int llama_token_to_str_bpe(
             const struct llama_context * ctx,
                          llama_token   token,
-                               char * str,
+                               char * buf,
                                 int   length);

     LLAMA_API int llama_token_to_str_with_model(
             const struct llama_model * model,
                          llama_token   token,
-                               char * str,
+                               char * buf,
                                 int   length);

     // Special tokens
     LLAMA_API llama_token llama_token_bos(void);  // beginning-of-sentence