llama : fix strncpy warning + note token_to_str does not write null

Commit 5b94b14d5d (parent a49931300a)
Author: Georgi Gerganov
Date: 2023-08-16 15:28:09 +03:00
6 changed files with 21 additions and 24 deletions


@@ -132,7 +132,7 @@ if Path(dir_model + "/tokenizer.model").is_file():
toktype = 1 # defualt to normal token type
if tokenizer.is_unknown(i): toktype = 2
if tokenizer.is_control(i): toktype = 3
# TODO: How to determinate if a token is user defined?
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = 4
@@ -223,7 +223,7 @@ for part_name in part_names:
sys.exit()
n_dims = len(data.shape)
-data_dtype = data.dtype
+data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16:
@@ -261,7 +261,6 @@ for part_name in part_names:
for name in model_part.keys():
data = model_part[name]
old_dtype = data.dtype
# we don't need these
@@ -284,7 +283,7 @@ for part_name in part_names:
sys.exit()
n_dims = len(data.shape)
-data_dtype = data.dtype
+data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16:
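
For reference, the toktype integers written by the converter mirror the sentencepiece token categories. A minimal C++ sketch of the same mapping (the enum and constant names are illustrative; only the numeric values come from the code above):

    // Illustrative mirror of the token-type integers written by the converter.
    enum llama_token_type_sketch {
        TOKEN_TYPE_NORMAL       = 1, // the default above
        TOKEN_TYPE_UNKNOWN      = 2, // tokenizer.is_unknown(i)
        TOKEN_TYPE_CONTROL      = 3, // tokenizer.is_control(i)
        TOKEN_TYPE_USER_DEFINED = 4, // commented out above, pending the TODO
    };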


@@ -266,9 +266,6 @@ int main(int argc, char ** argv) {
params.interactive = true;
}
-// determine newline token
-auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
if (params.verbose_prompt) {
fprintf(stderr, "\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
@ -778,8 +775,7 @@ int main(int argc, char ** argv) {
if (grammar != NULL) {
llama_grammar_free(grammar);
std::vector<const llama_grammar_element *> grammar_rules(
parsed_grammar.c_rules());
std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(),
parsed_grammar.symbol_ids.at("root"));
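
For context, the grammar object being rebuilt here follows a parse → init → free lifecycle. A sketch of that pattern, assuming the grammar_parser helper from common/grammar-parser.h and eliding error handling:

    #include "grammar-parser.h"
    #include "llama.h"

    #include <vector>

    // Sketch: build (or rebuild, after llama_grammar_free) a grammar from
    // GBNF text. The rules vector and the "root" symbol id come from the
    // parsed grammar, exactly as in the diff above.
    static struct llama_grammar * build_grammar(const char * gbnf,
                                                grammar_parser::parse_state & parsed_grammar) {
        parsed_grammar = grammar_parser::parse(gbnf);
        if (parsed_grammar.rules.empty()) {
            return NULL; // empty or unparsable grammar
        }
        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
        return llama_grammar_init(
            grammar_rules.data(), grammar_rules.size(),
            parsed_grammar.symbol_ids.at("root"));
    }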


@@ -68,10 +68,10 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
}
// usage:
-// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
void usage(const char * executable) {
-fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
+fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
fprintf(stderr, "\nAllowed quantization types:\n");
@@ -118,8 +118,8 @@ int main(int argc, char ** argv) {
if (pos != std::string::npos) {
fpath = fname_inp.substr(0, pos + 1);
}
-// export as [inp path]/ggml-model-[ftype].bin
-fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
+// export as [inp path]/ggml-model-[ftype].gguf
+fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
arg_idx++;
}
else {
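
The default output name above is derived by keeping the input file's directory and substituting ggml-model-<ftype>.gguf for the basename. The same logic as a standalone sketch (the helper name is hypothetical):

    #include <string>

    // Hypothetical helper mirroring the default-output-path logic above.
    static std::string default_quant_output(const std::string & fname_inp, const std::string & ftype_str) {
        std::string fpath;
        const size_t pos = fname_inp.find_last_of('/');
        if (pos != std::string::npos) {
            fpath = fname_inp.substr(0, pos + 1); // keep the trailing '/'
        }
        // export as [inp path]/ggml-model-[ftype].gguf
        return fpath + "ggml-model-" + ftype_str + ".gguf";
    }

For example, an input of models/llama/ggml-model-f16.gguf with type q4_0 would yield models/llama/ggml-model-q4_0.gguf.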


@@ -26,7 +26,6 @@ int main(int argc, char ** argv) {
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_gqa = params.n_gqa;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
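
The surrounding setup is the usual pattern of taking llama_context_default_params() and overriding a few fields. A minimal sketch, assuming the model/context split available around this commit (values are illustrative):

    #include "llama.h"

    int main() {
        llama_backend_init(false /* numa */);

        // start from the defaults and override only what the example needs
        struct llama_context_params lparams = llama_context_default_params();
        lparams.n_ctx    = 512;  // illustrative values
        lparams.seed     = 42;
        lparams.f16_kv   = true;
        lparams.use_mmap = true;

        struct llama_model * model = llama_load_model_from_file("models/llama/ggml-model.gguf", lparams);
        if (model == NULL) {
            return 1;
        }
        struct llama_context * ctx = llama_new_context_with_model(model, lparams);

        // ... run the example against ctx ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }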


@@ -4774,23 +4774,26 @@ float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embedding.data();
}
-int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * str, int length) {
+// does not write null-terminator to str
+int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
if (0 <= token && token < llama_n_vocab_from_model(model)) {
if (llama_is_normal_token(model->vocab, token)) {
std::string result = model->vocab.id_to_token[token].tok;
-if(llama_vocab_type(model->vocab) == "spm") {
+if (llama_vocab_type(model->vocab) == "spm") {
result = llama_unescape_whitespace(result);
}
if (length < (int) result.length()) {
return -result.length();
}
-strncpy(str, result.c_str(), result.length());
+memcpy(buf, result.c_str(), result.length());
return result.length();
} else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
if (length < 3) {
return -3;
}
-strncpy(str, "\xe2\x96\x85", 4);
+buf[0] = '\xe2';
+buf[1] = '\x96';
+buf[2] = '\x85';
return 3;
} else if (llama_is_control_token(model->vocab, token)) {
;
@@ -4798,8 +4801,7 @@ int llama_token_to_str_with_model(const struct llama_model * model, llama_token
if (length < 1) {
return -1;
}
-str[0] = llama_byte_to_char(model->vocab, token);
-str[1] = 0x00;
+buf[0] = llama_byte_to_char(model->vocab, token);
return 1;
}
}
@@ -4830,7 +4832,7 @@ int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token,
if (length < (int) result.length()) {
return -result.length();
}
-strncpy(str, result.c_str(), result.length());
+memcpy(str, result.c_str(), result.length());
return result.length();
}
return 0;
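
The strncpy replacement is what the commit title refers to: copying exactly strlen(src) bytes with strncpy never writes a terminator, which GCC 8+ flags with -Wstringop-truncation. Since the API is now documented not to NUL-terminate, memcpy states that intent without the warning. A minimal illustration (hypothetical function):

    #include <cstring>

    // Hypothetical illustration: dst deliberately receives no '\0'.
    void copy_token_bytes(char * dst, const char * src, size_t n) {
        // strncpy(dst, src, n); // warns: may leave dst unterminated
        memcpy(dst, src, n);     // copies exactly n bytes, no terminator implied
    }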


@@ -355,22 +355,23 @@ extern "C" {
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
+// Does not write null terminator to the buffer
LLAMA_API int llama_token_to_str(
const struct llama_context * ctx,
llama_token token,
-char * str,
+char * buf,
int length);
LLAMA_API int llama_token_to_str_bpe(
const struct llama_context * ctx,
llama_token token,
-char * str,
+char * buf,
int length);
LLAMA_API int llama_token_to_str_with_model(
const struct llama_model * model,
llama_token token,
-char * str,
+char * buf,
int length);
// Special tokens
LLAMA_API llama_token llama_token_bos(void); // beginning-of-sentence
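
Because none of these functions write a terminator, callers should construct strings from the returned length, growing the buffer when the call returns a negative required size. A minimal caller-side sketch (the wrapper name is illustrative):

    #include "llama.h"

    #include <algorithm>
    #include <string>
    #include <vector>

    // Illustrative wrapper over the contract above: a negative return is the
    // required buffer size (negated); on success no '\0' is written, so the
    // string is built from the returned length.
    static std::string token_to_string(const struct llama_context * ctx, llama_token token) {
        std::vector<char> buf(8);
        int n = llama_token_to_str(ctx, token, buf.data(), (int) buf.size());
        if (n < 0) {
            buf.resize(-n); // grow to the size the API asked for
            n = llama_token_to_str(ctx, token, buf.data(), (int) buf.size());
        }
        return std::string(buf.data(), std::max(n, 0));
    }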