Merge branch 'master' into concedo
# Conflicts:
#	.devops/tools.sh
#	CMakeLists.txt
#	README.md
#	flake.nix
commit 3879d84400
9 changed files with 115 additions and 107 deletions

@@ -36,7 +36,8 @@ fname_out = sys.argv[3]
 fout = open(fname_out, "wb")
 
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
+fout.write(struct.pack("i", 1)) # file version
 fout.write(struct.pack("i", n_vocab))
 fout.write(struct.pack("i", n_embd))
 fout.write(struct.pack("i", n_mult))
@@ -49,27 +50,21 @@ fout.write(struct.pack("i", 4))
 # This loop unchanged from convert-pth-to-ggml.py:
 for i in range(tokenizer.vocab_size()):
     if tokenizer.is_unknown(i):
-        # "<unk>" token (translated as ??)
         text = " \u2047 ".encode("utf-8")
-        fout.write(struct.pack("i", len(text)))
-        fout.write(text)
     elif tokenizer.is_control(i):
-        # "<s>"/"</s>" tokens
-        fout.write(struct.pack("i", 0))
+        text = b""
     elif tokenizer.is_byte(i):
-        # "<U+XX>" tokens (which may be invalid UTF-8)
         piece = tokenizer.id_to_piece(i)
         if len(piece) != 6:
-            print("Invalid token: " + piece)
+            print(f"Invalid token: {piece}")
             sys.exit(1)
         byte_value = int(piece[3:-1], 16)
-        fout.write(struct.pack("i", 1))
-        fout.write(struct.pack("B", byte_value))
+        text = struct.pack("B", byte_value)
     else:
-        # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.
         text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-        fout.write(struct.pack("i", len(text)))
-        fout.write(text)
+    fout.write(struct.pack("i", len(text)))
+    fout.write(text)
+    fout.write(struct.pack("f", tokenizer.get_score(i)))
 
 def write_header(shape, dst_name, ftype_cur):
     sname = dst_name.encode('utf-8')
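
For orientation: 0x67676d6c is the ASCII string "ggml" and 0x67676d66 is "ggmf", so the converter now emits the newer "ggmf" container with an explicit file version, and each vocabulary entry becomes a fixed (int32 length, raw bytes, float score) record. Below is a minimal C++ reader sketch for just that vocabulary block; it is not code from this commit, and it assumes the stream is already positioned at the first token record and that n_vocab was read from the header.

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

struct vocab_entry {
    std::string text;   // raw token bytes (may be empty for control tokens)
    float       score;  // tokenizer score written by the converter
};

// Hypothetical helper: reads n_vocab (length, bytes, score) records,
// i.e. the layout produced by the converter loop shown above.
static std::vector<vocab_entry> read_vocab(std::FILE * f, int32_t n_vocab) {
    std::vector<vocab_entry> vocab(n_vocab);
    for (int32_t i = 0; i < n_vocab; ++i) {
        int32_t len = 0;
        std::fread(&len, sizeof(len), 1, f);               // struct.pack("i", len(text))
        vocab[i].text.resize(len);
        if (len > 0) {
            std::fread(&vocab[i].text[0], 1, len, f);      // fout.write(text)
        }
        std::fread(&vocab[i].score, sizeof(float), 1, f);  // struct.pack("f", score)
    }
    return vocab;
}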

@@ -1,66 +0,0 @@
-import os
-import sys
-from tqdm import tqdm
-import requests
-
-if len(sys.argv) < 3:
-    print("Usage: download-pth.py dir-model model-type\n")
-    print("  model-type: Available models 7B, 13B, 30B or 65B")
-    sys.exit(1)
-
-modelsDir = sys.argv[1]
-model = sys.argv[2]
-
-num = {
-    "7B": 1,
-    "13B": 2,
-    "30B": 4,
-    "65B": 8,
-}
-
-if model not in num:
-    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
-    sys.exit(1)
-
-print(f"Downloading model {model}")
-
-files = ["checklist.chk", "params.json"]
-
-for i in range(num[model]):
-    files.append(f"consolidated.0{i}.pth")
-
-resolved_path = os.path.abspath(os.path.join(modelsDir, model))
-os.makedirs(resolved_path, exist_ok=True)
-
-for file in files:
-    dest_path = os.path.join(resolved_path, file)
-
-    if os.path.exists(dest_path):
-        print(f"Skip file download, it already exists: {file}")
-        continue
-
-    url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
-    response = requests.get(url, stream=True)
-    with open(dest_path, 'wb') as f:
-        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
-            for chunk in response.iter_content(chunk_size=1024):
-                if chunk:
-                    f.write(chunk)
-                    t.update(len(chunk))
-
-files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
-for file in files2:
-    dest_path = os.path.join(modelsDir, file)
-
-    if os.path.exists(dest_path):
-        print(f"Skip file download, it already exists: {file}")
-        continue
-
-    url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
-    response = requests.get(url, stream=True)
-    with open(dest_path, 'wb') as f:
-        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
-            for chunk in response.iter_content(chunk_size=1024):
-                if chunk:
-                    f.write(chunk)
-                    t.update(len(chunk))

llama.cpp (12)
@@ -734,11 +734,13 @@ static bool llama_eval_internal(
             // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
             struct ggml_tensor * V_trans =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3);
+                ggml_cpy(ctx0,
+                    ggml_permute(ctx0,
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
+                        1, 2, 0, 3),
+                    ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
 
             // KQV = transpose(V) * KQ_soft_max
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);

llamacpp.dll (BIN)
Binary file not shown.

main.cpp (21)
@@ -258,6 +258,9 @@ int main(int argc, char ** argv) {
         params.interactive = true;
     }
 
+    // determine newline token
+    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
+
     fprintf(stderr, "\n");
     fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
     fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());

@@ -359,6 +362,16 @@ int main(int argc, char ** argv) {
                 last_n_tokens.push_back(id);
             }
 
+            // replace end of text token with newline token when in interactive mode
+            if (id == llama_token_eos() && params.interactive) {
+                id = llama_token_newline.front();
+                if (params.antiprompt.size() != 0) {
+                    // tokenize and inject first reverse prompt
+                    const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
+                }
+            }
+
             // add it to the context
             embd.push_back(id);

@@ -451,12 +464,8 @@ int main(int argc, char ** argv) {
 
         // end of text token
        if (embd.back() == llama_token_eos()) {
-            if (params.interactive) {
-                is_interacting = true;
-            } else {
-                fprintf(stderr, " [end of text]\n");
-                break;
-            }
+            fprintf(stderr, " [end of text]\n");
+            break;
        }
 
         // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.

main.exe (BIN)
Binary file not shown.

quantize.exe (BIN)
Binary file not shown.

@@ -57,6 +57,7 @@ def main():
     # )
 
     args = parser.parse_args()
+    args.models_path = os.path.abspath(args.models_path)
 
     if not os.path.isfile(args.quantize_script_path):
         print(

utils.cpp (101)
@@ -26,41 +26,95 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
     }
 
+    bool invalid_param = false;
+    std::string arg;
     for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
+        arg = argv[i];
 
         if (arg == "-s" || arg == "--seed") {
-            params.seed = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.seed = std::stoi(argv[i]);
         } else if (arg == "-t" || arg == "--threads") {
-            params.n_threads = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads = std::stoi(argv[i]);
         } else if (arg == "-p" || arg == "--prompt") {
-            params.prompt = argv[++i];
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.prompt = argv[i];
         } else if (arg == "-f" || arg == "--file") {
-            std::ifstream file(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i]);
             std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
             if (params.prompt.back() == '\n') {
                 params.prompt.pop_back();
             }
         } else if (arg == "-n" || arg == "--n_predict") {
-            params.n_predict = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_predict = std::stoi(argv[i]);
         } else if (arg == "--top_k") {
-            params.top_k = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.top_k = std::stoi(argv[i]);
         } else if (arg == "-c" || arg == "--ctx_size") {
-            params.n_ctx = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_ctx = std::stoi(argv[i]);
         } else if (arg == "--memory_f16") {
             params.memory_f16 = true;
         } else if (arg == "--top_p") {
-            params.top_p = std::stof(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.top_p = std::stof(argv[i]);
         } else if (arg == "--temp") {
-            params.temp = std::stof(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.temp = std::stof(argv[i]);
         } else if (arg == "--repeat_last_n") {
-            params.repeat_last_n = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.repeat_last_n = std::stoi(argv[i]);
         } else if (arg == "--repeat_penalty") {
-            params.repeat_penalty = std::stof(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.repeat_penalty = std::stof(argv[i]);
         } else if (arg == "-b" || arg == "--batch_size") {
-            params.n_batch = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_batch = std::stoi(argv[i]);
         } else if (arg == "-m" || arg == "--model") {
-            params.model = argv[++i];
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
         } else if (arg == "--interactive-first") {

@@ -70,13 +124,21 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "--color") {
             params.use_color = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
-            params.antiprompt.push_back(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.antiprompt.push_back(argv[i]);
         } else if (arg == "--perplexity") {
             params.perplexity = true;
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--n_parts") {
-            params.n_parts = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_parts = std::stoi(argv[i]);
         } else if (arg == "-h" || arg == "--help") {
             gpt_print_usage(argc, argv, params);
             exit(0);

@@ -85,9 +147,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             gpt_print_usage(argc, argv, params);
-            exit(0);
+            exit(1);
         }
     }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        gpt_print_usage(argc, argv, params);
+        exit(1);
+    }
 
     return true;
 }
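
Every value-taking option in gpt_params_parse now follows the same shape: advance i, check that an operand actually exists, only then parse it, and report once after the loop via invalid_param. Below is a standalone sketch of that bounds check with a hypothetical helper name; the merged code keeps the check inlined per option rather than factoring it out.

#include <string>   // std::stoi

// Hypothetical helper illustrating the repeated pattern: consume the next
// argument as an integer value, or flag the option as missing its operand.
static bool next_int_arg(int argc, char ** argv, int & i, int & out, bool & invalid_param) {
    if (++i >= argc) {          // e.g. "-t" given as the last argument
        invalid_param = true;
        return false;
    }
    out = std::stoi(argv[i]);
    return true;
}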