diff --git a/convert-gptq-to-ggml.py b/convert-gptq-to-ggml.py
index 7fccb4d56..6c77808fc 100644
--- a/convert-gptq-to-ggml.py
+++ b/convert-gptq-to-ggml.py
@@ -36,7 +36,8 @@ fname_out = sys.argv[3]
 
 fout = open(fname_out, "wb")
 
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
+fout.write(struct.pack("i", 1)) # file version
 fout.write(struct.pack("i", n_vocab))
 fout.write(struct.pack("i", n_embd))
 fout.write(struct.pack("i", n_mult))
@@ -49,27 +50,21 @@ fout.write(struct.pack("i", 4))
 # This loop unchanged from convert-pth-to-ggml.py:
 for i in range(tokenizer.vocab_size()):
     if tokenizer.is_unknown(i):
-        # "<unk>" token (translated as ??)
         text = " \u2047 ".encode("utf-8")
-        fout.write(struct.pack("i", len(text)))
-        fout.write(text)
     elif tokenizer.is_control(i):
-        # "<s>"/"</s>" tokens
-        fout.write(struct.pack("i", 0))
+        text = b""
     elif tokenizer.is_byte(i):
-        # "<U+XX>" tokens (which may be invalid UTF-8)
         piece = tokenizer.id_to_piece(i)
         if len(piece) != 6:
-            print("Invalid token: " + piece)
+            print(f"Invalid token: {piece}")
             sys.exit(1)
         byte_value = int(piece[3:-1], 16)
-        fout.write(struct.pack("i", 1))
-        fout.write(struct.pack("B", byte_value))
+        text = struct.pack("B", byte_value)
     else:
-        # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.
         text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-        fout.write(struct.pack("i", len(text)))
-        fout.write(text)
+    fout.write(struct.pack("i", len(text)))
+    fout.write(text)
+    fout.write(struct.pack("f", tokenizer.get_score(i)))
 
 def write_header(shape, dst_name, ftype_cur):
     sname = dst_name.encode('utf-8')
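Note on the convert-gptq-to-ggml.py hunk: the magic/version pair plus the appended per-token score give the output a new, versioned "ggmf" layout. A minimal read-back sketch in Python; everything past n_mult is an assumed layout based on the stock converters, not something this diff pins down:

    import struct
    import sys

    # Sanity-check the versioned header written by the converter above.
    with open(sys.argv[1], "rb") as f:
        magic, version = struct.unpack("<ii", f.read(8))
        assert magic == 0x67676d66, "not a ggmf file"
        assert version == 1, "unsupported file version"
        n_vocab, n_embd, n_mult = struct.unpack("<iii", f.read(12))
        # (assumption: the remaining int32 hyperparameters follow here)
        # Each vocab entry is now: int32 length, <length> bytes of token
        # text, then the float32 sentencepiece score added by this change.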
diff --git a/download-pth.py b/download-pth.py
deleted file mode 100644
index 129532c0c..000000000
--- a/download-pth.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import os
-import sys
-from tqdm import tqdm
-import requests
-
-if len(sys.argv) < 3:
-    print("Usage: download-pth.py dir-model model-type\n")
-    print("  model-type: Available models 7B, 13B, 30B or 65B")
-    sys.exit(1)
-
-modelsDir = sys.argv[1]
-model = sys.argv[2]
-
-num = {
-    "7B": 1,
-    "13B": 2,
-    "30B": 4,
-    "65B": 8,
-}
-
-if model not in num:
-    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
-    sys.exit(1)
-
-print(f"Downloading model {model}")
-
-files = ["checklist.chk", "params.json"]
-
-for i in range(num[model]):
-    files.append(f"consolidated.0{i}.pth")
-
-resolved_path = os.path.abspath(os.path.join(modelsDir, model))
-os.makedirs(resolved_path, exist_ok=True)
-
-for file in files:
-    dest_path = os.path.join(resolved_path, file)
-
-    if os.path.exists(dest_path):
-        print(f"Skip file download, it already exists: {file}")
-        continue
-
-    url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
-    response = requests.get(url, stream=True)
-    with open(dest_path, 'wb') as f:
-        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
-            for chunk in response.iter_content(chunk_size=1024):
-                if chunk:
-                    f.write(chunk)
-                    t.update(len(chunk))
-
-files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
-for file in files2:
-    dest_path = os.path.join(modelsDir, file)
-
-    if os.path.exists(dest_path):
-        print(f"Skip file download, it already exists: {file}")
-        continue
-
-    url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
-    response = requests.get(url, stream=True)
-    with open(dest_path, 'wb') as f:
-        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
-            for chunk in response.iter_content(chunk_size=1024):
-                if chunk:
-                    f.write(chunk)
-                    t.update(len(chunk))
\ No newline at end of file
diff --git a/llama.cpp b/llama.cpp
index 3701ca16d..710f83a10 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -734,11 +734,13 @@ static bool llama_eval_internal(
 
             // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
             struct ggml_tensor * V_trans =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3);
+                ggml_cpy(ctx0,
+                    ggml_permute(ctx0,
+                            ggml_reshape_3d(ctx0,
+                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                                n_embd/n_head, n_head, n_past + N),
+                            1, 2, 0, 3),
+                    ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
 
             // KQV = transpose(V) * KQ_soft_max
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
diff --git a/llamacpp.dll b/llamacpp.dll
index de83ebb1f..422696934 100644
Binary files a/llamacpp.dll and b/llamacpp.dll differ
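Note on the llama.cpp hunk: ggml_permute only rewrites strides, so the old V_trans was a non-contiguous view; the added ggml_cpy materializes it into a fresh contiguous F32 tensor before ggml_mul_mat. A NumPy analogy of the view-versus-copy distinction (ggml and NumPy number axes differently, so the shapes here are illustrative only):

    import numpy as np

    # Stand-ins for n_embd/n_head, n_head, n_past + N.
    head_dim, n_head, n_tok = 4, 2, 3
    V = np.arange(head_dim * n_head * n_tok, dtype=np.float32).reshape(head_dim, n_head, n_tok)

    # Like ggml_permute, transpose() only rewrites strides -> a view:
    V_view = V.transpose(2, 0, 1)
    print(V_view.flags["C_CONTIGUOUS"])   # False

    # Like ggml_cpy into a new tensor, this forces a contiguous copy,
    # letting the following matmul stream through memory linearly:
    V_trans = np.ascontiguousarray(V_view)
    print(V_trans.flags["C_CONTIGUOUS"])  # True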
diff --git a/main.cpp b/main.cpp
index 431c94b52..5ba6d5a75 100644
--- a/main.cpp
+++ b/main.cpp
@@ -258,6 +258,9 @@ int main(int argc, char ** argv) {
         params.interactive = true;
     }
 
+    // determine newline token
+    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
+
     fprintf(stderr, "\n");
     fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
     fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
@@ -359,6 +362,16 @@ int main(int argc, char ** argv) {
                 last_n_tokens.push_back(id);
             }
 
+            // replace end of text token with newline token when in interactive mode
+            if (id == llama_token_eos() && params.interactive) {
+                id = llama_token_newline.front();
+                if (params.antiprompt.size() != 0) {
+                    // tokenize and inject first reverse prompt
+                    const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
+                }
+            }
+
             // add it to the context
             embd.push_back(id);
 
@@ -451,12 +464,8 @@ int main(int argc, char ** argv) {
 
         // end of text token
         if (embd.back() == llama_token_eos()) {
-            if (params.interactive) {
-                is_interacting = true;
-            } else {
-                fprintf(stderr, " [end of text]\n");
-                break;
-            }
+            fprintf(stderr, " [end of text]\n");
+            break;
         }
 
         // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
diff --git a/main.exe b/main.exe
index c44900cb2..afeb0eda6 100644
Binary files a/main.exe and b/main.exe differ
diff --git a/quantize.exe b/quantize.exe
index 677372d9c..c90b88d73 100644
Binary files a/quantize.exe and b/quantize.exe differ
diff --git a/quantize.py b/quantize.py
index 6320b0a26..16b5963d3 100644
--- a/quantize.py
+++ b/quantize.py
@@ -57,6 +57,7 @@ def main():
     # )
 
     args = parser.parse_args()
+    args.models_path = os.path.abspath(args.models_path)
 
     if not os.path.isfile(args.quantize_script_path):
         print(
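Note on the main.cpp hunks: in interactive mode an end-of-text token no longer stops generation; it is swapped for the pre-tokenized "\n" and, if a reverse prompt was given, that prompt is injected so control returns to the user. A Python sketch of the control flow; tokenize() and the token ids are dummy stand-ins, not the llama.cpp API:

    # Placeholder tokenizer standing in for ::llama_tokenize.
    def tokenize(text):
        return [ord(c) for c in text]

    EOS = 2                          # stand-in for llama_token_eos()
    newline_tok = tokenize("\n")[0]  # computed once, before the loop

    def next_token(sampled_id, interactive, antiprompts, embd_inp):
        if sampled_id == EOS and interactive:
            # Keep the session alive: emit "\n" instead of stopping, and
            # queue the first reverse prompt so the user gets control back.
            sampled_id = newline_tok
            if antiprompts:
                embd_inp.extend(tokenize(antiprompts[0]))
        return sampled_id

    pending = []
    print(next_token(EOS, True, ["User:"], pending), pending)

Note on the quantize.py hunk: resolving models_path once, up front, keeps later relative-path lookups stable. A tiny illustration of why (the paths are hypothetical):

    import os

    models_path = "models"
    print(os.path.abspath(models_path))  # e.g. /home/user/llama.cpp/models
    os.chdir("/tmp")                     # anything that changes cwd later...
    print(os.path.abspath(models_path))  # ...would now resolve to /tmp/models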
diff --git a/utils.cpp b/utils.cpp
index 1d5309c3a..45c9cabb1 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -26,41 +26,95 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
     }
 
+    bool invalid_param = false;
+    std::string arg;
     for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
+        arg = argv[i];
 
         if (arg == "-s" || arg == "--seed") {
-            params.seed = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.seed = std::stoi(argv[i]);
         } else if (arg == "-t" || arg == "--threads") {
-            params.n_threads = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads = std::stoi(argv[i]);
         } else if (arg == "-p" || arg == "--prompt") {
-            params.prompt = argv[++i];
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.prompt = argv[i];
         } else if (arg == "-f" || arg == "--file") {
-            std::ifstream file(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i]);
             std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
             if (params.prompt.back() == '\n') {
                 params.prompt.pop_back();
             }
         } else if (arg == "-n" || arg == "--n_predict") {
-            params.n_predict = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_predict = std::stoi(argv[i]);
         } else if (arg == "--top_k") {
-            params.top_k = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.top_k = std::stoi(argv[i]);
         } else if (arg == "-c" || arg == "--ctx_size") {
-            params.n_ctx = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_ctx = std::stoi(argv[i]);
         } else if (arg == "--memory_f16") {
             params.memory_f16 = true;
         } else if (arg == "--top_p") {
-            params.top_p = std::stof(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.top_p = std::stof(argv[i]);
         } else if (arg == "--temp") {
-            params.temp = std::stof(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.temp = std::stof(argv[i]);
         } else if (arg == "--repeat_last_n") {
-            params.repeat_last_n = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.repeat_last_n = std::stoi(argv[i]);
         } else if (arg == "--repeat_penalty") {
-            params.repeat_penalty = std::stof(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.repeat_penalty = std::stof(argv[i]);
         } else if (arg == "-b" || arg == "--batch_size") {
-            params.n_batch = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_batch = std::stoi(argv[i]);
         } else if (arg == "-m" || arg == "--model") {
-            params.model = argv[++i];
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
         } else if (arg == "--interactive-first") {
@@ -70,13 +124,21 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "--color") {
            params.use_color = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
-            params.antiprompt.push_back(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.antiprompt.push_back(argv[i]);
         } else if (arg == "--perplexity") {
             params.perplexity = true;
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--n_parts") {
-            params.n_parts = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_parts = std::stoi(argv[i]);
         } else if (arg == "-h" || arg == "--help") {
             gpt_print_usage(argc, argv, params);
             exit(0);
@@ -85,9 +147,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             gpt_print_usage(argc, argv, params);
-            exit(0);
+            exit(1);
         }
     }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        gpt_print_usage(argc, argv, params);
+        exit(1);
+    }
 
     return true;
 }
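Note on the utils.cpp hunks: every option that consumes a value now bounds-checks ++i instead of reading past the end of argv, and a missing value is reported after the loop with exit(1). The same guard pattern, sketched in Python with two sample options; the rest of the option table is elided:

    import sys

    def parse_args(argv):
        params = {}
        invalid_param = False
        arg = None
        i = 1
        while i < len(argv):
            arg = argv[i]
            if arg in ("-s", "--seed"):
                i += 1
                if i >= len(argv):            # flag given without a value
                    invalid_param = True
                    break
                params["seed"] = int(argv[i])
            elif arg in ("-i", "--interactive"):
                params["interactive"] = True  # boolean flags need no lookahead
            else:
                sys.exit(f"error: unknown argument: {arg}")
            i += 1
        if invalid_param:
            sys.exit(f"error: invalid parameter for argument: {arg}")
        return params

    print(parse_args(["prog", "-s", "42", "-i"]))  # {'seed': 42, 'interactive': True}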