Merge branch 'master' into concedo
# Conflicts:
#	.devops/tools.sh
#	CMakeLists.txt
#	README.md
#	flake.nix
commit 3879d84400
9 changed files with 115 additions and 107 deletions

@@ -36,7 +36,8 @@ fname_out = sys.argv[3]
 fout = open(fname_out, "wb")
 
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
+fout.write(struct.pack("i", 1)) # file version
 fout.write(struct.pack("i", n_vocab))
 fout.write(struct.pack("i", n_embd))
 fout.write(struct.pack("i", n_mult))
@@ -49,27 +50,21 @@ fout.write(struct.pack("i", 4))
 # This loop unchanged from convert-pth-to-ggml.py:
 for i in range(tokenizer.vocab_size()):
     if tokenizer.is_unknown(i):
-        # "<unk>" token (translated as ??)
         text = " \u2047 ".encode("utf-8")
-        fout.write(struct.pack("i", len(text)))
-        fout.write(text)
     elif tokenizer.is_control(i):
-        # "<s>"/"</s>" tokens
-        fout.write(struct.pack("i", 0))
+        text = b""
     elif tokenizer.is_byte(i):
-        # "<U+XX>" tokens (which may be invalid UTF-8)
         piece = tokenizer.id_to_piece(i)
         if len(piece) != 6:
-            print("Invalid token: " + piece)
+            print(f"Invalid token: {piece}")
             sys.exit(1)
         byte_value = int(piece[3:-1], 16)
-        fout.write(struct.pack("i", 1))
-        fout.write(struct.pack("B", byte_value))
+        text = struct.pack("B", byte_value)
     else:
-        # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.
         text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-        fout.write(struct.pack("i", len(text)))
-        fout.write(text)
+    fout.write(struct.pack("i", len(text)))
+    fout.write(text)
+    fout.write(struct.pack("f", tokenizer.get_score(i)))
 
 def write_header(shape, dst_name, ftype_cur):
     sname = dst_name.encode('utf-8')
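
For orientation: 0x67676d6c is the ASCII string "ggml" and 0x67676d66 is "ggmf", so the converter now emits the newer "ggmf" container with an explicit file version, and each vocabulary entry becomes a fixed (int32 length, raw bytes, float score) record. Below is a minimal C++ reader sketch for just that vocabulary block; it is not code from this commit, and it assumes the stream is already positioned at the first token record and that n_vocab was read from the header.

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

struct vocab_entry {
    std::string text;   // raw token bytes (may be empty for control tokens)
    float       score;  // tokenizer score written by the converter
};

// Hypothetical helper: reads n_vocab (length, bytes, score) records,
// i.e. the layout produced by the converter loop shown above.
static std::vector<vocab_entry> read_vocab(std::FILE * f, int32_t n_vocab) {
    std::vector<vocab_entry> vocab(n_vocab);
    for (int32_t i = 0; i < n_vocab; ++i) {
        int32_t len = 0;
        std::fread(&len, sizeof(len), 1, f);               // struct.pack("i", len(text))
        vocab[i].text.resize(len);
        if (len > 0) {
            std::fread(&vocab[i].text[0], 1, len, f);      // fout.write(text)
        }
        std::fread(&vocab[i].score, sizeof(float), 1, f);  // struct.pack("f", score)
    }
    return vocab;
}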

@@ -1,66 +0,0 @@
-import os
-import sys
-from tqdm import tqdm
-import requests
-
-if len(sys.argv) < 3:
-    print("Usage: download-pth.py dir-model model-type\n")
-    print("  model-type: Available models 7B, 13B, 30B or 65B")
-    sys.exit(1)
-
-modelsDir = sys.argv[1]
-model = sys.argv[2]
-
-num = {
-    "7B": 1,
-    "13B": 2,
-    "30B": 4,
-    "65B": 8,
-}
-
-if model not in num:
-    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
-    sys.exit(1)
-
-print(f"Downloading model {model}")
-
-files = ["checklist.chk", "params.json"]
-
-for i in range(num[model]):
-    files.append(f"consolidated.0{i}.pth")
-
-resolved_path = os.path.abspath(os.path.join(modelsDir, model))
-os.makedirs(resolved_path, exist_ok=True)
-
-for file in files:
-    dest_path = os.path.join(resolved_path, file)
-
-    if os.path.exists(dest_path):
-        print(f"Skip file download, it already exists: {file}")
-        continue
-
-    url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
-    response = requests.get(url, stream=True)
-    with open(dest_path, 'wb') as f:
-        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
-            for chunk in response.iter_content(chunk_size=1024):
-                if chunk:
-                    f.write(chunk)
-                    t.update(len(chunk))
-
-files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
-for file in files2:
-    dest_path = os.path.join(modelsDir, file)
-
-    if os.path.exists(dest_path):
-        print(f"Skip file download, it already exists: {file}")
-        continue
-
-    url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
-    response = requests.get(url, stream=True)
-    with open(dest_path, 'wb') as f:
-        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
-            for chunk in response.iter_content(chunk_size=1024):
-                if chunk:
-                    f.write(chunk)
-                    t.update(len(chunk))

llama.cpp (12)
@@ -734,11 +734,13 @@ static bool llama_eval_internal(
             // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
             struct ggml_tensor * V_trans =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3);
+                ggml_cpy(ctx0,
+                    ggml_permute(ctx0,
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
+                        1, 2, 0, 3),
+                    ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
 
             // KQV = transpose(V) * KQ_soft_max
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);

llamacpp.dll (BIN)
Binary file not shown.

main.cpp (21)
@@ -258,6 +258,9 @@ int main(int argc, char ** argv) {
         params.interactive = true;
     }
 
+    // determine newline token
+    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
+
     fprintf(stderr, "\n");
     fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
     fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());

@@ -359,6 +362,16 @@ int main(int argc, char ** argv) {
                 last_n_tokens.push_back(id);
             }
 
+            // replace end of text token with newline token when in interactive mode
+            if (id == llama_token_eos() && params.interactive) {
+                id = llama_token_newline.front();
+                if (params.antiprompt.size() != 0) {
+                    // tokenize and inject first reverse prompt
+                    const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
+                }
+            }
+
             // add it to the context
             embd.push_back(id);

@@ -451,12 +464,8 @@ int main(int argc, char ** argv) {
 
         // end of text token
        if (embd.back() == llama_token_eos()) {
-            if (params.interactive) {
-                is_interacting = true;
-            } else {
-                fprintf(stderr, " [end of text]\n");
-                break;
-            }
+            fprintf(stderr, " [end of text]\n");
+            break;
        }
 
         // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.

main.exe (BIN)
Binary file not shown.

quantize.exe (BIN)
Binary file not shown.

@@ -57,6 +57,7 @@ def main():
     # )
 
     args = parser.parse_args()
+    args.models_path = os.path.abspath(args.models_path)
 
     if not os.path.isfile(args.quantize_script_path):
         print(

utils.cpp (101)
@@ -26,41 +26,95 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
     }
 
+    bool invalid_param = false;
+    std::string arg;
     for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
+        arg = argv[i];
 
         if (arg == "-s" || arg == "--seed") {
-            params.seed = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.seed = std::stoi(argv[i]);
         } else if (arg == "-t" || arg == "--threads") {
-            params.n_threads = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads = std::stoi(argv[i]);
         } else if (arg == "-p" || arg == "--prompt") {
-            params.prompt = argv[++i];
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.prompt = argv[i];
         } else if (arg == "-f" || arg == "--file") {
-            std::ifstream file(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i]);
             std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
             if (params.prompt.back() == '\n') {
                 params.prompt.pop_back();
             }
         } else if (arg == "-n" || arg == "--n_predict") {
-            params.n_predict = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_predict = std::stoi(argv[i]);
         } else if (arg == "--top_k") {
-            params.top_k = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.top_k = std::stoi(argv[i]);
         } else if (arg == "-c" || arg == "--ctx_size") {
-            params.n_ctx = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_ctx = std::stoi(argv[i]);
         } else if (arg == "--memory_f16") {
             params.memory_f16 = true;
         } else if (arg == "--top_p") {
-            params.top_p = std::stof(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.top_p = std::stof(argv[i]);
         } else if (arg == "--temp") {
-            params.temp = std::stof(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.temp = std::stof(argv[i]);
         } else if (arg == "--repeat_last_n") {
-            params.repeat_last_n = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.repeat_last_n = std::stoi(argv[i]);
         } else if (arg == "--repeat_penalty") {
-            params.repeat_penalty = std::stof(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.repeat_penalty = std::stof(argv[i]);
         } else if (arg == "-b" || arg == "--batch_size") {
-            params.n_batch = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_batch = std::stoi(argv[i]);
         } else if (arg == "-m" || arg == "--model") {
-            params.model = argv[++i];
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
         } else if (arg == "--interactive-first") {

@@ -70,13 +124,21 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "--color") {
             params.use_color = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
-            params.antiprompt.push_back(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.antiprompt.push_back(argv[i]);
         } else if (arg == "--perplexity") {
             params.perplexity = true;
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--n_parts") {
-            params.n_parts = std::stoi(argv[++i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_parts = std::stoi(argv[i]);
         } else if (arg == "-h" || arg == "--help") {
             gpt_print_usage(argc, argv, params);
             exit(0);

@@ -85,9 +147,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             gpt_print_usage(argc, argv, params);
-            exit(0);
+            exit(1);
         }
     }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        gpt_print_usage(argc, argv, params);
+        exit(1);
+    }
 
     return true;
 }
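
Every value-taking option in gpt_params_parse now follows the same shape: advance i, check that an operand actually exists, only then parse it, and report once after the loop via invalid_param. Below is a standalone sketch of that bounds check with a hypothetical helper name; the merged code keeps the check inlined per option rather than factoring it out.

#include <string>   // std::stoi

// Hypothetical helper illustrating the repeated pattern: consume the next
// argument as an integer value, or flag the option as missing its operand.
static bool next_int_arg(int argc, char ** argv, int & i, int & out, bool & invalid_param) {
    if (++i >= argc) {          // e.g. "-t" given as the last argument
        invalid_param = true;
        return false;
    }
    out = std::stoi(argv[i]);
    return true;
}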