Merge branch 'master' into wbpxre150
This commit is contained in:
commit
949dec0e42
25 changed files with 1868 additions and 1406 deletions
|
@ -5,9 +5,10 @@ FROM ubuntu:$UBUNTU_VERSION as build
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y build-essential python3 python3-pip
|
apt-get install -y build-essential python3 python3-pip
|
||||||
|
|
||||||
|
COPY requirements.txt requirements.txt
|
||||||
|
|
||||||
RUN pip install --upgrade pip setuptools wheel \
|
RUN pip install --upgrade pip setuptools wheel \
|
||||||
&& pip install numpy requests sentencepiece tqdm \
|
&& pip install -r requirements.txt
|
||||||
&& pip install torch --index-url https://download.pytorch.org/whl/cpu
|
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
|
|
|
@ -192,10 +192,10 @@ ls ./models
|
||||||
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
|
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
|
||||||
|
|
||||||
# install Python dependencies
|
# install Python dependencies
|
||||||
python3 -m pip install torch numpy sentencepiece
|
python3 -m pip install -r requirements.txt
|
||||||
|
|
||||||
# convert the 7B model to ggml FP16 format
|
# convert the 7B model to ggml FP16 format
|
||||||
python3 convert-pth-to-ggml.py models/7B/ 1
|
python3 convert.py models/7B/
|
||||||
|
|
||||||
# quantize the model to 4-bits (using method 2 = q4_0)
|
# quantize the model to 4-bits (using method 2 = q4_0)
|
||||||
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
|
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
|
||||||
|
|
21
configs/alpaca-native-enhanced.txt
Normal file
21
configs/alpaca-native-enhanced.txt
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
--ctx_size 2048
|
||||||
|
--batch_size 16
|
||||||
|
--repeat_penalty 1.15
|
||||||
|
--temp 0.4
|
||||||
|
--top_k 30
|
||||||
|
--top_p 0.18
|
||||||
|
|
||||||
|
--interactive-first
|
||||||
|
--keep -1
|
||||||
|
|
||||||
|
--ins-prefix-bos
|
||||||
|
--ins-prefix "\n\nUser: "
|
||||||
|
--ins-suffix "\n\nAssistant: "
|
||||||
|
--reverse-prompt "User: "
|
||||||
|
|
||||||
|
-p "You are an AI language model designed to assist the User by answering their questions, offering advice, and engaging in casual conversation in a friendly, helpful, and informative manner. You respond clearly, coherently, and you consider the conversation history.
|
||||||
|
|
||||||
|
User: Hey, how's it going?
|
||||||
|
|
||||||
|
Assistant: Hey there! I'm doing great, thank you. What can I help you with today? Let's have a fun chat!"
|
||||||
|
|
9
configs/alpaca.txt
Normal file
9
configs/alpaca.txt
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
--clean-interface
|
||||||
|
--interactive-first
|
||||||
|
--keep -1
|
||||||
|
--ins-prefix-bos
|
||||||
|
--ins-prefix "\n\n### Instruction:\n\n"
|
||||||
|
--ins-suffix "\n\n### Response:\n\n"
|
||||||
|
--reverse-prompt "### Instruction:\n\n"
|
||||||
|
|
||||||
|
-p "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n"
|
15
configs/chat-with-bob.txt
Normal file
15
configs/chat-with-bob.txt
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
--interactive-first
|
||||||
|
--keep -1
|
||||||
|
--ins-prefix-bos
|
||||||
|
--ins-prefix "\nUser: "
|
||||||
|
--ins-suffix "\nBob: "
|
||||||
|
--reverse-prompt "User: "
|
||||||
|
--rm-trailing-space-workaround
|
||||||
|
|
||||||
|
-p "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
|
||||||
|
|
||||||
|
User: Hello, Bob.
|
||||||
|
Bob: Hello. How may I help you today?
|
||||||
|
User: Please tell me the largest city in Europe.
|
||||||
|
Bob: Sure. The largest city in Europe is Moscow, the capital of Russia."
|
||||||
|
|
3
configs/llama.txt
Normal file
3
configs/llama.txt
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
--interactive-first
|
||||||
|
--keep -1
|
||||||
|
--temp 0.1
|
7
configs/vicuna-simple.txt
Normal file
7
configs/vicuna-simple.txt
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
--interactive-first
|
||||||
|
--keep -1
|
||||||
|
--ins-prefix-bos
|
||||||
|
--ins-prefix "\n### Human: "
|
||||||
|
--ins-suffix "\n### Assistant: "
|
||||||
|
--reverse-prompt "### Human: "
|
||||||
|
--rm-trailing-space-workaround
|
8
configs/vicuna-stop.txt
Normal file
8
configs/vicuna-stop.txt
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
--interactive-first
|
||||||
|
--keep -1
|
||||||
|
--ins-prefix-bos
|
||||||
|
--ins-prefix "\n### Human: "
|
||||||
|
--ins-suffix "\n### Assistant: "
|
||||||
|
--reverse-prompt "### Human: "
|
||||||
|
--stop-prompt "### Assistant: "
|
||||||
|
--rm-trailing-space-workaround
|
9
configs/vicuna.txt
Normal file
9
configs/vicuna.txt
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
--interactive-first
|
||||||
|
--keep -1
|
||||||
|
--ins-prefix-bos
|
||||||
|
--ins-prefix "\n### Human: "
|
||||||
|
--ins-suffix "\n### Assistant: "
|
||||||
|
--reverse-prompt "### Human: "
|
||||||
|
--rm-trailing-space-workaround
|
||||||
|
|
||||||
|
-p "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
|
|
@ -1,299 +0,0 @@
|
||||||
# Author: github.com/ductai199x
|
|
||||||
import argparse
|
|
||||||
import os
|
|
||||||
import struct
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
from numba import njit
|
|
||||||
from tqdm.auto import tqdm
|
|
||||||
|
|
||||||
|
|
||||||
def read_header(fin):
|
|
||||||
values = struct.unpack("i" * 9, fin.read(4 * 9))
|
|
||||||
_, _, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = values
|
|
||||||
return {
|
|
||||||
"vocab_size": vocab_size,
|
|
||||||
"dim": dim,
|
|
||||||
"multiple_of": multiple_of,
|
|
||||||
"n_heads": n_heads,
|
|
||||||
"n_layers": n_layers,
|
|
||||||
}, ftype
|
|
||||||
|
|
||||||
|
|
||||||
def read_tokens(fin, vocab_size):
|
|
||||||
tokens = []
|
|
||||||
for _ in range(vocab_size):
|
|
||||||
text_len = struct.unpack("i", fin.read(4))[0]
|
|
||||||
text_bytes = fin.read(text_len)
|
|
||||||
try:
|
|
||||||
text = text_bytes.decode()
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
text = text_bytes.decode(errors="replace")
|
|
||||||
score = struct.unpack("f", fin.read(4))[0]
|
|
||||||
tokens.append((text, score))
|
|
||||||
return tokens
|
|
||||||
|
|
||||||
|
|
||||||
@njit
|
|
||||||
def dequantize_weights_numba(fin_data, n_rows, n_cols):
|
|
||||||
qk = 32
|
|
||||||
nb = n_cols // qk
|
|
||||||
bs = 4 + (qk // 2)
|
|
||||||
|
|
||||||
weights = np.zeros((n_rows, n_cols), dtype=np.float32)
|
|
||||||
data_pos = 0
|
|
||||||
|
|
||||||
for row in range(n_rows):
|
|
||||||
for block in range(nb):
|
|
||||||
d = np.frombuffer(fin_data[data_pos : data_pos + 4], dtype=np.float32)[0]
|
|
||||||
data_pos += 4
|
|
||||||
packed_values = fin_data[data_pos : data_pos + (qk // 2)]
|
|
||||||
data_pos += qk // 2
|
|
||||||
|
|
||||||
for i in range(qk // 2):
|
|
||||||
packed_value = packed_values[i]
|
|
||||||
v0 = np.float32((packed_value & 0b00001111) - 8) * d
|
|
||||||
v1 = np.float32((packed_value >> 4) - 8) * d
|
|
||||||
|
|
||||||
weights[row, block * qk + 2 * i] = v0
|
|
||||||
weights[row, block * qk + 2 * i + 1] = v1
|
|
||||||
|
|
||||||
return weights
|
|
||||||
|
|
||||||
|
|
||||||
def dequantize_weights(fin, n_rows, n_cols):
|
|
||||||
qk = 32
|
|
||||||
nb = n_cols // qk
|
|
||||||
data_size = n_rows * n_cols // 2 + n_rows * nb * 4
|
|
||||||
fin_data = fin.read(data_size)
|
|
||||||
return dequantize_weights_numba(fin_data, n_rows, n_cols)
|
|
||||||
|
|
||||||
|
|
||||||
def read_variables(fin):
|
|
||||||
model = {}
|
|
||||||
pbar = tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables")
|
|
||||||
while True:
|
|
||||||
start_pos = fin.tell()
|
|
||||||
try:
|
|
||||||
n_dims, name_length, ftype_cur = struct.unpack("iii", fin.read(4 * 3))
|
|
||||||
except struct.error:
|
|
||||||
break
|
|
||||||
|
|
||||||
shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
|
|
||||||
shape = shape[::-1]
|
|
||||||
name = fin.read(name_length).decode()
|
|
||||||
|
|
||||||
# ensure tensor data is aligned
|
|
||||||
tensor_data_offset = fin.tell()
|
|
||||||
tensor_data_offset = (tensor_data_offset + 31) & -32
|
|
||||||
fin.seek(tensor_data_offset)
|
|
||||||
|
|
||||||
if ftype_cur == 2:
|
|
||||||
# 4-bit quantized weights
|
|
||||||
dtype = np.uint8
|
|
||||||
data = dequantize_weights(fin, shape[0], shape[1])
|
|
||||||
data = data.reshape(shape)
|
|
||||||
elif ftype_cur == 0:
|
|
||||||
dtype = np.float32
|
|
||||||
data_size = np.prod(shape)
|
|
||||||
data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
|
|
||||||
elif ftype_cur == 1:
|
|
||||||
dtype = np.float16
|
|
||||||
data_size = np.prod(shape)
|
|
||||||
data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
|
|
||||||
|
|
||||||
model[name] = torch.tensor(data, dtype=torch.float32 if dtype == np.float32 else torch.float16)
|
|
||||||
|
|
||||||
pbar.update(fin.tell() - start_pos)
|
|
||||||
|
|
||||||
return model
|
|
||||||
|
|
||||||
|
|
||||||
def convert_to_hf_format(model, hparams):
|
|
||||||
# This works for llama 7B, need to test with other models
|
|
||||||
n_layers = hparams["n_layers"]
|
|
||||||
n_heads = hparams["n_heads"]
|
|
||||||
dim = hparams["dim"]
|
|
||||||
dims_per_head = dim // n_heads
|
|
||||||
base = 10000.0
|
|
||||||
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
|
|
||||||
|
|
||||||
# permute for sliced rotary
|
|
||||||
def permute(w):
|
|
||||||
return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
|
|
||||||
|
|
||||||
state_dict = {}
|
|
||||||
for layer_i in range(n_layers):
|
|
||||||
state_dict.update(
|
|
||||||
{
|
|
||||||
f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
|
|
||||||
model[f"layers.{layer_i}.attention.wq.weight"]
|
|
||||||
),
|
|
||||||
f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
|
|
||||||
model[f"layers.{layer_i}.attention.wk.weight"]
|
|
||||||
),
|
|
||||||
f"model.layers.{layer_i}.self_attn.v_proj.weight": model[
|
|
||||||
f"layers.{layer_i}.attention.wv.weight"
|
|
||||||
],
|
|
||||||
f"model.layers.{layer_i}.self_attn.o_proj.weight": model[
|
|
||||||
f"layers.{layer_i}.attention.wo.weight"
|
|
||||||
],
|
|
||||||
f"model.layers.{layer_i}.mlp.gate_proj.weight": model[
|
|
||||||
f"layers.{layer_i}.feed_forward.w1.weight"
|
|
||||||
],
|
|
||||||
f"model.layers.{layer_i}.mlp.down_proj.weight": model[
|
|
||||||
f"layers.{layer_i}.feed_forward.w2.weight"
|
|
||||||
],
|
|
||||||
f"model.layers.{layer_i}.mlp.up_proj.weight": model[
|
|
||||||
f"layers.{layer_i}.feed_forward.w3.weight"
|
|
||||||
],
|
|
||||||
f"model.layers.{layer_i}.input_layernorm.weight": model[
|
|
||||||
f"layers.{layer_i}.attention_norm.weight"
|
|
||||||
],
|
|
||||||
f"model.layers.{layer_i}.post_attention_layernorm.weight": model[
|
|
||||||
f"layers.{layer_i}.ffn_norm.weight"
|
|
||||||
],
|
|
||||||
}
|
|
||||||
)
|
|
||||||
state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
|
|
||||||
state_dict.update(
|
|
||||||
{
|
|
||||||
"model.embed_tokens.weight": model["tok_embeddings.weight"],
|
|
||||||
"model.norm.weight": model["norm.weight"],
|
|
||||||
"lm_head.weight": model["output.weight"],
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
return state_dict
|
|
||||||
|
|
||||||
|
|
||||||
def chat(model, hparams, llama_dir):
|
|
||||||
from transformers import (GenerationConfig, LlamaForCausalLM,
|
|
||||||
LlamaTokenizer, StoppingCriteria,
|
|
||||||
StoppingCriteriaList)
|
|
||||||
from transformers.models.llama.configuration_llama import LlamaConfig
|
|
||||||
|
|
||||||
class StoppingCriteriaSub(StoppingCriteria):
|
|
||||||
def __init__(self):
|
|
||||||
super().__init__()
|
|
||||||
|
|
||||||
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
|
|
||||||
print(tokenizer.decode(input_ids[0]), end="", flush=True)
|
|
||||||
if input_ids[0][-1] == 13:
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
config = LlamaConfig(
|
|
||||||
vocab_size=hparams["vocab_size"],
|
|
||||||
dim=hparams["dim"],
|
|
||||||
num_hidden_layers=hparams["n_layers"],
|
|
||||||
num_attention_heads=hparams["n_heads"],
|
|
||||||
)
|
|
||||||
|
|
||||||
llama = LlamaForCausalLM(config=config)
|
|
||||||
llama.load_state_dict(state_dict=model, strict=True)
|
|
||||||
tokenizer = LlamaTokenizer.from_pretrained(llama_dir)
|
|
||||||
|
|
||||||
device = torch.device("cpu")
|
|
||||||
llama = llama.to(device)
|
|
||||||
|
|
||||||
ctx = """You are AI.
|
|
||||||
This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
|
|
||||||
User: Hello, AI.
|
|
||||||
AI: Hello! How can I assist you today?
|
|
||||||
"""
|
|
||||||
print(ctx.rstrip("\n"))
|
|
||||||
while True:
|
|
||||||
print("-" * 60)
|
|
||||||
prompt = input("User: ")
|
|
||||||
if ctx != "":
|
|
||||||
ctx = f"{ctx}User: {prompt}\n"
|
|
||||||
else:
|
|
||||||
ctx = f"{prompt}\nAI:"
|
|
||||||
|
|
||||||
ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
|
|
||||||
|
|
||||||
print("-" * 60)
|
|
||||||
if len(ctx.strip()) > 0:
|
|
||||||
input_ids = tokenizer(ctx, return_tensors="pt")["input_ids"].to(device)
|
|
||||||
generation_config = GenerationConfig(
|
|
||||||
temperature=0.8,
|
|
||||||
top_p=0.95,
|
|
||||||
top_k=50,
|
|
||||||
repetition_penalty=1.1764,
|
|
||||||
)
|
|
||||||
with torch.no_grad():
|
|
||||||
generation_output = llama.generate(
|
|
||||||
input_ids=input_ids,
|
|
||||||
generation_config=generation_config,
|
|
||||||
return_dict_in_generate=True,
|
|
||||||
output_scores=True,
|
|
||||||
max_length=2048,
|
|
||||||
do_sample=True,
|
|
||||||
stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub()]),
|
|
||||||
)
|
|
||||||
s = generation_output.sequences[0]
|
|
||||||
decoded = tokenizer.decode(s)
|
|
||||||
ctx = f"{decoded}\n"
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--prefix",
|
|
||||||
"-p",
|
|
||||||
type=str,
|
|
||||||
required=True,
|
|
||||||
help="The prefix of the ggml files (ggml-model-f16 or ggml-model-q4_0).",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--hf",
|
|
||||||
action="store_true",
|
|
||||||
help="Whether to save the model in the Hugging Face format. (default: False)",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
llama_dir = os.path.abspath(f"{args.input_dir}/../")
|
|
||||||
|
|
||||||
ggml_files = sorted(
|
|
||||||
[f"{args.input_dir}/{f}" for f in os.listdir(args.input_dir) if f.startswith(args.prefix)]
|
|
||||||
)
|
|
||||||
|
|
||||||
fin = open(ggml_files[0], "rb")
|
|
||||||
hparams, ftype = read_header(fin)
|
|
||||||
tokens = read_tokens(fin, hparams["vocab_size"])
|
|
||||||
model = read_variables(fin)
|
|
||||||
|
|
||||||
for f in tqdm(ggml_files[1:]):
|
|
||||||
fin = open(f, "rb")
|
|
||||||
read_header(fin)
|
|
||||||
read_tokens(fin, hparams["vocab_size"])
|
|
||||||
model.update(read_variables(fin))
|
|
||||||
|
|
||||||
if args.hf:
|
|
||||||
model = convert_to_hf_format(model, hparams)
|
|
||||||
|
|
||||||
pth_ckpt = {
|
|
||||||
"state_dict": model,
|
|
||||||
"hparams": hparams,
|
|
||||||
"tokens": tokens,
|
|
||||||
}
|
|
||||||
|
|
||||||
torch.save(pth_ckpt, f"{args.input_dir}/{args.prefix}-to-torch.pth")
|
|
||||||
|
|
||||||
if args.chat:
|
|
||||||
if not args.hf:
|
|
||||||
model = convert_to_hf_format(model, hparams)
|
|
||||||
chat(model, hparams, llama_dir)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
|
@ -1,107 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
|
|
||||||
#
|
|
||||||
# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
|
|
||||||
#
|
|
||||||
|
|
||||||
# Original by https://github.com/eiz
|
|
||||||
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
|
|
||||||
import argparse
|
|
||||||
import glob
|
|
||||||
import os
|
|
||||||
import struct
|
|
||||||
import sys
|
|
||||||
from sentencepiece import SentencePieceProcessor
|
|
||||||
|
|
||||||
HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
|
|
||||||
|
|
||||||
def parse_args():
|
|
||||||
parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
|
|
||||||
parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
|
|
||||||
parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
def read_header(f_in):
|
|
||||||
struct_fmt = "i" * (3 + len(HPARAMS))
|
|
||||||
struct_size = struct.calcsize(struct_fmt)
|
|
||||||
buf = f_in.read(struct_size)
|
|
||||||
return struct.unpack(struct_fmt, buf)
|
|
||||||
|
|
||||||
def write_header(f_out, header):
|
|
||||||
(magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
|
|
||||||
|
|
||||||
if magic != 0x67676d6c:
|
|
||||||
raise Exception('Invalid file magic. Must be an old style ggml file.')
|
|
||||||
|
|
||||||
values = [
|
|
||||||
0x67676d66, # magic: ggml in hex
|
|
||||||
1, # file version
|
|
||||||
vocab_size,
|
|
||||||
dim,
|
|
||||||
multiple_of,
|
|
||||||
n_heads,
|
|
||||||
n_layers,
|
|
||||||
rot,
|
|
||||||
ftype
|
|
||||||
]
|
|
||||||
f_out.write(struct.pack("i" * len(values), *values))
|
|
||||||
|
|
||||||
def write_tokens(fout, tokenizer):
|
|
||||||
for i in range(tokenizer.vocab_size()):
|
|
||||||
if tokenizer.is_unknown(i):
|
|
||||||
text = " \u2047 ".encode()
|
|
||||||
elif tokenizer.is_control(i):
|
|
||||||
text = b""
|
|
||||||
elif tokenizer.is_byte(i):
|
|
||||||
piece = tokenizer.id_to_piece(i)
|
|
||||||
if len(piece) != 6:
|
|
||||||
print(f"Invalid token: {piece}")
|
|
||||||
sys.exit(1)
|
|
||||||
byte_value = int(piece[3:-1], 16)
|
|
||||||
text = struct.pack("B", byte_value)
|
|
||||||
else:
|
|
||||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
|
|
||||||
fout.write(struct.pack("i", len(text)))
|
|
||||||
fout.write(text)
|
|
||||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
|
||||||
|
|
||||||
# TODO: GPT4All - add extra <pad> token
|
|
||||||
text = "<pad>".encode()
|
|
||||||
fout.write(struct.pack("i", len(text)))
|
|
||||||
fout.write(text)
|
|
||||||
fout.write(struct.pack("f", 0.0))
|
|
||||||
|
|
||||||
def read_tokens(f_in, tokenizer):
|
|
||||||
for i in range(tokenizer.vocab_size()):
|
|
||||||
len_b = f_in.read(4)
|
|
||||||
(length,) = struct.unpack("i", len_b)
|
|
||||||
f_in.read(length)
|
|
||||||
|
|
||||||
def copy_all_data(f_out, f_in):
|
|
||||||
while True:
|
|
||||||
buf = f_in.read(1024 * 1024)
|
|
||||||
if not buf:
|
|
||||||
break
|
|
||||||
f_out.write(buf)
|
|
||||||
|
|
||||||
def convert_one_file(path_in, tokenizer):
|
|
||||||
path_tmp = f"{path_in}.tmp"
|
|
||||||
path_orig= f"{path_in}.orig"
|
|
||||||
print(f"converting {path_in}")
|
|
||||||
with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
|
|
||||||
write_header(f_out, read_header(f_in))
|
|
||||||
read_tokens(f_in, tokenizer)
|
|
||||||
write_tokens(f_out, tokenizer)
|
|
||||||
copy_all_data(f_out, f_in)
|
|
||||||
os.rename(path_in, path_orig)
|
|
||||||
os.rename(path_tmp, path_in)
|
|
||||||
|
|
||||||
def main():
|
|
||||||
args = parse_args()
|
|
||||||
|
|
||||||
tokenizer = SentencePieceProcessor(args.tokenizer_model)
|
|
||||||
|
|
||||||
convert_one_file(args.gpt4all_model, tokenizer)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
|
@ -1,172 +0,0 @@
|
||||||
# Convert a GPTQ quantized LLaMA model to a ggml compatible file
|
|
||||||
# Based on: https://github.com/qwopqwop200/GPTQ-for-LLaMa
|
|
||||||
#
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import json
|
|
||||||
import struct
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
from sentencepiece import SentencePieceProcessor
|
|
||||||
|
|
||||||
if len(sys.argv) != 4:
|
|
||||||
print("Usage: convert-gptq-to-ggml.py llamaXXb-4bit.pt tokenizer.model out.bin\n")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
fname_model = sys.argv[1]
|
|
||||||
fname_tokenizer = sys.argv[2]
|
|
||||||
dir_out = sys.argv[3]
|
|
||||||
|
|
||||||
model = torch.load(fname_model, map_location="cpu")
|
|
||||||
|
|
||||||
n_vocab, n_embd = model['model.embed_tokens.weight'].shape
|
|
||||||
n_layer = 1 + max(int(m.group(1)) for name in model
|
|
||||||
if (m := re.match(r'model\.layers\.([0-9]+)', name)))
|
|
||||||
|
|
||||||
# hardcoded:
|
|
||||||
n_mult = 256
|
|
||||||
n_head = {32: 32, 40: 40, 60: 52, 80: 64}[n_layer]
|
|
||||||
|
|
||||||
tokenizer = SentencePieceProcessor(fname_tokenizer)
|
|
||||||
|
|
||||||
assert tokenizer.vocab_size() == n_vocab
|
|
||||||
|
|
||||||
fname_out = sys.argv[3]
|
|
||||||
|
|
||||||
fout = open(fname_out, "wb")
|
|
||||||
|
|
||||||
fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
|
|
||||||
fout.write(struct.pack("i", 1)) # file version
|
|
||||||
fout.write(struct.pack("i", n_vocab))
|
|
||||||
fout.write(struct.pack("i", n_embd))
|
|
||||||
fout.write(struct.pack("i", n_mult))
|
|
||||||
fout.write(struct.pack("i", n_head))
|
|
||||||
fout.write(struct.pack("i", n_layer))
|
|
||||||
fout.write(struct.pack("i", n_embd // n_head)) # rot (obsolete)
|
|
||||||
fout.write(struct.pack("i", 4))
|
|
||||||
|
|
||||||
|
|
||||||
# This loop unchanged from convert-pth-to-ggml.py:
|
|
||||||
for i in range(tokenizer.vocab_size()):
|
|
||||||
if tokenizer.is_unknown(i):
|
|
||||||
text = " \u2047 ".encode()
|
|
||||||
elif tokenizer.is_control(i):
|
|
||||||
text = b""
|
|
||||||
elif tokenizer.is_byte(i):
|
|
||||||
piece = tokenizer.id_to_piece(i)
|
|
||||||
if len(piece) != 6:
|
|
||||||
print(f"Invalid token: {piece}")
|
|
||||||
sys.exit(1)
|
|
||||||
byte_value = int(piece[3:-1], 16)
|
|
||||||
text = struct.pack("B", byte_value)
|
|
||||||
else:
|
|
||||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
|
|
||||||
fout.write(struct.pack("i", len(text)))
|
|
||||||
fout.write(text)
|
|
||||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
|
||||||
|
|
||||||
def write_header(shape, dst_name, ftype_cur):
|
|
||||||
sname = dst_name.encode()
|
|
||||||
fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
|
|
||||||
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
|
|
||||||
fout.write(sname)
|
|
||||||
|
|
||||||
# ensure tensor data is aligned
|
|
||||||
tensor_data_offset = fout.tell()
|
|
||||||
tensor_data_offset = (tensor_data_offset + 31) & -32
|
|
||||||
fout.seek(tensor_data_offset)
|
|
||||||
|
|
||||||
def convert_non_q4(src_name, dst_name):
|
|
||||||
v = model[src_name]
|
|
||||||
shape = v.shape
|
|
||||||
print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {v.dtype}")
|
|
||||||
if len(shape) == 1:
|
|
||||||
print(" Converting to float32")
|
|
||||||
v = v.to(torch.float32)
|
|
||||||
|
|
||||||
ftype_cur = {torch.float16: 1, torch.float32: 0}[v.dtype]
|
|
||||||
|
|
||||||
# header
|
|
||||||
write_header(shape, dst_name, ftype_cur)
|
|
||||||
|
|
||||||
# data
|
|
||||||
v.numpy().tofile(fout)
|
|
||||||
|
|
||||||
def convert_q4(src_name, dst_name, permute=False):
|
|
||||||
zeros = model[f"{src_name}.zeros"].numpy()
|
|
||||||
scales = model[f"{src_name}.scales"].numpy()
|
|
||||||
bias = model[f"{src_name}.bias"].numpy()
|
|
||||||
qweight = model[f"{src_name}.qweight"].numpy().T # transpose
|
|
||||||
|
|
||||||
# Q4_1 does not support bias; good thing the bias is always all zeros.
|
|
||||||
assert not np.any(bias)
|
|
||||||
|
|
||||||
# Each int32 item is actually 8 int4 items packed together, and it's transposed.
|
|
||||||
shape = (qweight.shape[0], qweight.shape[1] * 8)
|
|
||||||
|
|
||||||
print(f"Processing Q4 variable: {src_name} with shape: {shape}")
|
|
||||||
|
|
||||||
# The output format has the int4 weights in groups of 32 rather than 8.
|
|
||||||
# It looks like this:
|
|
||||||
# For each row:
|
|
||||||
# For each group of 32 columns:
|
|
||||||
# - addend (float32, 4 bytes)
|
|
||||||
# - scale (float32, 4 bytes)
|
|
||||||
# - weights (int4 * 32, 16 bytes)
|
|
||||||
# Note that in the input, the scales and addends are shared between all
|
|
||||||
# the columns in a row, so we end up wasting quite a bit of memory with
|
|
||||||
# repeated scales and addends.
|
|
||||||
|
|
||||||
addends = -zeros # flip sign
|
|
||||||
|
|
||||||
# Since the output format is mixed between integers and floats, we have
|
|
||||||
# to hackily view the floats as int32s just so numpy will let us
|
|
||||||
# concatenate them.
|
|
||||||
addends_view = addends.view(dtype=np.int32)
|
|
||||||
scales_view = scales.view(dtype=np.int32)
|
|
||||||
|
|
||||||
# Split into groups of 4 columns (i.e. 32 columns of quantized data):
|
|
||||||
grouped = qweight.reshape([qweight.shape[0], qweight.shape[1] // 4, 4])
|
|
||||||
|
|
||||||
# Repeat addends and scales:
|
|
||||||
addends_rep = np.atleast_3d(addends_view).repeat(grouped.shape[1], axis=1)
|
|
||||||
scales_rep = np.atleast_3d(scales_view).repeat(grouped.shape[1], axis=1)
|
|
||||||
|
|
||||||
blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting='no')
|
|
||||||
|
|
||||||
if permute:
|
|
||||||
# Permute some rows to undo the permutation done by convert_llama_weights_to_hf.py.
|
|
||||||
# This can be done after the above conversion because it doesn't affect column order/layout.
|
|
||||||
blob = (blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
|
|
||||||
.swapaxes(1, 2)
|
|
||||||
.reshape(blob.shape))
|
|
||||||
|
|
||||||
# header
|
|
||||||
write_header(shape, dst_name, 3) # ftype = Q4_1
|
|
||||||
|
|
||||||
# data
|
|
||||||
blob.tofile(fout)
|
|
||||||
|
|
||||||
convert_non_q4("model.embed_tokens.weight", "tok_embeddings.weight")
|
|
||||||
convert_non_q4("model.norm.weight", "norm.weight")
|
|
||||||
convert_non_q4("lm_head.weight", "output.weight")
|
|
||||||
|
|
||||||
for i in range(n_layer):
|
|
||||||
convert_q4(f"model.layers.{i}.self_attn.q_proj", f"layers.{i}.attention.wq.weight", permute=True)
|
|
||||||
convert_q4(f"model.layers.{i}.self_attn.k_proj", f"layers.{i}.attention.wk.weight", permute=True)
|
|
||||||
convert_q4(f"model.layers.{i}.self_attn.v_proj", f"layers.{i}.attention.wv.weight")
|
|
||||||
convert_q4(f"model.layers.{i}.self_attn.o_proj", f"layers.{i}.attention.wo.weight")
|
|
||||||
|
|
||||||
convert_q4(f"model.layers.{i}.mlp.gate_proj", f"layers.{i}.feed_forward.w1.weight")
|
|
||||||
convert_q4(f"model.layers.{i}.mlp.down_proj", f"layers.{i}.feed_forward.w2.weight")
|
|
||||||
convert_q4(f"model.layers.{i}.mlp.up_proj", f"layers.{i}.feed_forward.w3.weight")
|
|
||||||
|
|
||||||
convert_non_q4(f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight")
|
|
||||||
convert_non_q4(f"model.layers.{i}.post_attention_layernorm.weight", f"layers.{i}.ffn_norm.weight")
|
|
||||||
|
|
||||||
|
|
||||||
fout.close()
|
|
||||||
|
|
||||||
print(f"Done. Output file: {fname_out}")
|
|
||||||
print()
|
|
|
@ -1,274 +1,11 @@
|
||||||
# Convert a LLaMA model checkpoint to a ggjt compatible file
|
# Compatibility stub
|
||||||
#
|
|
||||||
# Load the model using Torch
|
|
||||||
# Iterate over all variables and write them to a binary file.
|
|
||||||
#
|
|
||||||
# For each variable, write the following:
|
|
||||||
# - Number of dimensions (int)
|
|
||||||
# - Name length (int)
|
|
||||||
# - Dimensions (int[n_dims])
|
|
||||||
# - Name (char[name_length])
|
|
||||||
# - Data (float[n_dims])
|
|
||||||
#
|
|
||||||
# At the start of the ggml file we write the model parameters
|
|
||||||
# and vocabulary.
|
|
||||||
#
|
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import json
|
|
||||||
import struct
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from sentencepiece import SentencePieceProcessor
|
import convert
|
||||||
|
|
||||||
QK = 32
|
parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
|
||||||
|
parser.add_argument('dir_model', help='directory containing the model checkpoint')
|
||||||
GGML_TYPE_Q4_0 = 0
|
parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
|
||||||
GGML_TYPE_Q4_1 = 1
|
args = parser.parse_args()
|
||||||
GGML_TYPE_I8 = 2
|
convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])
|
||||||
GGML_TYPE_I16 = 3
|
|
||||||
GGML_TYPE_I32 = 4
|
|
||||||
GGML_TYPE_F16 = 5
|
|
||||||
GGML_TYPE_F32 = 6
|
|
||||||
|
|
||||||
WTYPES = {
|
|
||||||
0: GGML_TYPE_F32,
|
|
||||||
1: GGML_TYPE_F16,
|
|
||||||
2: GGML_TYPE_Q4_0,
|
|
||||||
3: GGML_TYPE_Q4_1,
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_BLCK_SIZE = {
|
|
||||||
GGML_TYPE_Q4_0: QK,
|
|
||||||
GGML_TYPE_Q4_1: QK,
|
|
||||||
GGML_TYPE_I8: 1,
|
|
||||||
GGML_TYPE_I16: 1,
|
|
||||||
GGML_TYPE_I32: 1,
|
|
||||||
GGML_TYPE_F16: 1,
|
|
||||||
GGML_TYPE_F32: 1,
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_TYPE_SIZE = {
|
|
||||||
GGML_TYPE_Q4_0: 4 + QK//2,
|
|
||||||
GGML_TYPE_Q4_1: 4*2 + QK//2,
|
|
||||||
GGML_TYPE_I8: 1,
|
|
||||||
GGML_TYPE_I16: 2,
|
|
||||||
GGML_TYPE_I32: 4,
|
|
||||||
GGML_TYPE_F16: 2,
|
|
||||||
GGML_TYPE_F32: 4,
|
|
||||||
}
|
|
||||||
|
|
||||||
def ggml_nelements(shape):
|
|
||||||
r = 1
|
|
||||||
for i in shape:
|
|
||||||
r *= i
|
|
||||||
return r
|
|
||||||
|
|
||||||
def ggml_nbytes(shape, ftype):
|
|
||||||
x = ggml_nelements(shape)
|
|
||||||
t = WTYPES[ftype]
|
|
||||||
x *= GGML_TYPE_SIZE[t]
|
|
||||||
x //= GGML_BLCK_SIZE[t]
|
|
||||||
return x
|
|
||||||
|
|
||||||
def parse_args():
|
|
||||||
parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
|
|
||||||
parser.add_argument('dir_model', help='directory containing the model checkpoint')
|
|
||||||
parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
|
|
||||||
parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
def get_n_parts(dim):
|
|
||||||
mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
|
|
||||||
n_parts = mappings.get(dim)
|
|
||||||
if n_parts is None:
|
|
||||||
print(f"Invalid dim: {dim}")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
print(f"n_parts = {n_parts}\n")
|
|
||||||
return n_parts
|
|
||||||
|
|
||||||
def load_hparams_and_tokenizer(dir_model):
|
|
||||||
# `dir_model` is something like `models/7B` or `models/7B/`.
|
|
||||||
# "tokenizer.model" is expected under model's parent dir.
|
|
||||||
# When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
|
|
||||||
# Let's use the model's parent dir directly.
|
|
||||||
model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
|
|
||||||
fname_hparams = f"{dir_model}/params.json"
|
|
||||||
fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
|
|
||||||
with open(fname_hparams, "r") as f:
|
|
||||||
hparams = json.load(f)
|
|
||||||
print(hparams)
|
|
||||||
tokenizer = SentencePieceProcessor(fname_tokenizer)
|
|
||||||
hparams.update({"vocab_size": tokenizer.vocab_size()})
|
|
||||||
return hparams, tokenizer
|
|
||||||
|
|
||||||
def write_header(fout, hparams, ftype):
|
|
||||||
keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
|
|
||||||
values = [
|
|
||||||
0x67676a74, # magic: ggjt in hex
|
|
||||||
1, # file version
|
|
||||||
*[hparams[key] for key in keys],
|
|
||||||
hparams["dim"] // hparams["n_heads"], # rot (obsolete)
|
|
||||||
ftype
|
|
||||||
]
|
|
||||||
fout.write(struct.pack("i" * len(values), *values))
|
|
||||||
|
|
||||||
def write_tokens(fout, tokenizer):
|
|
||||||
for i in range(tokenizer.vocab_size()):
|
|
||||||
if tokenizer.is_unknown(i):
|
|
||||||
text = " \u2047 ".encode()
|
|
||||||
elif tokenizer.is_control(i):
|
|
||||||
text = b""
|
|
||||||
elif tokenizer.is_byte(i):
|
|
||||||
piece = tokenizer.id_to_piece(i)
|
|
||||||
if len(piece) != 6:
|
|
||||||
print(f"Invalid token: {piece}")
|
|
||||||
sys.exit(1)
|
|
||||||
byte_value = int(piece[3:-1], 16)
|
|
||||||
text = struct.pack("B", byte_value)
|
|
||||||
else:
|
|
||||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
|
|
||||||
fout.write(struct.pack("i", len(text)))
|
|
||||||
fout.write(text)
|
|
||||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
|
||||||
|
|
||||||
def process_and_write_variables(fout, model, ftype, part_id, n_parts):
|
|
||||||
for name, datao in model.items():
|
|
||||||
if name.endswith("freqs"):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# remove dimensions with a single element
|
|
||||||
data = datao.numpy().squeeze()
|
|
||||||
partshape = data.shape
|
|
||||||
n_dims = len(data.shape)
|
|
||||||
assert n_dims in (1, 2)
|
|
||||||
|
|
||||||
print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")
|
|
||||||
|
|
||||||
# coerce single-dimensional tensors from float16 to float32
|
|
||||||
ftype_cur = 1
|
|
||||||
if ftype == 0 or n_dims == 1:
|
|
||||||
print(" Converting to float32")
|
|
||||||
data = data.astype(np.float32)
|
|
||||||
ftype_cur = 0
|
|
||||||
blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
|
|
||||||
type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
|
|
||||||
|
|
||||||
# determine dimension along which multipart tensor is sharded
|
|
||||||
#
|
|
||||||
# split_dim 0 regex:
|
|
||||||
# - output.*
|
|
||||||
# - layers.*.attention.wq.weight
|
|
||||||
# - layers.*.attention.wk.weight
|
|
||||||
# - layers.*.attention.wv.weight
|
|
||||||
# - layers.*.feed_forward.w1.weight
|
|
||||||
# - layers.*.feed_forward.w3.weight
|
|
||||||
#
|
|
||||||
# split_dim 1 regex:
|
|
||||||
# - tok_embeddings.*
|
|
||||||
# - layers.*.attention.wo.weight
|
|
||||||
# - layers.*.feed_forward.w2.weight
|
|
||||||
#
|
|
||||||
if n_dims > 1:
|
|
||||||
split_dim = 1
|
|
||||||
if "tok_embeddings" in name:
|
|
||||||
split_dim = 1
|
|
||||||
elif "layers" in name:
|
|
||||||
if "attention.wo.weight" in name:
|
|
||||||
split_dim = 1
|
|
||||||
elif "feed_forward.w2.weight" in name:
|
|
||||||
split_dim = 1
|
|
||||||
else:
|
|
||||||
split_dim = 0
|
|
||||||
elif "output" in name:
|
|
||||||
split_dim = 0
|
|
||||||
|
|
||||||
# output tensor header
|
|
||||||
fullshape = list(partshape)
|
|
||||||
if n_dims > 1:
|
|
||||||
fullshape[split_dim] *= n_parts
|
|
||||||
sname = name.encode()
|
|
||||||
fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
|
|
||||||
for dim in reversed(fullshape):
|
|
||||||
fout.write(struct.pack("i", dim))
|
|
||||||
fout.write(sname)
|
|
||||||
|
|
||||||
# ensure tensor data is aligned
|
|
||||||
tensor_data_offset = fout.tell()
|
|
||||||
while tensor_data_offset % QK != 0:
|
|
||||||
fout.write(struct.pack("B", 0))
|
|
||||||
tensor_data_offset += 1
|
|
||||||
|
|
||||||
# output unified mappable tensor data
|
|
||||||
if n_dims == 1 or n_parts == 1:
|
|
||||||
# copy tensor which we thankfully received in one piece
|
|
||||||
if part_id == 0:
|
|
||||||
data.tofile(fout)
|
|
||||||
elif split_dim == 0:
|
|
||||||
# reassemble multifile tensor containing some of the rows
|
|
||||||
rows_per_chunk = partshape[0]
|
|
||||||
current_row = part_id * rows_per_chunk
|
|
||||||
bytes_per_row = fullshape[1] // blck_size * type_size
|
|
||||||
offset = current_row * bytes_per_row
|
|
||||||
fout.seek(tensor_data_offset + offset)
|
|
||||||
data.tofile(fout)
|
|
||||||
elif split_dim == 1:
|
|
||||||
# reassemble multifile tensor containing some of the cols
|
|
||||||
cols_per_chunk = partshape[1]
|
|
||||||
current_col = part_id * cols_per_chunk
|
|
||||||
bytes_per_row = fullshape[1] // blck_size * type_size
|
|
||||||
offset_current_col = current_col // blck_size * type_size
|
|
||||||
for row in range(partshape[0]):
|
|
||||||
offset_row = row * bytes_per_row
|
|
||||||
offset = offset_row + offset_current_col
|
|
||||||
fout.seek(tensor_data_offset + offset)
|
|
||||||
data[row].tofile(fout)
|
|
||||||
|
|
||||||
# advance file position to next tensor
|
|
||||||
fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))
|
|
||||||
|
|
||||||
def main():
|
|
||||||
args = parse_args()
|
|
||||||
dir_model = args.dir_model
|
|
||||||
ftype = args.ftype
|
|
||||||
ftype_str = ["f32", "f16"]
|
|
||||||
hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
|
|
||||||
|
|
||||||
print(args)
|
|
||||||
|
|
||||||
# if only writing vocab to file
|
|
||||||
if args.vocab_only:
|
|
||||||
fname_model = f"{dir_model}/consolidated.00.pth"
|
|
||||||
fname_out = f"{dir_model}/ggml-vocab.bin"
|
|
||||||
print(f"Extracting only the vocab from '{fname_model}'\n")
|
|
||||||
with open(fname_out, "wb") as fout:
|
|
||||||
write_header(fout, hparams, ftype)
|
|
||||||
write_tokens(fout, tokenizer)
|
|
||||||
print(f"Done. Output file: {fname_out}\n")
|
|
||||||
return
|
|
||||||
|
|
||||||
n_parts = get_n_parts(hparams["dim"])
|
|
||||||
fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"
|
|
||||||
|
|
||||||
# we output a single file for ggml
|
|
||||||
with open(fname_out, "wb") as fout:
|
|
||||||
write_header(fout, hparams, ftype)
|
|
||||||
write_tokens(fout, tokenizer)
|
|
||||||
offset_of_tensors = fout.tell()
|
|
||||||
# the tensors we load could be split across multiple files
|
|
||||||
for part_id in range(n_parts):
|
|
||||||
fout.seek(offset_of_tensors)
|
|
||||||
print(f"Processing part {part_id+1} of {n_parts}\n")
|
|
||||||
fname_model = f"{dir_model}/consolidated.0{part_id}.pth"
|
|
||||||
model = torch.load(fname_model, map_location="cpu")
|
|
||||||
process_and_write_variables(fout, model, ftype, part_id, n_parts)
|
|
||||||
del model
|
|
||||||
|
|
||||||
print(f"Done. Output file: {fname_out}\n")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
|
|
|
@ -1,100 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
# Original by https://github.com/eiz
|
|
||||||
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
|
|
||||||
import argparse
|
|
||||||
import glob
|
|
||||||
import os
|
|
||||||
import struct
|
|
||||||
import sys
|
|
||||||
from sentencepiece import SentencePieceProcessor
|
|
||||||
|
|
||||||
HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
|
|
||||||
|
|
||||||
def parse_args():
|
|
||||||
parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
|
|
||||||
parser.add_argument('dir_model', help='directory containing ggml .bin files')
|
|
||||||
parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
def read_header(f_in):
|
|
||||||
struct_fmt = "i" * (3 + len(HPARAMS))
|
|
||||||
struct_size = struct.calcsize(struct_fmt)
|
|
||||||
buf = f_in.read(struct_size)
|
|
||||||
return struct.unpack(struct_fmt, buf)
|
|
||||||
|
|
||||||
def write_header(f_out, header):
|
|
||||||
(magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
|
|
||||||
|
|
||||||
if magic != 0x67676d6c:
|
|
||||||
raise Exception('Invalid file magic. Must be an old style ggml file.')
|
|
||||||
|
|
||||||
values = [
|
|
||||||
0x67676d66, # magic: ggml in hex
|
|
||||||
1, # file version
|
|
||||||
vocab_size,
|
|
||||||
dim,
|
|
||||||
multiple_of,
|
|
||||||
n_heads,
|
|
||||||
n_layers,
|
|
||||||
rot,
|
|
||||||
ftype
|
|
||||||
]
|
|
||||||
f_out.write(struct.pack("i" * len(values), *values))
|
|
||||||
|
|
||||||
def write_tokens(fout, tokenizer):
|
|
||||||
for i in range(tokenizer.vocab_size()):
|
|
||||||
if tokenizer.is_unknown(i):
|
|
||||||
text = " \u2047 ".encode()
|
|
||||||
elif tokenizer.is_control(i):
|
|
||||||
text = b""
|
|
||||||
elif tokenizer.is_byte(i):
|
|
||||||
piece = tokenizer.id_to_piece(i)
|
|
||||||
if len(piece) != 6:
|
|
||||||
print(f"Invalid token: {piece}")
|
|
||||||
sys.exit(1)
|
|
||||||
byte_value = int(piece[3:-1], 16)
|
|
||||||
text = struct.pack("B", byte_value)
|
|
||||||
else:
|
|
||||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
|
|
||||||
fout.write(struct.pack("i", len(text)))
|
|
||||||
fout.write(text)
|
|
||||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
|
||||||
|
|
||||||
def read_tokens(f_in, tokenizer):
|
|
||||||
for i in range(tokenizer.vocab_size()):
|
|
||||||
len_b = f_in.read(4)
|
|
||||||
(length,) = struct.unpack("i", len_b)
|
|
||||||
f_in.read(length)
|
|
||||||
|
|
||||||
def copy_all_data(f_out, f_in):
|
|
||||||
while True:
|
|
||||||
buf = f_in.read(1024 * 1024)
|
|
||||||
if not buf:
|
|
||||||
break
|
|
||||||
f_out.write(buf)
|
|
||||||
|
|
||||||
def convert_one_file(path_in, tokenizer):
|
|
||||||
path_tmp = f"{path_in}.tmp"
|
|
||||||
path_orig= f"{path_in}.orig"
|
|
||||||
print(f"converting {path_in}")
|
|
||||||
with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
|
|
||||||
write_header(f_out, read_header(f_in))
|
|
||||||
read_tokens(f_in, tokenizer)
|
|
||||||
write_tokens(f_out, tokenizer)
|
|
||||||
copy_all_data(f_out, f_in)
|
|
||||||
os.rename(path_in, path_orig)
|
|
||||||
os.rename(path_tmp, path_in)
|
|
||||||
|
|
||||||
def main():
|
|
||||||
args = parse_args()
|
|
||||||
files = []
|
|
||||||
files.extend(glob.glob(f"{args.dir_model}/*.bin"))
|
|
||||||
files.extend(glob.glob(f"{args.dir_model}/*.bin.*"))
|
|
||||||
|
|
||||||
tokenizer = SentencePieceProcessor(args.tokenizer_model)
|
|
||||||
|
|
||||||
for file in files:
|
|
||||||
convert_one_file(file, tokenizer)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
1145
convert.py
Normal file
1145
convert.py
Normal file
File diff suppressed because it is too large
Load diff
|
@ -2,10 +2,13 @@
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include <iostream>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <iterator>
|
#include <iterator>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <regex>
|
||||||
|
|
||||||
#if defined (_WIN32)
|
#if defined (_WIN32)
|
||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
|
@ -23,6 +26,43 @@ extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int
|
||||||
#define CP_UTF8 65001
|
#define CP_UTF8 65001
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
void split_args(const std::string & args_string, std::vector<std::string> & output_args)
|
||||||
|
{
|
||||||
|
std::string current_arg = "";
|
||||||
|
bool in_quotes = false;
|
||||||
|
char quote_type;
|
||||||
|
|
||||||
|
for (char c : args_string) {
|
||||||
|
if (c == '"' || c == '\'') {
|
||||||
|
if (!in_quotes) {
|
||||||
|
in_quotes = true;
|
||||||
|
quote_type = c;
|
||||||
|
} else if (quote_type == c) {
|
||||||
|
in_quotes = false;
|
||||||
|
} else {
|
||||||
|
current_arg += c;
|
||||||
|
}
|
||||||
|
} else if (in_quotes) {
|
||||||
|
current_arg += c;
|
||||||
|
} else if (std::isspace(c)) {
|
||||||
|
if (current_arg != "") {
|
||||||
|
output_args.push_back(current_arg);
|
||||||
|
current_arg = "";
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
current_arg += c;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (current_arg != "") {
|
||||||
|
output_args.push_back(current_arg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string unescape(const std::string & str) {
|
||||||
|
return std::regex_replace(str, std::regex("\\\\n"), "\n");
|
||||||
|
}
|
||||||
|
|
||||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
// determine sensible default number of threads.
|
// determine sensible default number of threads.
|
||||||
// std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
|
// std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
|
||||||
|
@ -40,28 +80,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
std::string arg;
|
std::string arg;
|
||||||
gpt_params default_params;
|
gpt_params default_params;
|
||||||
|
|
||||||
|
// get additional arguments from config files
|
||||||
|
std::vector<std::string> args;
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
arg = argv[i];
|
arg = argv[i];
|
||||||
|
if (arg == "--config") {
|
||||||
if (arg == "-s" || arg == "--seed") {
|
|
||||||
if (++i >= argc) {
|
|
||||||
invalid_param = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
params.seed = std::stoi(argv[i]);
|
|
||||||
} else if (arg == "-t" || arg == "--threads") {
|
|
||||||
if (++i >= argc) {
|
|
||||||
invalid_param = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
params.n_threads = std::stoi(argv[i]);
|
|
||||||
} else if (arg == "-p" || arg == "--prompt") {
|
|
||||||
if (++i >= argc) {
|
|
||||||
invalid_param = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
params.prompt = argv[i];
|
|
||||||
} else if (arg == "-f" || arg == "--file") {
|
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
|
@ -72,85 +95,153 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
std::string args_string;
|
||||||
|
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(args_string));
|
||||||
|
if (args_string.back() == '\n') {
|
||||||
|
args_string.pop_back();
|
||||||
|
}
|
||||||
|
split_args(args_string, args);
|
||||||
|
for (int j = 0; j < args.size(); j++) {
|
||||||
|
args[j] = unescape(args[j]);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
args.emplace_back(argv[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// parse args
|
||||||
|
int args_c = static_cast<int>(args.size());
|
||||||
|
for (int i = 0; i < args_c && !invalid_param; i++) {
|
||||||
|
arg = args[i];
|
||||||
|
|
||||||
|
if (arg == "-s" || arg == "--seed") {
|
||||||
|
if (++i >= args_c) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.seed = std::stoi(args[i]);
|
||||||
|
} else if (arg == "-t" || arg == "--threads") {
|
||||||
|
if (++i >= args_c) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.n_threads = std::stoi(args[i]);
|
||||||
|
} else if (arg == "-p" || arg == "--prompt") {
|
||||||
|
if (++i >= args_c) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.prompt = args[i];
|
||||||
|
} else if (arg == "-f" || arg == "--file") {
|
||||||
|
if (++i >= args_c) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
std::ifstream file(args[i]);
|
||||||
|
if (!file) {
|
||||||
|
fprintf(stderr, "error: failed to open file '%s'\n", args[i].c_str());
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
||||||
if (params.prompt.back() == '\n') {
|
if (params.prompt.back() == '\n') {
|
||||||
params.prompt.pop_back();
|
params.prompt.pop_back();
|
||||||
}
|
}
|
||||||
} else if (arg == "-n" || arg == "--n_predict") {
|
} else if (arg == "-n" || arg == "--n_predict") {
|
||||||
if (++i >= argc) {
|
if (++i >= args_c) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.n_predict = std::stoi(argv[i]);
|
params.n_predict = std::stoi(args[i]);
|
||||||
} else if (arg == "--top_k") {
|
} else if (arg == "--top_k") {
|
||||||
if (++i >= argc) {
|
if (++i >= args_c) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.top_k = std::stoi(argv[i]);
|
params.top_k = std::stoi(args[i]);
|
||||||
} else if (arg == "-c" || arg == "--ctx_size") {
|
} else if (arg == "-c" || arg == "--ctx_size") {
|
||||||
if (++i >= argc) {
|
if (++i >= args_c) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.n_ctx = std::stoi(argv[i]);
|
params.n_ctx = std::stoi(args[i]);
|
||||||
} else if (arg == "--memory_f32") {
|
} else if (arg == "--memory_f32") {
|
||||||
params.memory_f16 = false;
|
params.memory_f16 = false;
|
||||||
} else if (arg == "--top_p") {
|
} else if (arg == "--top_p") {
|
||||||
if (++i >= argc) {
|
if (++i >= args_c) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.top_p = std::stof(argv[i]);
|
params.top_p = std::stof(args[i]);
|
||||||
} else if (arg == "--temp") {
|
} else if (arg == "--temp") {
|
||||||
if (++i >= argc) {
|
if (++i >= args_c) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.temp = std::stof(argv[i]);
|
params.temp = std::stof(args[i]);
|
||||||
} else if (arg == "--repeat_last_n") {
|
} else if (arg == "--repeat_last_n") {
|
||||||
if (++i >= argc) {
|
if (++i >= args_c) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.repeat_last_n = std::stoi(argv[i]);
|
params.repeat_last_n = std::stoi(args[i]);
|
||||||
} else if (arg == "--repeat_penalty") {
|
} else if (arg == "--repeat_penalty") {
|
||||||
if (++i >= argc) {
|
if (++i >= args_c) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.repeat_penalty = std::stof(argv[i]);
|
params.repeat_penalty = std::stof(args[i]);
|
||||||
} else if (arg == "-b" || arg == "--batch_size") {
|
} else if (arg == "-b" || arg == "--batch_size") {
|
||||||
if (++i >= argc) {
|
if (++i >= args_c) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.n_batch = std::stoi(argv[i]);
|
params.n_batch = std::stoi(args[i]);
|
||||||
params.n_batch = std::min(512, params.n_batch);
|
params.n_batch = std::min(512, params.n_batch);
|
||||||
} else if (arg == "--keep") {
|
} else if (arg == "--keep") {
|
||||||
if (++i >= argc) {
|
if (++i >= args_c) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.n_keep = std::stoi(argv[i]);
|
params.n_keep = std::stoi(args[i]);
|
||||||
} else if (arg == "-m" || arg == "--model") {
|
} else if (arg == "-m" || arg == "--model") {
|
||||||
if (++i >= argc) {
|
if (++i >= args_c) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.model = argv[i];
|
params.model = args[i];
|
||||||
} else if (arg == "-i" || arg == "--interactive") {
|
} else if (arg == "-i" || arg == "--interactive") {
|
||||||
params.interactive = true;
|
params.interactive = true;
|
||||||
} else if (arg == "--embedding") {
|
} else if (arg == "--embedding") {
|
||||||
params.embedding = true;
|
params.embedding = true;
|
||||||
|
} else if (arg == "--clean-interface") {
|
||||||
|
params.clean_interface = true;
|
||||||
} else if (arg == "--interactive-start") {
|
} else if (arg == "--interactive-start") {
|
||||||
params.interactive = true;
|
params.interactive = true;
|
||||||
} else if (arg == "--interactive-first") {
|
} else if (arg == "--interactive-first") {
|
||||||
params.interactive_start = true;
|
params.interactive_start = true;
|
||||||
} else if (arg == "-ins" || arg == "--instruct") {
|
} else if (arg == "-ins" || arg == "--instruct") {
|
||||||
params.instruct = true;
|
fprintf(stderr, "\n\nWarning: instruct mode is deprecated! Use: \n"
|
||||||
|
"--clean-interface "
|
||||||
|
"--interactive-first "
|
||||||
|
"--keep -1 "
|
||||||
|
"--ins-prefix-bos "
|
||||||
|
"--ins-prefix \"\\n\\n### Instruction:\\n\\n\" "
|
||||||
|
"--ins-suffix \"\\n\\n### Response:\\n\\n\" "
|
||||||
|
"-r \"### Instruction:\\n\\n\" "
|
||||||
|
"\n\n");
|
||||||
|
// params.instruct = true;
|
||||||
|
params.clean_interface = true;
|
||||||
|
params.interactive_start = true;
|
||||||
|
params.n_keep = -1;
|
+            params.instruct_prefix_bos = true;
+            params.instruct_prefix = "\n\n### Instruction:\n\n";
+            params.instruct_suffix = "\n\n### Response:\n\n";
+            params.antiprompt.push_back("### Instruction:\n\n");
         } else if (arg == "--color") {
             params.use_color = true;
+        } else if (arg == "--disable-multiline") {
+            params.multiline_mode = false;
         } else if (arg == "--mlock") {
             params.use_mlock = true;
         } else if (arg == "--no-mmap") {
@ -160,65 +251,94 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
-            if (++i >= argc) {
+            if (++i >= args_c) {
                 invalid_param = true;
                 break;
             }
-            params.antiprompt.push_back(argv[i]);
+            params.antiprompt.push_back(args[i]);
+        } else if (arg == "--stop-prompt") {
+            if (++i >= args_c) {
+                invalid_param = true;
+                break;
+            }
+            params.stopprompt.push_back(args[i]);
+        } else if (arg == "--rm-trailing-space-workaround") {
+            params.rm_trailing_space_workaround = true;
         } else if (arg == "--perplexity") {
             params.perplexity = true;
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--n_parts") {
-            if (++i >= argc) {
+            if (++i >= args_c) {
                 invalid_param = true;
                 break;
             }
-            params.n_parts = std::stoi(argv[i]);
+            params.n_parts = std::stoi(args[i]);
         } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(argc, argv, default_params);
+            gpt_print_usage(argv[0], default_params);
             exit(0);
         } else if (arg == "--random-prompt") {
             params.random_prompt = true;
         } else if (arg == "--in-prefix") {
-            if (++i >= argc) {
+            if (++i >= args_c) {
                 invalid_param = true;
                 break;
             }
-            params.input_prefix = argv[i];
+            params.input_prefix = args[i];
+        } else if (arg == "--ins-prefix-bos") {
+            params.instruct_prefix_bos = true;
+        } else if (arg == "--ins-prefix") {
+            if (++i >= args_c) {
+                invalid_param = true;
+                break;
+            }
+            params.instruct_prefix = args[i];
+        } else if (arg == "--ins-suffix-bos") {
+            params.instruct_suffix_bos = true;
+        } else if (arg == "--ins-suffix") {
+            if (++i >= args_c) {
+                invalid_param = true;
+                break;
+            }
+            params.instruct_suffix = args[i];
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(argc, argv, default_params);
+            gpt_print_usage(argv[0], default_params);
            exit(1);
         }
     }
     if (invalid_param) {
         fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(argc, argv, default_params);
+        gpt_print_usage(argv[0], default_params);
         exit(1);
     }

     return true;
 }
||||||
|
|
||||||
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
void gpt_print_usage(char * argv_0, const gpt_params & params) {
|
||||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
fprintf(stderr, "usage: %s [options]\n", argv_0);
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "options:\n");
|
fprintf(stderr, "options:\n");
|
||||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||||
fprintf(stderr, " -i, --interactive run in interactive mode\n");
|
fprintf(stderr, " -i, --interactive run in interactive mode\n");
|
||||||
fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n");
|
fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n");
|
||||||
fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
|
fprintf(stderr, " --clean-interface hides input prefix & suffix and displays '>' instead\n");
|
||||||
fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
|
fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
|
||||||
fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n");
|
fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n");
|
||||||
fprintf(stderr, " specified more than once for multiple prompts).\n");
|
fprintf(stderr, " specified more than once for multiple prompts).\n");
|
||||||
fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
|
fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
|
||||||
|
fprintf(stderr, " --disable-multiline disable multiline mode (use Ctrl+D on Linux/Mac and Ctrl+Z then Return on Windows to toggle multiline)\n");
|
||||||
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for <= 0)\n");
|
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for <= 0)\n");
|
||||||
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||||
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
|
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
|
||||||
fprintf(stderr, " prompt to start generation with (default: empty)\n");
|
fprintf(stderr, " prompt to start generation with (default: empty)\n");
|
||||||
fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
|
fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
|
||||||
fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
|
fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
|
||||||
|
fprintf(stderr, " --ins-prefix STRING (instruct) prefix user inputs with tokenized string (default: empty)\n");
|
||||||
|
fprintf(stderr, " --ins-prefix-bos (instruct) prepend bos token to instruct prefix.\n");
|
||||||
|
fprintf(stderr, " --ins-suffix STRING (instruct) suffix user inputs with tokenized string (default: empty)\n");
|
||||||
|
fprintf(stderr, " --ins-suffix-bos (instruct) prepend bos token to instruct suffix.\n");
|
||||||
fprintf(stderr, " -f FNAME, --file FNAME\n");
|
fprintf(stderr, " -f FNAME, --file FNAME\n");
|
||||||
fprintf(stderr, " prompt file to start generation.\n");
|
fprintf(stderr, " prompt file to start generation.\n");
|
||||||
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
|
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
|
||||||
|
@ -328,3 +448,61 @@ void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
|
||||||
str = strTo;
|
str = strTo;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
bool get_input_text(std::string & input_text, bool eof_toggled_multiline_mode) {
|
||||||
|
bool another_line = true;
|
||||||
|
bool is_eof_multiline_toggled = false;
|
||||||
|
do {
|
||||||
|
std::string line;
|
||||||
|
#if defined(_WIN32)
|
||||||
|
auto & stdcin = std::wcin;
|
||||||
|
std::wstring wline;
|
||||||
|
if (!std::getline(stdcin, wline)) {
|
||||||
|
// input stream is bad or EOF received
|
||||||
|
if (stdcin.bad()) {
|
||||||
|
fprintf(stderr, "%s: error: input stream bad\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
win32_utf8_encode(wline, line);
|
||||||
|
#else
|
||||||
|
auto & stdcin = std::cin;
|
||||||
|
if (!std::getline(stdcin, line)) {
|
||||||
|
// input stream is bad or EOF received
|
||||||
|
if (stdcin.bad()) {
|
||||||
|
fprintf(stderr, "%s: error: input stream bad\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
if (stdcin.eof()) {
|
||||||
|
stdcin.clear();
|
||||||
|
stdcin.seekg(0, std::ios::beg);
|
||||||
|
if (!eof_toggled_multiline_mode) {
|
||||||
|
another_line = false;
|
||||||
|
} else {
|
||||||
|
is_eof_multiline_toggled = !is_eof_multiline_toggled;
|
||||||
|
if (is_eof_multiline_toggled) {
|
||||||
|
input_text += line;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!eof_toggled_multiline_mode) {
|
||||||
|
if (line.empty() || line.back() != '\\') {
|
||||||
|
another_line = false;
|
||||||
|
} else {
|
||||||
|
line.pop_back(); // Remove the continue character
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (!is_eof_multiline_toggled) {
|
||||||
|
another_line = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
input_text += line;
|
||||||
|
if (another_line) {
|
||||||
|
input_text += '\n'; // Append the line to the result
|
||||||
|
}
|
||||||
|
} while (another_line);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
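For orientation, a minimal sketch of how the new get_input_text helper is meant to be driven from the interactive loop; the surrounding error handling is illustrative, only the helper's signature and the multiline_mode flag come from this patch:

    std::string buffer;
    // returns false when the input stream cannot be read; EOF toggles multiline mode
    if (!get_input_text(buffer, params.multiline_mode)) {
        return 1; // assumed caller: an interactive loop that aborts on bad input
    }
    // buffer now holds one submission: a single line, or several lines joined
    // with '\n' while multiline mode is toggled (Ctrl+D, or Ctrl+Z then Return on Windows)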
|
|
@ -15,13 +15,13 @@

 struct gpt_params {
     int32_t seed          = -1;  // RNG seed
-    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency()); // max 4 threads (default)
     int32_t n_predict     = 128; // new tokens to predict
     int32_t repeat_last_n = 64;  // last n tokens to penalize
     int32_t n_parts       = -1;  // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx         = 512; // context size
     int32_t n_batch       = 8;   // batch size for prompt processing
-    int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
+    int32_t n_keep        = 0;   // number of tokens to keep from initial prompt (-1 for all)

     // sampling parameters
     int32_t top_k = 40;
@ -33,8 +33,15 @@ struct gpt_params {
     std::string prompt       = "";
     std::string input_prefix = ""; // string to prefix user inputs with

+    std::string instruct_prefix = ""; // prefix user inputs with tokenized string
+    bool instruct_prefix_bos = false; // prepend bos token to instruct prefix
+    std::string instruct_suffix = ""; // suffix user inputs with tokenized string
+    bool instruct_suffix_bos = false; // prepend bos token to instruct suffix
+
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
+    std::vector<std::string> stopprompt; // string upon seeing which more user input is prompted (without adding instruct prefixes and suffixes)
+
+    bool rm_trailing_space_workaround = false; // workaround for removing trailing space from reverse/stop prompts

     bool memory_f16    = true;  // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
@ -51,11 +58,14 @@ struct gpt_params {
     bool use_mlock      = false; // use mlock to keep model in memory
     bool mem_test       = false; // compute maximum memory usage
     bool verbose_prompt = false; // print prompt tokens before generation
+
+    bool clean_interface = false; // hides input prefix & suffix and displays '>'
+    bool multiline_mode  = true;  // enables multi-line mode, to send input press CTRL+D on Linux/Max, Ctrl+Z then Return on Windows
 };

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

-void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+void gpt_print_usage(char * argv_0, const gpt_params & params);

 std::string gpt_random_prompt(std::mt19937 & rng);

@ -95,3 +105,5 @@ void set_console_color(console_state & con_st, console_color_t color);
 void win32_console_init(bool enable_color);
 void win32_utf8_encode(const std::wstring & wstr, std::string & str);
 #endif
+
+bool get_input_text(std::string & input_text, bool escape_newline_mode);
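A small usage sketch for the new fields; the prompt strings below are placeholders (they mirror the alpaca-style defaults set elsewhere in this patch), and only the field names and the instruct-mode test are taken from the code:

    gpt_params params;
    params.instruct_prefix     = "\n\n### Instruction:\n\n"; // placeholder text
    params.instruct_prefix_bos = true;                       // also prepend the bos token
    params.instruct_suffix     = "\n\n### Response:\n\n";    // placeholder text
    params.stopprompt.push_back("### Instruction:\n\n");     // pause generation without injecting prefix/suffix

    // main.cpp derives instruct mode from the presence of either string:
    bool instruct_mode = !params.instruct_prefix.empty() || !params.instruct_suffix.empty();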
@ -34,7 +34,8 @@ llama_context * ctx;
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 void sigint_handler(int signo) {
     set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
-    printf("\n"); // this also force flush stdout.
+    fflush(stdout);
+    fflush(stderr);
     if (signo == SIGINT) {
         if (!is_interacting) {
             is_interacting=true;
@ -144,6 +145,8 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

+    bool instruct_mode = !params.instruct_prefix.empty() || !params.instruct_suffix.empty();
+
     // params.prompt = R"(// this function checks if the number n is prime
     //bool is_prime(int n) {)";
|
||||||
|
@ -206,22 +209,20 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// number of tokens to keep when resetting context
|
// number of tokens to keep when resetting context
|
||||||
if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
|
if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size()) {
|
||||||
params.n_keep = (int)embd_inp.size();
|
params.n_keep = (int)embd_inp.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
// prefix & suffix for instruct mode
|
// prefix & suffix for instruct mode
|
||||||
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
|
const auto inp_pfx = ::llama_tokenize(ctx, params.instruct_prefix, params.instruct_prefix_bos);
|
||||||
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
|
std::string instruct_suffix = params.instruct_suffix;
|
||||||
|
if (params.rm_trailing_space_workaround) {
|
||||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
if (instruct_suffix.back() == ' ') { instruct_suffix.pop_back(); }
|
||||||
if (params.instruct) {
|
|
||||||
params.interactive_start = true;
|
|
||||||
params.antiprompt.push_back("###");
|
|
||||||
}
|
}
|
||||||
|
const auto inp_sfx = ::llama_tokenize(ctx, instruct_suffix, params.instruct_suffix_bos);
|
||||||
|
|
||||||
// enable interactive mode if reverse prompt or interactive start is specified
|
// enable interactive mode if reverse prompt or interactive start is specified
|
||||||
if (params.antiprompt.size() != 0 || params.interactive_start) {
|
if (params.antiprompt.size() != 0 || params.stopprompt.size() != 0 || params.interactive_start) {
|
||||||
params.interactive = true;
|
params.interactive = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -263,10 +264,21 @@ int main(int argc, char ** argv) {
|
||||||
fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
|
fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (params.stopprompt.size()) {
|
||||||
|
for (auto stopprompt : params.stopprompt) {
|
||||||
|
fprintf(stderr, "Stop prompt: '%s'\n", stopprompt.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!params.input_prefix.empty()) {
|
if (!params.input_prefix.empty()) {
|
||||||
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
|
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||||
}
|
}
|
||||||
|
if (!params.instruct_prefix.empty()) {
|
||||||
|
fprintf(stderr, "Instruct prefix %s: '%s'\n", params.instruct_prefix_bos ? "(with bos token)" : "", params.instruct_prefix.c_str());
|
||||||
|
}
|
||||||
|
if (!params.instruct_suffix.empty()) {
|
||||||
|
fprintf(stderr, "Instruct suffix %s: '%s'\n", params.instruct_suffix_bos ? "(with bos token)" : "", params.instruct_suffix.c_str());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
|
fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
|
||||||
params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
|
params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
|
||||||
|
@ -282,12 +294,29 @@ int main(int argc, char ** argv) {
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||||
" - Press Ctrl+C to interject at any time.\n"
|
" - Press Ctrl+C to interject at any time.\n"
|
||||||
#endif
|
#endif
|
||||||
" - Press Return to return control to LLaMa.\n"
|
);
|
||||||
|
if (params.multiline_mode) {
|
||||||
|
fprintf(stderr, " - Press Return to return control to LLaMa.\n"
|
||||||
|
#if defined (_WIN32)
|
||||||
|
" - [MULTILINE MODE] Press Ctrl+Z then Return (EOF) to toggle.\n\n");
|
||||||
|
#else
|
||||||
|
" - [MULTILINE MODE] Press Ctrl+D (EOF) to toggle.\n\n");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
fprintf(stderr, " - Press Return to return control to LLaMa.\n"
|
||||||
" - If you want to submit another line, end your input in '\\'.\n\n");
|
" - If you want to submit another line, end your input in '\\'.\n\n");
|
||||||
|
}
|
||||||
is_interacting = params.interactive_start;
|
is_interacting = params.interactive_start;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_antiprompt = false;
|
struct Antiprompt {
|
||||||
|
bool any = false;
|
||||||
|
bool trailing_space = false;
|
||||||
|
size_t len;
|
||||||
|
bool is_stop_prompt = false;
|
||||||
|
} antiprompt;
|
||||||
|
|
||||||
bool input_noecho = false;
|
bool input_noecho = false;
|
||||||
|
|
||||||
int n_past = 0;
|
int n_past = 0;
|
||||||
|
@ -357,7 +386,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// replace end of text token with newline token when in interactive mode
|
// replace end of text token with newline token when in interactive mode
|
||||||
if (id == llama_token_eos() && params.interactive && !params.instruct) {
|
if (id == llama_token_eos() && params.interactive && !instruct_mode) {
|
||||||
id = llama_token_newline.front();
|
id = llama_token_newline.front();
|
||||||
if (params.antiprompt.size() != 0) {
|
if (params.antiprompt.size() != 0) {
|
||||||
// tokenize and inject first reverse prompt
|
// tokenize and inject first reverse prompt
|
||||||
|
@ -405,27 +434,72 @@ int main(int argc, char ** argv) {
|
||||||
// check if we should prompt the user for more
|
// check if we should prompt the user for more
|
||||||
if (params.interactive && (int) embd_inp.size() <= n_consumed) {
|
if (params.interactive && (int) embd_inp.size() <= n_consumed) {
|
||||||
|
|
||||||
// check for reverse prompt
|
// check for reverse prompt or stop prompt
|
||||||
if (params.antiprompt.size()) {
|
if (params.antiprompt.size() || params.stopprompt.size()) {
|
||||||
std::string last_output;
|
std::string last_output;
|
||||||
for (auto id : last_n_tokens) {
|
for (auto id : last_n_tokens) {
|
||||||
last_output += llama_token_to_str(ctx, id);
|
last_output += llama_token_to_str(ctx, id);
|
||||||
}
|
}
|
||||||
|
|
||||||
is_antiprompt = false;
|
antiprompt.any = false;
|
||||||
|
antiprompt.is_stop_prompt = false;
|
||||||
// Check if each of the reverse prompts appears at the end of the output.
|
// Check if each of the reverse prompts appears at the end of the output.
|
||||||
for (std::string & antiprompt : params.antiprompt) {
|
for (std::string & prompt : params.antiprompt) {
|
||||||
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
|
if (params.rm_trailing_space_workaround) {
|
||||||
|
antiprompt.trailing_space = prompt.back() == ' ';
|
||||||
|
antiprompt.len = prompt.length() - (antiprompt.trailing_space ? 1 : 0);
|
||||||
|
}
|
||||||
|
if (last_output.find(prompt.c_str(), last_output.length() - antiprompt.len, antiprompt.len) != std::string::npos) {
|
||||||
is_interacting = true;
|
is_interacting = true;
|
||||||
is_antiprompt = true;
|
antiprompt.any = true;
|
||||||
|
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
|
||||||
|
fflush(stdout);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!antiprompt.any) {
|
||||||
|
for (std::string & prompt : params.stopprompt) {
|
||||||
|
if (params.rm_trailing_space_workaround) {
|
||||||
|
antiprompt.trailing_space = prompt.back() == ' ';
|
||||||
|
antiprompt.len = prompt.length() - (antiprompt.trailing_space ? 1 : 0);
|
||||||
|
}
|
||||||
|
if (last_output.find(prompt.c_str(), last_output.length() - antiprompt.len, antiprompt.len) != std::string::npos) {
|
||||||
|
is_interacting = true;
|
||||||
|
antiprompt.any = true;
|
||||||
|
antiprompt.is_stop_prompt = true;
|
||||||
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
|
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n_past > 0 && is_interacting)
|
||||||
|
{
|
||||||
|
std::string buffer;
|
||||||
|
if (!params.clean_interface && !params.instruct_prefix.empty() && !antiprompt.any) {
|
||||||
|
// avoid printing again user's new line (TODO: try to revert enter press and print newline)
|
||||||
|
int i = params.instruct_prefix.front() == '\n' ? 1 : 0;
|
||||||
|
for (; i < inp_pfx.size(); i++) {
|
||||||
|
printf("%s", llama_token_to_str(ctx, inp_pfx[i]));
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
if (params.rm_trailing_space_workaround) {
|
||||||
|
// add only if not stopprompt (as stopprompt could be used to pause
|
||||||
|
// assistant and then continue without input - adding back trailing
|
||||||
|
// space may mess it up.)
|
||||||
|
if (!antiprompt.is_stop_prompt && antiprompt.any && antiprompt.trailing_space) {
|
||||||
|
// add back removed trailing space to buffer(workaround)
|
||||||
|
buffer += ' ';
|
||||||
|
if (!params.clean_interface) {
|
||||||
|
printf("%s", buffer.c_str());
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (n_past > 0 && is_interacting) {
|
|
||||||
// potentially set color to indicate we are taking user input
|
// potentially set color to indicate we are taking user input
|
||||||
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
|
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
|
||||||
|
|
||||||
|
@ -434,43 +508,39 @@ int main(int argc, char ** argv) {
|
||||||
signal(SIGINT, sigint_handler);
|
signal(SIGINT, sigint_handler);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (params.instruct) {
|
if (params.clean_interface) {
|
||||||
printf("\n> ");
|
printf("\n> ");
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string buffer;
|
|
||||||
if (!params.input_prefix.empty()) {
|
if (!params.input_prefix.empty()) {
|
||||||
buffer += params.input_prefix;
|
buffer += params.input_prefix;
|
||||||
printf("%s", buffer.c_str());
|
printf("%s", buffer.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string line;
|
if (!get_input_text(buffer, params.multiline_mode)) {
|
||||||
bool another_line = true;
|
// input stream is bad
|
||||||
do {
|
return 1;
|
||||||
#if defined(_WIN32)
|
|
||||||
std::wstring wline;
|
|
||||||
if (!std::getline(std::wcin, wline)) {
|
|
||||||
// input stream is bad or EOF received
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
win32_utf8_encode(wline, line);
|
if (!antiprompt.is_stop_prompt) {
|
||||||
#else
|
buffer += "\n";
|
||||||
if (!std::getline(std::cin, line)) {
|
|
||||||
// input stream is bad or EOF received
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
if (line.empty() || line.back() != '\\') {
|
|
||||||
another_line = false;
|
|
||||||
} else {
|
|
||||||
line.pop_back(); // Remove the continue character
|
|
||||||
}
|
|
||||||
buffer += line + '\n'; // Append the line to the result
|
|
||||||
} while (another_line);
|
|
||||||
|
|
||||||
// done taking input, reset color
|
// done taking input, reset color
|
||||||
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
|
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
|
||||||
|
|
||||||
|
if (!params.clean_interface && !params.instruct_suffix.empty() && !antiprompt.is_stop_prompt) {
|
||||||
|
// avoid printing again user's new line (TODO: try to revert enter press and print newline)
|
||||||
|
int i = params.instruct_suffix.front() == '\n' ? 1 : 0;
|
||||||
|
for (; i < inp_sfx.size(); i++) {
|
||||||
|
printf("%s", llama_token_to_str(ctx, inp_sfx[i]));
|
||||||
|
}
|
||||||
|
// if (remove trailing space workaround) {
|
||||||
|
// We won't add back removed trailing space here, because assistant continues here,
|
||||||
|
// and it may mess up it's output (remove trailing space workaround).
|
||||||
|
// }
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
|
||||||
// Add tokens to embd only if the input buffer is non-empty
|
// Add tokens to embd only if the input buffer is non-empty
|
||||||
// Entering a empty line lets the user pass control back
|
// Entering a empty line lets the user pass control back
|
||||||
if (buffer.length() > 1) {
|
if (buffer.length() > 1) {
|
||||||
|
@ -478,8 +548,8 @@ int main(int argc, char ** argv) {
|
||||||
if (command(buffer, params, n_ctx) == 0) {
|
if (command(buffer, params, n_ctx) == 0) {
|
||||||
// this is not a command, run normally.
|
// this is not a command, run normally.
|
||||||
is_command = false;
|
is_command = false;
|
||||||
// instruct mode: insert instruction prefix
|
// insert input prefix
|
||||||
if (params.instruct && !is_antiprompt) {
|
if (!params.instruct_prefix.empty() && !antiprompt.any) {
|
||||||
n_consumed = embd_inp.size();
|
n_consumed = embd_inp.size();
|
||||||
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
|
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
|
||||||
}
|
}
|
||||||
|
@ -487,11 +557,9 @@ int main(int argc, char ** argv) {
|
||||||
auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
||||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||||
|
|
||||||
// instruct mode: insert response suffix
|
// insert response suffix
|
||||||
if (params.instruct) {
|
if (!params.instruct_suffix.empty() && !antiprompt.is_stop_prompt) {
|
||||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||||
}
|
|
||||||
|
|
||||||
n_remain -= line_inp.size();
|
n_remain -= line_inp.size();
|
||||||
} else {
|
} else {
|
||||||
// this was a command, so we need to stop anything more from printing.
|
// this was a command, so we need to stop anything more from printing.
|
||||||
|
@ -509,7 +577,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// end of text token
|
// end of text token
|
||||||
if (!embd.empty() && embd.back() == llama_token_eos()) {
|
if (!embd.empty() && embd.back() == llama_token_eos()) {
|
||||||
if (params.instruct) {
|
if (instruct_mode) {
|
||||||
is_interacting = true;
|
is_interacting = true;
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, " [end of text]\n");
|
fprintf(stderr, " [end of text]\n");
|
||||||
|
|
|
@ -10,7 +10,6 @@
     inherit system;
   };
   llama-python = pkgs.python310.withPackages (ps: with ps; [
-    torch
     numpy
     sentencepiece
   ]);
|
||||||
|
|
265
ggml.c
265
ggml.c
|
@ -2344,14 +2344,14 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
|
||||||
|
|
||||||
#if defined(__ARM_FEATURE_DOTPROD)
|
#if defined(__ARM_FEATURE_DOTPROD)
|
||||||
// dot product into int32x4_t
|
// dot product into int32x4_t
|
||||||
int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l);
|
uint32x4_t p_0 = vdotq_u32(vdupq_n_u32(0), v0_0l, v1_0l);
|
||||||
int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l);
|
uint32x4_t p_1 = vdotq_u32(vdupq_n_u32(0), v0_1l, v1_1l);
|
||||||
|
|
||||||
p_0 = vdotq_s32(p_0, v0_0h, v1_0h);
|
p_0 = vdotq_u32(p_0, v0_0h, v1_0h);
|
||||||
p_1 = vdotq_s32(p_1, v0_1h, v1_1h);
|
p_1 = vdotq_u32(p_1, v0_1h, v1_1h);
|
||||||
|
|
||||||
sum11 += x0->d*y0->d*vaddvq_s32(p_0);
|
sum11 += x0->d*y0->d*vaddvq_u32(p_0);
|
||||||
sum11 += x1->d*y1->d*vaddvq_s32(p_1);
|
sum11 += x1->d*y1->d*vaddvq_u32(p_1);
|
||||||
#else
|
#else
|
||||||
const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
|
const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
|
||||||
const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
|
const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
|
||||||
|
@ -2712,9 +2712,12 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
|
||||||
|
|
||||||
"FLASH_ATTN",
|
"FLASH_ATTN",
|
||||||
"FLASH_FF",
|
"FLASH_FF",
|
||||||
|
|
||||||
|
"MAP_UNARY",
|
||||||
|
"MAP_BINARY",
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36");
|
static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
|
||||||
|
|
||||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
"none",
|
"none",
|
||||||
|
@ -2757,9 +2760,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
|
|
||||||
"flash_attn(x)",
|
"flash_attn(x)",
|
||||||
"flash_ff(x)",
|
"flash_ff(x)",
|
||||||
|
|
||||||
|
"f(x)",
|
||||||
|
"f(x,y)",
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36");
|
static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
|
||||||
|
|
||||||
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
||||||
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
|
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
|
||||||
|
@ -3054,9 +3060,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1);
|
||||||
|
|
||||||
*ctx = (struct ggml_context) {
|
*ctx = (struct ggml_context) {
|
||||||
/*.mem_size =*/ params.mem_size,
|
/*.mem_size =*/ mem_size,
|
||||||
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(params.mem_size),
|
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
|
||||||
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
|
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
|
||||||
/*.no_alloc =*/ params.no_alloc,
|
/*.no_alloc =*/ params.no_alloc,
|
||||||
/*.n_objects =*/ 0,
|
/*.n_objects =*/ 0,
|
||||||
|
@ -3066,7 +3074,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
||||||
/*.scratch_save =*/ { 0, 0, NULL, },
|
/*.scratch_save =*/ { 0, 0, NULL, },
|
||||||
};
|
};
|
||||||
|
|
||||||
GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure
|
GGML_ASSERT(ctx->mem_buffer != NULL);
|
||||||
|
|
||||||
ggml_assert_aligned(ctx->mem_buffer);
|
ggml_assert_aligned(ctx->mem_buffer);
|
||||||
|
|
||||||
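A quick worked example of the alignment rounding added to ggml_init above, assuming GGML_MEM_ALIGN is 16 (the value is an assumption here, not restated by this hunk):

    // (x + align - 1) & ~(align - 1) rounds x up to the next multiple of align:
    //   100 -> (100 + 15) & ~15 = 112
    //   112 -> (112 + 15) & ~15 = 112   (already aligned, unchanged)
    size_t aligned = (100 + 16 - 1) & ~((size_t) 15); // == 112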
|
@ -4905,6 +4913,90 @@ struct ggml_tensor * ggml_flash_ff(
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ggml_map_unary
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_map_unary_impl_f32(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
const ggml_unary_op_f32_t fun,
|
||||||
|
bool inplace) {
|
||||||
|
bool is_node = false;
|
||||||
|
|
||||||
|
if (!inplace && a->grad) {
|
||||||
|
is_node = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
|
||||||
|
*((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
|
||||||
|
struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||||
|
|
||||||
|
result->op = GGML_OP_MAP_UNARY;
|
||||||
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||||
|
result->src0 = a;
|
||||||
|
result->opt[0] = addr_tensor;
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_map_unary_f32(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
const ggml_unary_op_f32_t fun) {
|
||||||
|
return ggml_map_unary_impl_f32(ctx, a, fun, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_map_unary_inplace_f32(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
const ggml_unary_op_f32_t fun) {
|
||||||
|
return ggml_map_unary_impl_f32(ctx, a, fun, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ggml_map_binary
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_map_binary_impl_f32(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
const ggml_binary_op_f32_t fun,
|
||||||
|
bool inplace) {
|
||||||
|
GGML_ASSERT(ggml_are_same_shape(a, b));
|
||||||
|
|
||||||
|
bool is_node = false;
|
||||||
|
|
||||||
|
if (!inplace && (a->grad || b->grad)) {
|
||||||
|
is_node = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
|
||||||
|
*((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
|
||||||
|
struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||||
|
|
||||||
|
result->op = GGML_OP_MAP_BINARY;
|
||||||
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||||
|
result->src0 = a;
|
||||||
|
result->src1 = b;
|
||||||
|
result->opt[0] = addr_tensor;
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_map_binary_f32(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
const ggml_binary_op_f32_t fun) {
|
||||||
|
return ggml_map_binary_impl_f32(ctx, a, b, fun, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_map_binary_inplace_f32(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
const ggml_binary_op_f32_t fun) {
|
||||||
|
return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
void ggml_set_param(
|
void ggml_set_param(
|
||||||
|
@ -7507,6 +7599,8 @@ static void ggml_compute_forward_rope_f32(
|
||||||
// row index used to determine which thread to use
|
// row index used to determine which thread to use
|
||||||
int ir = 0;
|
int ir = 0;
|
||||||
|
|
||||||
|
const float theta_scale = powf(10000.0, -2.0f/n_dims);
|
||||||
|
|
||||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||||
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
||||||
const int p = (mode == 0 ? n_past + i2 : i2);
|
const int p = (mode == 0 ? n_past + i2 : i2);
|
||||||
|
@ -7514,11 +7608,13 @@ static void ggml_compute_forward_rope_f32(
|
||||||
if (ir++ < ir0) continue;
|
if (ir++ < ir0) continue;
|
||||||
if (ir > ir1) break;
|
if (ir > ir1) break;
|
||||||
|
|
||||||
for (int i0 = 0; i0 < n_dims; i0 += 2) {
|
float theta = (float)p;
|
||||||
const float theta = powf(10000.0, ((float)-i0)/n_dims);
|
|
||||||
|
|
||||||
const float cos_theta = cosf(p*theta);
|
for (int i0 = 0; i0 < n_dims; i0 += 2) {
|
||||||
const float sin_theta = sinf(p*theta);
|
const float cos_theta = cosf(theta);
|
||||||
|
const float sin_theta = sinf(theta);
|
||||||
|
|
||||||
|
theta *= theta_scale;
|
||||||
|
|
||||||
const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
|
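The rope hunks above hoist the powf call out of the inner loop; a short sketch of why the incremental form matches the old per-element form (p and n_dims are arbitrary illustration values, the function name is made up):

    #include <math.h>

    static float max_theta_error(int p, int n_dims) {
        const float theta_scale = powf(10000.0f, -2.0f / n_dims); // as in the new code
        float theta = (float) p;                                   // new incremental form
        float worst = 0.0f;
        for (int i0 = 0; i0 < n_dims; i0 += 2) {
            const float direct = p * powf(10000.0f, -(float) i0 / n_dims); // old per-element form
            worst = fmaxf(worst, fabsf(theta - direct)); // agreement up to rounding
            theta *= theta_scale;
        }
        return worst;
    }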
@ -7580,6 +7676,8 @@ static void ggml_compute_forward_rope_f16(
|
||||||
// row index used to determine which thread to use
|
// row index used to determine which thread to use
|
||||||
int ir = 0;
|
int ir = 0;
|
||||||
|
|
||||||
|
const float theta_scale = powf(10000.0, -2.0f/n_dims);
|
||||||
|
|
||||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||||
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
||||||
const int p = (mode == 0 ? n_past + i2 : i2);
|
const int p = (mode == 0 ? n_past + i2 : i2);
|
||||||
|
@ -7587,11 +7685,13 @@ static void ggml_compute_forward_rope_f16(
|
||||||
if (ir++ < ir0) continue;
|
if (ir++ < ir0) continue;
|
||||||
if (ir > ir1) break;
|
if (ir > ir1) break;
|
||||||
|
|
||||||
for (int i0 = 0; i0 < n_dims; i0 += 2) {
|
float theta = (float)p;
|
||||||
const float theta = powf(10000.0, ((float)-i0)/n_dims);
|
|
||||||
|
|
||||||
const float cos_theta = cosf(p*theta);
|
for (int i0 = 0; i0 < n_dims; i0 += 2) {
|
||||||
const float sin_theta = sinf(p*theta);
|
const float cos_theta = cosf(theta);
|
||||||
|
const float sin_theta = sinf(theta);
|
||||||
|
|
||||||
|
theta *= theta_scale;
|
||||||
|
|
||||||
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
|
@ -8865,6 +8965,111 @@ static void ggml_compute_forward_flash_ff(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ggml_compute_forward_map_unary
|
||||||
|
|
||||||
|
static void ggml_compute_forward_map_unary_f32(
|
||||||
|
const struct ggml_compute_params * params,
|
||||||
|
const struct ggml_tensor * src0,
|
||||||
|
struct ggml_tensor * dst,
|
||||||
|
const ggml_unary_op_f32_t fun) {
|
||||||
|
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
||||||
|
|
||||||
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int n = ggml_nrows(src0);
|
||||||
|
const int nc = src0->ne[0];
|
||||||
|
|
||||||
|
assert( dst->nb[0] == sizeof(float));
|
||||||
|
assert(src0->nb[0] == sizeof(float));
|
||||||
|
|
||||||
|
for (int i = 0; i < n; i++) {
|
||||||
|
fun(nc,
|
||||||
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
||||||
|
(float *) ((char *) src0->data + i*(src0->nb[1])));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void ggml_compute_forward_map_unary(
|
||||||
|
const struct ggml_compute_params * params,
|
||||||
|
const struct ggml_tensor * src0,
|
||||||
|
struct ggml_tensor * dst,
|
||||||
|
const ggml_unary_op_f32_t fun) {
|
||||||
|
switch (src0->type) {
|
||||||
|
case GGML_TYPE_F32:
|
||||||
|
{
|
||||||
|
ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
|
||||||
|
} break;
|
||||||
|
case GGML_TYPE_Q4_0:
|
||||||
|
case GGML_TYPE_Q4_1:
|
||||||
|
case GGML_TYPE_I8:
|
||||||
|
case GGML_TYPE_I16:
|
||||||
|
case GGML_TYPE_I32:
|
||||||
|
case GGML_TYPE_F16:
|
||||||
|
case GGML_TYPE_COUNT:
|
||||||
|
{
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
} break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ggml_compute_forward_map_binary
|
||||||
|
|
||||||
|
static void ggml_compute_forward_map_binary_f32(
|
||||||
|
const struct ggml_compute_params * params,
|
||||||
|
const struct ggml_tensor * src0,
|
||||||
|
const struct ggml_tensor * src1,
|
||||||
|
struct ggml_tensor * dst,
|
||||||
|
const ggml_binary_op_f32_t fun) {
|
||||||
|
assert(params->ith == 0);
|
||||||
|
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
|
||||||
|
|
||||||
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int n = ggml_nrows(src0);
|
||||||
|
const int nc = src0->ne[0];
|
||||||
|
|
||||||
|
assert( dst->nb[0] == sizeof(float));
|
||||||
|
assert(src0->nb[0] == sizeof(float));
|
||||||
|
assert(src1->nb[0] == sizeof(float));
|
||||||
|
|
||||||
|
for (int i = 0; i < n; i++) {
|
||||||
|
fun(nc,
|
||||||
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
||||||
|
(float *) ((char *) src0->data + i*(src0->nb[1])),
|
||||||
|
(float *) ((char *) src1->data + i*(src1->nb[1])));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void ggml_compute_forward_map_binary(
|
||||||
|
const struct ggml_compute_params * params,
|
||||||
|
const struct ggml_tensor * src0,
|
||||||
|
const struct ggml_tensor * src1,
|
||||||
|
struct ggml_tensor * dst,
|
||||||
|
const ggml_binary_op_f32_t fun) {
|
||||||
|
switch (src0->type) {
|
||||||
|
case GGML_TYPE_F32:
|
||||||
|
{
|
||||||
|
ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun);
|
||||||
|
} break;
|
||||||
|
case GGML_TYPE_Q4_0:
|
||||||
|
case GGML_TYPE_Q4_1:
|
||||||
|
case GGML_TYPE_I8:
|
||||||
|
case GGML_TYPE_I16:
|
||||||
|
case GGML_TYPE_I32:
|
||||||
|
case GGML_TYPE_F16:
|
||||||
|
case GGML_TYPE_COUNT:
|
||||||
|
{
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
} break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/////////////////////////////////
|
/////////////////////////////////
|
||||||
|
|
||||||
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
||||||
|
@ -9014,6 +9219,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||||
{
|
{
|
||||||
ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor);
|
ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor);
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_OP_MAP_UNARY:
|
||||||
|
{
|
||||||
|
const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
|
||||||
|
ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case GGML_OP_MAP_BINARY:
|
||||||
|
{
|
||||||
|
const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->opt[0]->data);
|
||||||
|
ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
|
||||||
|
}
|
||||||
|
break;
|
||||||
case GGML_OP_NONE:
|
case GGML_OP_NONE:
|
||||||
{
|
{
|
||||||
// nop
|
// nop
|
||||||
|
@ -9273,6 +9490,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||||
{
|
{
|
||||||
GGML_ASSERT(false); // not supported
|
GGML_ASSERT(false); // not supported
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_OP_MAP_UNARY:
|
||||||
|
case GGML_OP_MAP_BINARY:
|
||||||
|
{
|
||||||
|
GGML_ASSERT(false); // not supported
|
||||||
|
} break;
|
||||||
case GGML_OP_NONE:
|
case GGML_OP_NONE:
|
||||||
{
|
{
|
||||||
// nop
|
// nop
|
||||||
|
@ -9765,6 +9987,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
||||||
|
|
||||||
work_size = MAX(work_size, cur);
|
work_size = MAX(work_size, cur);
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_OP_MAP_UNARY:
|
||||||
|
case GGML_OP_MAP_BINARY:
|
||||||
|
{
|
||||||
|
node->n_tasks = 1;
|
||||||
|
} break;
|
||||||
case GGML_OP_NONE:
|
case GGML_OP_NONE:
|
||||||
{
|
{
|
||||||
node->n_tasks = 1;
|
node->n_tasks = 1;
|
||||||
|
|
18
ggml.h
18
ggml.h
|
@ -253,6 +253,9 @@ enum ggml_op {
|
||||||
GGML_OP_FLASH_ATTN,
|
GGML_OP_FLASH_ATTN,
|
||||||
GGML_OP_FLASH_FF,
|
GGML_OP_FLASH_FF,
|
||||||
|
|
||||||
|
GGML_OP_MAP_UNARY,
|
||||||
|
GGML_OP_MAP_BINARY,
|
||||||
|
|
||||||
GGML_OP_COUNT,
|
GGML_OP_COUNT,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -652,6 +655,21 @@ struct ggml_tensor * ggml_flash_ff(
|
||||||
struct ggml_tensor * c0,
|
struct ggml_tensor * c0,
|
||||||
struct ggml_tensor * c1);
|
struct ggml_tensor * c1);
|
||||||
|
|
||||||
|
// Mapping operations
|
||||||
|
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
|
||||||
|
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_map_unary_f32(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
const ggml_unary_op_f32_t fun);
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_map_binary_f32(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
const ggml_binary_op_f32_t fun);
|
||||||
|
|
||||||
//
|
//
|
||||||
// automatic differentiation
|
// automatic differentiation
|
||||||
//
|
//
|
||||||
|
|
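A brief usage sketch for the mapping operators declared above; the max callback and the tensor sizes are illustrative, while the callback signature and ggml_map_binary_f32 come from this header:

    // Matches ggml_binary_op_f32_t: (element count, dst row, src0 row, src1 row).
    static void vec_max_f32(const int n, float * dst, const float * x, const float * y) {
        for (int i = 0; i < n; ++i) {
            dst[i] = x[i] > y[i] ? x[i] : y[i];
        }
    }

    // Hypothetical graph construction (ctx and the 1-d size are assumptions):
    //   struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
    //   struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
    //   struct ggml_tensor * m = ggml_map_binary_f32(ctx, a, b, vec_max_f32);
    // The callback is then invoked row by row during the forward pass.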
|
@ -1,311 +0,0 @@
|
||||||
# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
|
|
||||||
#
|
|
||||||
# We caused a breaking change to the file format on 2023-03-30 in:
|
|
||||||
# https://github.com/ggerganov/llama.cpp/pull/613
|
|
||||||
#
|
|
||||||
# (1) If you still have the Meta LLaMA .pth files, then close this
|
|
||||||
# file now; you can just run `convert-pth-to-ggml.py` again to
|
|
||||||
# migrate to the new format. The tool is easier to use too. It
|
|
||||||
# isn't necessary anymore to manage split output files because
|
|
||||||
# the new format always combines things into a single file.
|
|
||||||
#
|
|
||||||
# (2) If you deleted the Meta LLaMA .pth files due to save on disk
|
|
||||||
# space, then this tool is intended to help you. Please check
|
|
||||||
# out the instructions below.
|
|
||||||
#
|
|
||||||
# USAGE
|
|
||||||
#
|
|
||||||
# python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
|
|
||||||
#
|
|
||||||
# PREREQUISITES
|
|
||||||
#
|
|
||||||
# pip install numpy
|
|
||||||
# cd llama.cpp
|
|
||||||
# make -j4
|
|
||||||
#
|
|
||||||
# EXAMPLE (7B MODEL)
|
|
||||||
#
|
|
||||||
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
|
|
||||||
# python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
|
|
||||||
#
|
|
||||||
# # check that it works
|
|
||||||
# ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
|
|
||||||
#
|
|
||||||
# # you can delete the old files
|
|
||||||
# rm -f models/7B/ggml-model-f16.bin
|
|
||||||
# mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
|
|
||||||
#
|
|
||||||
# EXAMPLE (13B MODEL)
|
|
||||||
#
|
|
||||||
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
|
|
||||||
# python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
|
|
||||||
#
|
|
||||||
# # check that it works
|
|
||||||
# ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
|
|
||||||
#
|
|
||||||
# # you can delete the old files
|
|
||||||
# rm -f models/13B/ggml-model-f16.bin*
|
|
||||||
# mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
|
|
||||||
#
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import json
|
|
||||||
import struct
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
QK = 32
|
|
||||||
|
|
||||||
GGML_TYPE_Q4_0 = 0
|
|
||||||
GGML_TYPE_Q4_1 = 1
|
|
||||||
GGML_TYPE_I8 = 2
|
|
||||||
GGML_TYPE_I16 = 3
|
|
||||||
GGML_TYPE_I32 = 4
|
|
||||||
GGML_TYPE_F16 = 5
|
|
||||||
GGML_TYPE_F32 = 6
|
|
||||||
|
|
||||||
WTYPE_NAMES = {
|
|
||||||
0: "F32",
|
|
||||||
1: "F16",
|
|
||||||
2: "Q4_0",
|
|
||||||
3: "Q4_1",
|
|
||||||
}
|
|
||||||
|
|
||||||
WTYPES = {
|
|
||||||
0: GGML_TYPE_F32,
|
|
||||||
1: GGML_TYPE_F16,
|
|
||||||
2: GGML_TYPE_Q4_0,
|
|
||||||
3: GGML_TYPE_Q4_1,
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_BLCK_SIZE = {
|
|
||||||
GGML_TYPE_Q4_0: QK,
|
|
||||||
GGML_TYPE_Q4_1: QK,
|
|
||||||
GGML_TYPE_I8: 1,
|
|
||||||
GGML_TYPE_I16: 1,
|
|
||||||
GGML_TYPE_I32: 1,
|
|
||||||
GGML_TYPE_F16: 1,
|
|
||||||
GGML_TYPE_F32: 1,
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_TYPE_SIZE = {
|
|
||||||
GGML_TYPE_Q4_0: 4 + QK//2,
|
|
||||||
GGML_TYPE_Q4_1: 4*2 + QK//2,
|
|
||||||
GGML_TYPE_I8: 1,
|
|
||||||
GGML_TYPE_I16: 2,
|
|
||||||
GGML_TYPE_I32: 4,
|
|
||||||
GGML_TYPE_F16: 2,
|
|
||||||
GGML_TYPE_F32: 4,
|
|
||||||
}
|
|
||||||
|
|
||||||
HPARAMS = [
|
|
||||||
'magic', # int32
|
|
||||||
'version', # int32
|
|
||||||
'n_vocab', # int32
|
|
||||||
'n_embd', # int32
|
|
||||||
'n_mult', # int32
|
|
||||||
'n_head', # int32
|
|
||||||
'n_layer', # int32
|
|
||||||
'n_rot', # int32
|
|
||||||
'f16', # int32
|
|
||||||
]
|
|
||||||
|
|
||||||
def read_hparams(fin):
|
|
||||||
struct_fmt = "i" * len(HPARAMS)
|
|
||||||
struct_size = struct.calcsize(struct_fmt)
|
|
||||||
buf = fin.read(struct_size)
|
|
||||||
ints = struct.unpack(struct_fmt, buf)
|
|
||||||
hparams = dict(zip(HPARAMS, ints))
|
|
||||||
return hparams
|
|
||||||
|
|
||||||
def write_hparams(fout, hparams):
|
|
||||||
struct_fmt = "i" * len(HPARAMS)
|
|
||||||
struct_size = struct.calcsize(struct_fmt)
|
|
||||||
ints = [hparams[h] for h in HPARAMS]
|
|
||||||
fout.write(struct.pack(struct_fmt, *ints))
|
|
||||||
|
|
||||||
def read_tokens(fin, hparams):
|
|
||||||
tokens = []
|
|
||||||
for i in range(hparams['n_vocab']):
|
|
||||||
len_b = fin.read(4)
|
|
||||||
(length,) = struct.unpack("i", len_b)
|
|
||||||
word = fin.read(length)
|
|
||||||
score_b = fin.read(4)
|
|
||||||
(score,) = struct.unpack("f", score_b)
|
|
||||||
tokens.append((word, score))
|
|
||||||
return tokens
|
|
||||||
|
|
||||||
def write_tokens(fout, tokens):
|
|
||||||
for word, score in tokens:
|
|
||||||
fout.write(struct.pack("i", len(word)))
|
|
||||||
fout.write(word)
|
|
||||||
fout.write(struct.pack("f", score))
|
|
||||||
|
|
||||||
def ggml_nelements(shape):
|
|
||||||
r = 1
|
|
||||||
for i in shape:
|
|
||||||
r *= i
|
|
||||||
return r
|
|
||||||
|
|
||||||
def ggml_nbytes(shape, ftype):
|
|
||||||
x = ggml_nelements(shape)
|
|
||||||
t = WTYPES[ftype]
|
|
||||||
x *= GGML_TYPE_SIZE[t]
|
|
||||||
x //= GGML_BLCK_SIZE[t]
|
|
||||||
return x
|
|
||||||
|
|
||||||
def copy_tensors(fin, fout, part_id, n_parts):
|
|
||||||
while True:
|
|
||||||
|
|
||||||
b = fin.read(4)
|
|
||||||
if not b: break
|
|
||||||
(n_dims,) = struct.unpack("i", b)
|
|
||||||
b = fin.read(4)
|
|
||||||
(length,) = struct.unpack("i", b)
|
|
||||||
b = fin.read(4)
|
|
||||||
(ftype,) = struct.unpack("i", b)
|
|
||||||
|
|
||||||
assert n_dims in (1, 2)
|
|
||||||
|
|
||||||
partshape = list(range(n_dims))
|
|
||||||
for i in range(n_dims):
|
|
||||||
b = fin.read(4)
|
|
||||||
partshape[i] = struct.unpack("i", b)[0]
|
|
||||||
partshape = list(reversed(partshape))
|
|
||||||
|
|
||||||
name = fin.read(length)
|
|
||||||
data = fin.read(ggml_nbytes(partshape, ftype))
|
|
||||||
|
|
||||||
blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
|
|
||||||
type_size = GGML_TYPE_SIZE[WTYPES[ftype]]
|
|
||||||
|
|
||||||
print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")
|
|
||||||
|
|
||||||
# determine dimension along which multipart tensor is sharded
|
|
||||||
#
|
|
||||||
# split_dim 0 regex:
|
|
||||||
# - output.*
|
|
||||||
# - layers.*.attention.wq.weight
|
|
||||||
# - layers.*.attention.wk.weight
|
|
||||||
# - layers.*.attention.wv.weight
|
|
||||||
# - layers.*.feed_forward.w1.weight
|
|
||||||
# - layers.*.feed_forward.w3.weight
|
|
||||||
#
|
|
||||||
# split_dim 1 regex:
|
|
||||||
# - tok_embeddings.*
|
|
||||||
# - layers.*.attention.wo.weight
|
|
||||||
# - layers.*.feed_forward.w2.weight
|
|
||||||
#
|
|
||||||
if n_dims > 1:
|
|
||||||
split_dim = 1
|
|
||||||
if b"tok_embeddings" in name:
|
|
||||||
split_dim = 1
|
|
||||||
elif b"layers" in name:
|
|
||||||
if b"attention.wo.weight" in name:
|
|
||||||
split_dim = 1
|
|
||||||
elif b"feed_forward.w2.weight" in name:
|
|
||||||
split_dim = 1
|
|
||||||
else:
|
|
||||||
split_dim = 0
|
|
||||||
elif b"output" in name:
|
|
||||||
split_dim = 0
|
|
||||||
|
|
||||||
# output tensor header
|
|
||||||
fullshape = list(partshape)
|
|
||||||
if n_dims > 1:
|
|
||||||
fullshape[split_dim] *= n_parts
|
|
||||||
fout.write(struct.pack("iii", n_dims, len(name), ftype))
|
|
||||||
for dim in reversed(fullshape):
|
|
||||||
fout.write(struct.pack("i", dim))
|
|
||||||
fout.write(name)
|
|
||||||
|
|
||||||
# ensure tensor data is aligned
|
|
||||||
tensor_data_offset = fout.tell()
|
|
||||||
while tensor_data_offset % QK != 0:
|
|
||||||
fout.write(struct.pack("B", 0))
|
|
||||||
tensor_data_offset += 1
|
|
||||||
|
|
||||||
# output unified mappable tensor data
|
|
||||||
if n_dims == 1 or n_parts == 1:
|
|
||||||
# copy tensor which we thankfully received in one piece
|
|
||||||
if part_id == 0:
|
|
||||||
fout.write(data)
|
|
||||||
elif split_dim == 0:
|
|
||||||
# reassemble multifile tensor containing some of the rows
|
|
||||||
rows_per_chunk = partshape[0]
|
|
||||||
current_row = part_id * rows_per_chunk
|
|
||||||
bytes_per_row = fullshape[1] // blck_size * type_size
|
|
||||||
offset = current_row * bytes_per_row
|
|
||||||
fout.seek(tensor_data_offset + offset)
|
|
||||||
fout.write(data)
|
|
||||||
elif split_dim == 1:
|
|
||||||
# reassemble multifile tensor containing some of the cols
|
|
||||||
cols_per_chunk = partshape[1]
|
|
||||||
current_col = part_id * cols_per_chunk
|
|
||||||
bpr = partshape[1] // blck_size * type_size
|
|
||||||
bytes_per_row = fullshape[1] // blck_size * type_size
|
|
||||||
offset_current_col = current_col // blck_size * type_size
|
|
||||||
for row in range(partshape[0]):
|
|
||||||
offset_row = row * bytes_per_row
|
|
||||||
offset = offset_row + offset_current_col
|
|
||||||
fout.seek(tensor_data_offset + offset)
|
|
||||||
fout.write(data[row * bpr:row * bpr + bpr])
|
|
||||||
|
|
||||||
# advance file position to next tensor
|
|
||||||
fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
|
|
||||||
|
|
||||||
def parse_args():
|
|
||||||
parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
|
|
||||||
parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)')
|
|
||||||
parser.add_argument('fout_path', help='your new ggjt file name')
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
def main():
|
|
||||||
args = parse_args()
|
|
||||||
assert args.fin_path
|
|
||||||
assert args.fout_path
|
|
||||||
assert args.fin_path != args.fout_path
|
|
||||||
|
|
||||||
with open(args.fin_path, "rb") as fin:
|
|
||||||
hparams = read_hparams(fin)
|
|
||||||
tokens = read_tokens(fin, hparams)
|
|
||||||
|
|
||||||
if hparams['magic'] == 0x67676a74: # ggjt
|
|
||||||
print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
if hparams['magic'] != 0x67676d66: # ggmf
|
|
||||||
print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
hparams['magic'] = 0x67676a74 # ggjt
|
|
||||||
|
|
||||||
# count number of multipart files by convention
|
|
||||||
n_parts = 1
|
|
||||||
while True:
|
|
||||||
if os.path.exists(f"{args.fin_path}.{n_parts}"):
|
|
||||||
n_parts += 1
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
|
|
||||||
# we output a single file for ggml
|
|
||||||
with open(args.fout_path, "wb") as fout:
|
|
||||||
write_hparams(fout, hparams)
|
|
||||||
write_tokens(fout, tokens)
|
|
||||||
offset_of_tensors = fout.tell()
|
|
||||||
# the tensors we load could be split across multiple files
|
|
||||||
for part_id in range(n_parts):
|
|
||||||
fout.seek(offset_of_tensors)
|
|
||||||
print(f"Processing part {part_id+1} of {n_parts}\n")
|
|
||||||
fin_path = args.fin_path
|
|
||||||
if part_id > 0:
|
|
||||||
fin_path += f".{part_id}"
|
|
||||||
with open(fin_path, "rb") as fin:
|
|
||||||
read_tokens(fin, read_hparams(fin))
|
|
||||||
copy_tensors(fin, fout, part_id, n_parts)
|
|
||||||
|
|
||||||
print(f"Done. Output file: {args.fout_path}\n")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
|
@ -1 +0,0 @@
-Below is an instruction that describes a task. Write a response that appropriately completes the request.

@ -1,7 +0,0 @@
-Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
-
-User: Hello, Bob.
-Bob: Hello. How may I help you today?
-User: Please tell me the largest city in Europe.
-Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
-User:
2 requirements.txt Normal file
@ -0,0 +1,2 @@
+numpy==1.24
+sentencepiece==0.1.97