diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index b6e21b4ec..84faad37a 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -197,6 +197,8 @@ jobs:
strategy:
matrix:
include:
+ - build: 'noavx'
+ defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
- build: 'avx2'
defines: '-DLLAMA_BUILD_SERVER=ON'
- build: 'avx'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index abc96814d..c43e65e74 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -357,6 +357,7 @@ if (LLAMA_ALL_WARNINGS)
-Wshadow
-Wstrict-prototypes
-Wpointer-arith
+ -Wmissing-prototypes
)
set(cxx_flags
-Wall
diff --git a/Makefile b/Makefile
index fb7c27cd9..2035c5253 100644
--- a/Makefile
+++ b/Makefile
@@ -63,7 +63,8 @@ ifdef LLAMA_SERVER_VERBOSE
endif
# warnings
-CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
+CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
+ -Wmissing-prototypes
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
# OS specific
diff --git a/convert.py b/convert.py
index 8d7af06d1..ac99579c4 100755
--- a/convert.py
+++ b/convert.py
@@ -234,14 +234,21 @@ class Params:
class SentencePieceVocab:
- def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
- self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
+ self.vocabtype = vocabtype
+ if self.vocabtype == "bpe":
+ self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
+ else:
+ self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
added_tokens: Dict[str, int]
if fname_added_tokens is not None:
added_tokens = json.load(open(fname_added_tokens))
else:
added_tokens = {}
- vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+ if self.vocabtype == "bpe":
+ vocab_size: int = len(self.sentencepiece_tokenizer)
+ else:
+ vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
@@ -255,22 +262,32 @@ class SentencePieceVocab:
def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
tokenizer = self.sentencepiece_tokenizer
- for i in range(tokenizer.vocab_size()):
+ if self.vocabtype == "bpe":
+ from transformers.models.gpt2 import tokenization_gpt2
+ byte_encoder = tokenization_gpt2.bytes_to_unicode()
+ byte_decoder = {v: k for k, v in byte_encoder.items()}
+ for i, item in enumerate(tokenizer):
text: bytes
- if tokenizer.is_unknown(i):
- text = " \u2047 ".encode("utf-8")
- elif tokenizer.is_control(i):
- text = b""
- elif tokenizer.is_byte(i):
- piece = tokenizer.id_to_piece(i)
- if len(piece) != 6:
- raise Exception(f"Invalid token: {piece}")
- byte_value = int(piece[3:-1], 16)
- text = struct.pack("B", byte_value)
- else:
- text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
- score: float = tokenizer.get_score(i)
+ text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
+ score: float = -i
yield text, score
+ else:
+ for i in range(tokenizer.vocab_size()):
+ text: bytes
+ if tokenizer.is_unknown(i):
+ text = " \u2047 ".encode("utf-8")
+ elif tokenizer.is_control(i):
+ text = b""
+ elif tokenizer.is_byte(i):
+ piece = tokenizer.id_to_piece(i)
+ if len(piece) != 6:
+ raise Exception(f"Invalid token: {piece}")
+ byte_value = int(piece[3:-1], 16)
+ text = struct.pack("B", byte_value)
+ else:
+ text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+ score: float = tokenizer.get_score(i)
+ yield text, score
def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
for text in self.added_tokens_list:
@@ -1196,14 +1213,18 @@ def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
return {name: model[name] for name in TENSORS_LIST if name in model}
-def load_vocab(path: Path) -> SentencePieceVocab:
+def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
+ print(f"vocabtype: {vocabtype}")
# Be extra-friendly and accept either a file or a directory. Also, if it's
# a directory, it might be the model directory, and tokenizer.model might
# be in the parent of that.
if path.is_dir():
- path2 = path / "tokenizer.model"
+ vocab_file = "tokenizer.model"
+ if vocabtype == 'bpe':
+ vocab_file = "vocab.json"
+ path2 = path / vocab_file
# Use `.parent` instead of /.. to handle the symlink case better.
- path3 = path.parent / "tokenizer.model"
+ path3 = path.parent / vocab_file
if path2.exists():
path = path2
elif path3.exists():
@@ -1214,7 +1235,8 @@ def load_vocab(path: Path) -> SentencePieceVocab:
"if it's in another directory, pass the directory as --vocab-dir")
added_tokens_path = path.parent / "added_tokens.json"
print(f"Loading vocab file {path}")
- return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+ return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
+ vocabtype)
def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
@@ -1252,6 +1274,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path,
help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+ parser.add_argument("--vocabtype", default='spm', choices=["spm", "bpe"], help="vocab format (default: spm)")
args = parser.parse_args(args_in)
vocab: Vocab
@@ -1259,7 +1282,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
model_plus = lazy_load_file(args.model)
do_dump_model(model_plus)
elif args.vocab_only:
- vocab = load_vocab(args.vocab_dir or args.model)
+ vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
assert args.outfile, "need --outfile if using --vocab-only"
outfile = args.outfile
OutputFile.write_vocab_only(outfile, vocab)
@@ -1273,7 +1296,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
vocab = model_plus.vocab
else:
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
- vocab = load_vocab(vocab_dir)
+ vocab = load_vocab(vocab_dir, args.vocabtype)
params = Params.load(model_plus)
model = model_plus.model
model = do_necessary_conversions(model, params)
diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index f9dc0aaa6..6fa55b319 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -8,7 +8,11 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
-static const float rms_norm_eps = 1e-6f;
+#ifdef LLAMA_DEFAULT_RMS_EPS
+static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
+#else
+static const float rms_norm_eps = 5e-6f;
+#endif
float frand() {
return (float)rand()/(float)RAND_MAX;
diff --git a/examples/common.cpp b/examples/common.cpp
index 0e88a128a..dd964c8a7 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -432,6 +432,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
exit(0);
} else if (arg == "--random-prompt") {
params.random_prompt = true;
+ } else if (arg == "--in-prefix-bos") {
+ params.input_prefix_bos = true;
} else if (arg == "--in-prefix") {
if (++i >= argc) {
invalid_param = true;
@@ -517,6 +519,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " not supported with --interactive or other interactive options\n");
fprintf(stdout, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
fprintf(stdout, " --random-prompt start with a randomized prompt.\n");
+ fprintf(stdout, " --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
fprintf(stdout, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
fprintf(stdout, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
fprintf(stdout, " -f FNAME, --file FNAME\n");
diff --git a/examples/common.h b/examples/common.h
index 894a0850a..672dcf77c 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -34,7 +34,7 @@ struct gpt_params {
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
- float rms_norm_eps = 1e-6; // rms norm epsilon
+ float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
float rope_freq_base = 10000.0f; // RoPE base frequency
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
@@ -82,6 +82,7 @@ struct gpt_params {
bool interactive_first = false; // wait for user input immediately
bool multiline_input = false; // reverse the usage of `\`
+ bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool instruct = false; // instruction mode (used for Alpaca models)
bool penalize_nl = true; // consider newlines as a repeatable token
bool perplexity = false; // compute perplexity over the prompt
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 16ddc2274..3796a9230 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -325,6 +325,10 @@ int main(int argc, char ** argv) {
}
}
+ if (params.input_prefix_bos) {
+ fprintf(stderr, "Input prefix with BOS\n");
+ }
+
if (!params.input_prefix.empty()) {
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
}
@@ -633,16 +637,6 @@ int main(int argc, char ** argv) {
last_n_tokens.push_back(id);
}
- // replace end of text token with newline token when in interactive mode
- if (id == llama_token_eos() && params.interactive && !params.instruct) {
- id = llama_token_newline.front();
- if (params.antiprompt.size() != 0) {
- // tokenize and inject first reverse prompt
- const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
- embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
- }
- }
-
// add it to the context
embd.push_back(id);
@@ -708,11 +702,34 @@ int main(int argc, char ** argv) {
}
}
+ // deal with end of text token in interactive mode
+ if (last_n_tokens.back() == llama_token_eos()) {
+ if (params.interactive) {
+ if (params.antiprompt.size() != 0) {
+ // tokenize and inject first reverse prompt
+ const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+ embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
+ is_antiprompt = true;
+ }
+
+ is_interacting = true;
+ printf("\n");
+ console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
+ fflush(stdout);
+ } else if (params.instruct) {
+ is_interacting = true;
+ }
+ }
+
if (n_past > 0 && is_interacting) {
if (params.instruct) {
printf("\n> ");
}
+ if (params.input_prefix_bos) {
+ embd_inp.push_back(llama_token_bos());
+ }
+
std::string buffer;
if (!params.input_prefix.empty()) {
buffer += params.input_prefix;
@@ -776,13 +793,9 @@ int main(int argc, char ** argv) {
}
// end of text token
- if (!embd.empty() && embd.back() == llama_token_eos()) {
- if (params.instruct) {
- is_interacting = true;
- } else {
- fprintf(stderr, " [end of text]\n");
- break;
- }
+ if (!embd.empty() && embd.back() == llama_token_eos() && !(params.instruct || params.interactive)) {
+ fprintf(stderr, " [end of text]\n");
+ break;
}
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
diff --git a/examples/server/index.html.hpp b/examples/server/index.html.hpp
index 0769170ff..1b21d4d55 100644
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
@@ -1017,129 +1017,137 @@ unsigned char index_html[] = {
0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d,
0x64, 0x20, 0x3d, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x74,
0x65, 0x78, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x23,
- 0x7b, 0x31, 0x2c, 0x36, 0x7d, 0x20, 0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f,
- 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x68, 0x33, 0x3e, 0x24, 0x31,
- 0x3c, 0x2f, 0x68, 0x33, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65,
- 0x28, 0x2f, 0x5c, 0x2a, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c,
- 0x2a, 0x5c, 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x73, 0x74, 0x72,
- 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f,
- 0x6e, 0x67, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f,
- 0x5f, 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x5f, 0x2f, 0x67, 0x2c,
- 0x20, 0x27, 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x31,
- 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, 0x29, 0x0a,
+ 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x26, 0x2f,
+ 0x67, 0x2c, 0x20, 0x27, 0x26, 0x61, 0x6d, 0x70, 0x3b, 0x27, 0x29, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70,
- 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f,
- 0x29, 0x5c, 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e,
- 0x24, 0x31, 0x3c, 0x2f, 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61,
- 0x63, 0x65, 0x28, 0x2f, 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x2f,
- 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, 0x3c, 0x2f,
- 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x3c, 0x2f, 0x67, 0x2c, 0x20, 0x27,
+ 0x26, 0x6c, 0x74, 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28,
+ 0x2f, 0x3e, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x67, 0x74, 0x3b, 0x27,
+ 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72,
+ 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x23, 0x7b, 0x31,
+ 0x2c, 0x36, 0x7d, 0x20, 0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f, 0x67, 0x69,
+ 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x68, 0x33, 0x3e, 0x24, 0x31, 0x3c, 0x2f,
+ 0x68, 0x33, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f,
- 0x60, 0x60, 0x60, 0x2e, 0x2a, 0x3f, 0x5c, 0x6e, 0x28, 0x5b, 0x5c, 0x73,
- 0x5c, 0x53, 0x5d, 0x2a, 0x3f, 0x29, 0x60, 0x60, 0x60, 0x2f, 0x67, 0x2c,
- 0x20, 0x27, 0x3c, 0x70, 0x72, 0x65, 0x3e, 0x3c, 0x63, 0x6f, 0x64, 0x65,
- 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x3c, 0x2f,
- 0x70, 0x72, 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28,
- 0x2f, 0x60, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x60, 0x2f, 0x67, 0x2c, 0x20,
- 0x27, 0x3c, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x63,
- 0x6f, 0x64, 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28,
- 0x2f, 0x5c, 0x6e, 0x2f, 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x62,
- 0x72, 0x20, 0x2f, 0x3e, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+ 0x5c, 0x2a, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c, 0x2a, 0x5c,
+ 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e,
+ 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67,
+ 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5f, 0x5f,
+ 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x5f, 0x2f, 0x67, 0x2c, 0x20, 0x27,
+ 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f,
+ 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61,
+ 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c,
+ 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31,
+ 0x3c, 0x2f, 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65,
+ 0x28, 0x2f, 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x2f, 0x67, 0x2c,
+ 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x65, 0x6d,
+ 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x60, 0x60,
+ 0x60, 0x2e, 0x2a, 0x3f, 0x5c, 0x6e, 0x28, 0x5b, 0x5c, 0x73, 0x5c, 0x53,
+ 0x5d, 0x2a, 0x3f, 0x29, 0x60, 0x60, 0x60, 0x2f, 0x67, 0x2c, 0x20, 0x27,
+ 0x3c, 0x70, 0x72, 0x65, 0x3e, 0x3c, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x24,
+ 0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x3c, 0x2f, 0x70, 0x72,
+ 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x60,
+ 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x60, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c,
+ 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64,
+ 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c,
+ 0x6e, 0x2f, 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x62, 0x72, 0x20,
+ 0x2f, 0x3e, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60,
+ 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x20, 0x64, 0x61, 0x6e, 0x67, 0x65, 0x72,
+ 0x6f, 0x75, 0x73, 0x6c, 0x79, 0x53, 0x65, 0x74, 0x49, 0x6e, 0x6e, 0x65,
+ 0x72, 0x48, 0x54, 0x4d, 0x4c, 0x3d, 0x24, 0x7b, 0x7b, 0x20, 0x5f, 0x5f,
+ 0x68, 0x74, 0x6d, 0x6c, 0x3a, 0x20, 0x6d, 0x64, 0x20, 0x7d, 0x7d, 0x20,
+ 0x2f, 0x3e, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a,
+ 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x4d,
+ 0x6f, 0x64, 0x65, 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69,
+ 0x6f, 0x6e, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x61,
+ 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x6c, 0x6c,
+ 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c,
+ 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d,
- 0x6c, 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x20, 0x64, 0x61, 0x6e, 0x67,
- 0x65, 0x72, 0x6f, 0x75, 0x73, 0x6c, 0x79, 0x53, 0x65, 0x74, 0x49, 0x6e,
- 0x6e, 0x65, 0x72, 0x48, 0x54, 0x4d, 0x4c, 0x3d, 0x24, 0x7b, 0x7b, 0x20,
- 0x5f, 0x5f, 0x68, 0x74, 0x6d, 0x6c, 0x3a, 0x20, 0x6d, 0x64, 0x20, 0x7d,
- 0x7d, 0x20, 0x2f, 0x3e, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
- 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
- 0x20, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61,
- 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, 0x28,
- 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21,
- 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76,
- 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68,
- 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x2f, 0x3e, 0x60,
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74,
- 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
- 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e,
- 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x70, 0x65,
- 0x72, 0x5f, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x5f, 0x6d, 0x73, 0x2e, 0x74,
- 0x6f, 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x29, 0x7d, 0x6d, 0x73, 0x20,
- 0x70, 0x65, 0x72, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x2c, 0x20, 0x24,
- 0x7b, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e,
- 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63,
- 0x74, 0x65, 0x64, 0x5f, 0x70, 0x65, 0x72, 0x5f, 0x73, 0x65, 0x63, 0x6f,
- 0x6e, 0x64, 0x2e, 0x74, 0x6f, 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x32,
- 0x29, 0x7d, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x20, 0x70, 0x65,
- 0x72, 0x20, 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, 0x0a, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e,
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20,
- 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63,
- 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x41, 0x70, 0x70, 0x28, 0x70, 0x72, 0x6f,
- 0x70, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x6c, 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x2f, 0x3e, 0x60, 0x0a, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c,
- 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64,
- 0x69, 0x76, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x61,
- 0x69, 0x6e, 0x65, 0x72, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72,
- 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x3c, 0x68, 0x31, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e,
- 0x63, 0x70, 0x70, 0x3c, 0x2f, 0x68, 0x31, 0x3e, 0x0a, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x68, 0x65, 0x61,
- 0x64, 0x65, 0x72, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6d, 0x61, 0x69, 0x6e, 0x20, 0x69, 0x64,
- 0x3d, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x22, 0x3e, 0x0a,
+ 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73,
+ 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74,
+ 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72,
+ 0x65, 0x64, 0x69, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x70, 0x65, 0x72, 0x5f,
+ 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x5f, 0x6d, 0x73, 0x2e, 0x74, 0x6f, 0x46,
+ 0x69, 0x78, 0x65, 0x64, 0x28, 0x29, 0x7d, 0x6d, 0x73, 0x20, 0x70, 0x65,
+ 0x72, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x2c, 0x20, 0x24, 0x7b, 0x6c,
+ 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61,
+ 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x65,
+ 0x64, 0x5f, 0x70, 0x65, 0x72, 0x5f, 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64,
+ 0x2e, 0x74, 0x6f, 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x32, 0x29, 0x7d,
+ 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x20, 0x70, 0x65, 0x72, 0x20,
+ 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
+ 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69,
+ 0x6f, 0x6e, 0x20, 0x41, 0x70, 0x70, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73,
+ 0x29, 0x20, 0x7b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72,
+ 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76,
+ 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e,
+ 0x65, 0x72, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x3c, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x3e, 0x0a,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x3c, 0x24, 0x7b, 0x63, 0x68, 0x61, 0x74, 0x53, 0x74, 0x61, 0x72, 0x74,
- 0x65, 0x64, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3f, 0x20, 0x43,
- 0x68, 0x61, 0x74, 0x4c, 0x6f, 0x67, 0x20, 0x3a, 0x20, 0x43, 0x6f, 0x6e,
- 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x7d, 0x20, 0x2f, 0x3e, 0x0a,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f,
- 0x6d, 0x61, 0x69, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f,
- 0x6e, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x77, 0x72, 0x69, 0x74, 0x65, 0x22,
- 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65,
- 0x49, 0x6e, 0x70, 0x75, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x65,
- 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x6f, 0x74, 0x65,
- 0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x3c, 0x70, 0x3e, 0x3c, 0x24, 0x7b, 0x4d, 0x6f, 0x64,
- 0x65, 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e,
- 0x49, 0x6e, 0x66, 0x6f, 0x7d, 0x20, 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e,
+ 0x3c, 0x68, 0x31, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70,
+ 0x70, 0x3c, 0x2f, 0x68, 0x31, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x68, 0x65, 0x61, 0x64, 0x65,
+ 0x72, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x3c, 0x6d, 0x61, 0x69, 0x6e, 0x20, 0x69, 0x64, 0x3d, 0x22,
+ 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x22, 0x3e, 0x0a, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24,
+ 0x7b, 0x63, 0x68, 0x61, 0x74, 0x53, 0x74, 0x61, 0x72, 0x74, 0x65, 0x64,
+ 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3f, 0x20, 0x43, 0x68, 0x61,
+ 0x74, 0x4c, 0x6f, 0x67, 0x20, 0x3a, 0x20, 0x43, 0x6f, 0x6e, 0x66, 0x69,
+ 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x6d, 0x61,
+ 0x69, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x3c, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20,
+ 0x69, 0x64, 0x3d, 0x22, 0x77, 0x72, 0x69, 0x74, 0x65, 0x22, 0x3e, 0x0a,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x3c, 0x24, 0x7b, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x49, 0x6e,
+ 0x70, 0x75, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x65, 0x63, 0x74,
+ 0x69, 0x6f, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x3e,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x3c, 0x70, 0x3e, 0x50, 0x6f, 0x77, 0x65, 0x72, 0x65, 0x64, 0x20,
- 0x62, 0x79, 0x20, 0x3c, 0x61, 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22,
- 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x69, 0x74, 0x68,
- 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x67, 0x67, 0x65, 0x72, 0x67,
- 0x61, 0x6e, 0x6f, 0x76, 0x2f, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63,
- 0x70, 0x70, 0x22, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70,
- 0x70, 0x3c, 0x2f, 0x61, 0x3e, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x3c, 0x61,
- 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, 0x74, 0x74, 0x70, 0x73,
- 0x3a, 0x2f, 0x2f, 0x67, 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x22, 0x3e,
- 0x67, 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x3c, 0x2f, 0x61, 0x3e, 0x2e,
- 0x3c, 0x2f, 0x70, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x3e,
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64,
- 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x3b,
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
- 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x68, 0x28, 0x41, 0x70, 0x70,
- 0x29, 0x2c, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e,
- 0x62, 0x6f, 0x64, 0x79, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x3c, 0x2f, 0x73,
- 0x63, 0x72, 0x69, 0x70, 0x74, 0x3e, 0x0a, 0x3c, 0x2f, 0x68, 0x65, 0x61,
- 0x64, 0x3e, 0x0a, 0x0a, 0x3c, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x3c,
- 0x2f, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x0a, 0x3c, 0x2f, 0x68, 0x74,
- 0x6d, 0x6c, 0x3e, 0x0a
+ 0x20, 0x3c, 0x70, 0x3e, 0x3c, 0x24, 0x7b, 0x4d, 0x6f, 0x64, 0x65, 0x6c,
+ 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x6e,
+ 0x66, 0x6f, 0x7d, 0x20, 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, 0x0a, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c,
+ 0x70, 0x3e, 0x50, 0x6f, 0x77, 0x65, 0x72, 0x65, 0x64, 0x20, 0x62, 0x79,
+ 0x20, 0x3c, 0x61, 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, 0x74,
+ 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62,
+ 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x67, 0x67, 0x65, 0x72, 0x67, 0x61, 0x6e,
+ 0x6f, 0x76, 0x2f, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70,
+ 0x22, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x3c,
+ 0x2f, 0x61, 0x3e, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x3c, 0x61, 0x20, 0x68,
+ 0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, 0x2f,
+ 0x2f, 0x67, 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x22, 0x3e, 0x67, 0x67,
+ 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x3c, 0x2f, 0x61, 0x3e, 0x2e, 0x3c, 0x2f,
+ 0x70, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x3e, 0x0a, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76,
+ 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x3b, 0x0a, 0x20,
+ 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
+ 0x6e, 0x64, 0x65, 0x72, 0x28, 0x68, 0x28, 0x41, 0x70, 0x70, 0x29, 0x2c,
+ 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x62, 0x6f,
+ 0x64, 0x79, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x63, 0x72,
+ 0x69, 0x70, 0x74, 0x3e, 0x0a, 0x3c, 0x2f, 0x68, 0x65, 0x61, 0x64, 0x3e,
+ 0x0a, 0x0a, 0x3c, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x3c, 0x2f, 0x62,
+ 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x0a, 0x3c, 0x2f, 0x68, 0x74, 0x6d, 0x6c,
+ 0x3e, 0x0a
};
-unsigned int index_html_len = 13696;
+unsigned int index_html_len = 13790;
diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index f5ed2d427..ea93de4aa 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -390,6 +390,9 @@
// poor mans markdown replacement
const Markdownish = (params) => {
const md = params.text
+ .replace(/&/g, '&')
+ .replace(//g, '>')
.replace(/^#{1,6} (.*)$/gim, '
$1
')
.replace(/\*\*(.*?)\*\*/g, '$1')
.replace(/__(.*?)__/g, '$1')
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 00d77de5d..142b06b83 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -657,6 +657,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
+ fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -782,6 +783,14 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
params.n_gqa = std::stoi(argv[i]);
}
+ else if (arg == "-eps" || arg == "--rms-norm-eps") {
+ if (++i >= argc)
+ {
+ invalid_param = true;
+ break;
+ }
+ params.rms_norm_eps = std::stof(argv[i]);
+ }
else if (arg == "--rope-freq-base")
{
if (++i >= argc)
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 4bbf6b782..54dc2beed 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -16,7 +16,7 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
-static const float rms_norm_eps = 1e-6f;
+static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
struct random_normal_distribution {
std::mt19937 gen;
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 87a166061..d31fc79c1 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1564,12 +1564,14 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
- // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
- const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
-
float sumf_d = 0.0f;
float sumf_m = 0.0f;
+#ifndef GGML_QKK_64
+
+ // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
+ const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
+
const float d = bq4_K->d;
const float dmin = bq4_K->dmin;
@@ -1614,6 +1616,43 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
}
return d*sumf_d - dmin*sumf_m;
+
+#else
+
+ uint16_t aux16[2];
+ const uint8_t * s = (const uint8_t *)aux16;
+
+ const uint16_t * a = (const uint16_t *)bq4_K->scales;
+ aux16[0] = a[0] & 0x0f0f;
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+ const float dall = bq4_K->d[0];
+ const float dmin = bq4_K->d[1];
+
+ const float d8_1 = bq8_1[0].d;
+ const float d8_2 = bq8_1[1].d;
+
+ const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+ const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+ const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+ const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+ const int * q4 = (const int *)bq4_K->qs + iqs;
+ const int v1 = q4[0];
+ const int v2 = q4[4];
+
+ const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
+ const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
+ const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+ const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
+
+ sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
+ sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
+
+ return dall * sumf_d - dmin * sumf_m;
+
+#endif
+
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -1625,6 +1664,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+#ifndef GGML_QKK_64
+
const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
@@ -1680,6 +1721,42 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
}
return d*sumf_d - dmin*sumf_m;
+
+#else
+
+ const int8_t * s = bq5_K->scales;
+
+ const float d = bq5_K->d;
+
+ const float d8_1 = bq8_1[0].d;
+ const float d8_2 = bq8_1[1].d;
+
+ const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+ const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+ const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+ const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+ const int * ql = (const int *)bq5_K->qs + iqs;
+ const int vl1 = ql[0];
+ const int vl2 = ql[4];
+
+ const int step = 4 * iqs; // 0, 4, 8, 12
+ const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
+ const int in = step%8; // 0, 4, 0, 4
+ const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
+
+ const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
+ const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
+ const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
+ const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
+
+ const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
+ + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
+
+ return d * sumf_d;
+
+#endif
+
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
diff --git a/ggml-metal.h b/ggml-metal.h
index 928f1705c..16f1a0caa 100644
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -61,6 +61,13 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
// get data from the device into host memory
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+// try to find operations that can be run concurrently in the graph
+// you should run it again if the topology of your graph changes
+void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+
+// if the graph has been optimized for concurrently dispatch
+bool ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+
// same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
diff --git a/ggml-metal.m b/ggml-metal.m
index c1db3d165..74a6bff40 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -36,6 +36,9 @@ struct ggml_metal_context {
int n_buffers;
struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+ int concur_list[GGML_MAX_NODES];
+ int concur_list_len;
+
// custom kernels
#define GGML_METAL_DECL_KERNEL(name) \
id function_##name; \
@@ -98,6 +101,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->device = MTLCreateSystemDefaultDevice();
ctx->queue = [ctx->device newCommandQueue];
ctx->n_buffers = 0;
+ ctx->concur_list_len = 0;
// determine if we can use MPS
if (MPSSupportsMTLDevice(ctx->device)) {
@@ -217,6 +221,13 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
ctx->n_cb = n_cb;
}
+bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
+ if (ctx->concur_list_len) {
+ return true;
+ }
+ return false;
+}
+
// finds the Metal buffer that contains the tensor data on the GPU device
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
// Metal buffer based on the host memory pointer
@@ -355,11 +366,98 @@ void ggml_metal_get_tensor(
memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
}
+void ggml_metal_graph_find_concurrency(
+ struct ggml_metal_context * ctx,
+ struct ggml_cgraph * gf) {
+ int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
+ int nodes_unused[GGML_MAX_NODES];
+
+ for (int i = 0; i < GGML_MAX_NODES; i++) {ctx->concur_list[i] = 0;}
+ for (int i = 0; i < gf->n_nodes; i++) {nodes_unused[i] = 1;}
+ ctx->concur_list_len = 0;
+
+ int n_left = gf->n_nodes;
+ int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
+ int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos
+
+ while (n_left > 0) {
+ // number of nodes at a layer (that can be issued concurrently)
+ int concurrency = 0;
+ for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
+ if (nodes_unused[i]) {
+ // if the requirements for gf->nodes[i] are satisfied
+ int exe_flag=1;
+ // scan all srcs
+ for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
+ struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
+ if (src_cur) {
+ // if is leaf nodes it's satisfied.
+ if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;}
+
+ // otherwise this src should be the output from previous nodes.
+ int is_found = 0;
+ // scan 2*search_depth back because we inserted barrier.
+ for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
+ if (gf->nodes[ctx->concur_list[j]] == src_cur) {is_found = 1; break;}
+ }
+ if (is_found == 0) {exe_flag = 0; break;}
+ }
+ }
+ if (exe_flag) {
+ // check if nodes[i]'s data will be overwritten by a node before nodes[i].
+ // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
+ int64_t data_start = (int64_t) gf->nodes[i]->data;
+ int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]);
+ for (int j = n_start; j < i; j++) {
+ if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
+ && gf->nodes[j]->op != GGML_OP_VIEW \
+ && gf->nodes[j]->op != GGML_OP_TRANSPOSE \
+ && gf->nodes[j]->op != GGML_OP_PERMUTE) {
+ if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
+ ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
+ continue;
+ } else {
+ exe_flag = 0;
+ }
+ }
+ }
+ }
+ if (exe_flag) {
+ ctx->concur_list[level_pos + concurrency] = i;
+ nodes_unused[i] = 0;
+ concurrency++;
+ ctx->concur_list_len++;
+ }
+ }
+ }
+ n_left -= concurrency;
+ // adding a barrier different layer
+ ctx->concur_list[level_pos + concurrency] = -1;
+ ctx->concur_list_len++;
+ // jump all sorted nodes at nodes_bak
+ while (!nodes_unused[n_start]) {n_start++;}
+ level_pos += concurrency + 1;
+ }
+
+ if (ctx->concur_list_len > GGML_MAX_NODES) {
+ fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
+ }
+}
+
void ggml_metal_graph_compute(
struct ggml_metal_context * ctx,
struct ggml_cgraph * gf) {
metal_printf("%s: evaluating graph\n", __func__);
+ // if there is ctx->concur_list, dispatch concurrently
+ // else fallback to serial dispatch
+ MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
+
+ const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_NODES;
+
+ const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes;
+ edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
+
// create multiple command buffers and enqueue them
// then, we encode the graph into the command buffers in parallel
@@ -378,7 +476,7 @@ void ggml_metal_graph_compute(
dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
- const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
+ const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
dispatch_async(queue, ^{
size_t offs_src0 = 0;
@@ -389,10 +487,21 @@ void ggml_metal_graph_compute(
id encoder = nil;
- const int node_start = (cb_idx + 0) * n_nodes_per_cb;
- const int node_end = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
+ const int node_start = (cb_idx + 0) * n_nodes_per_cb;
+ const int node_end = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
+
+ for (int ind = node_start; ind < node_end; ++ind) {
+ const int i = has_concur ? ctx->concur_list[ind] : ind;
+
+ if (i == -1) {
+ if (encoder == nil) {
+ encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
+ continue;
+ }
+ [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
+ continue;
+ }
- for (int i = node_start; i < node_end; ++i) {
metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
struct ggml_tensor * src0 = gf->nodes[i]->src[0];
@@ -463,7 +572,7 @@ void ggml_metal_graph_compute(
case GGML_OP_ADD:
{
if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoder];
+ encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
if (ggml_nelements(src1) == ne10) {
@@ -484,7 +593,7 @@ void ggml_metal_graph_compute(
case GGML_OP_MUL:
{
if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoder];
+ encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
if (ggml_nelements(src1) == ne10) {
@@ -505,7 +614,7 @@ void ggml_metal_graph_compute(
case GGML_OP_SCALE:
{
if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoder];
+ encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const float scale = *(const float *) src1->data;
@@ -524,7 +633,7 @@ void ggml_metal_graph_compute(
case GGML_UNARY_OP_SILU:
{
if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoder];
+ encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
[encoder setComputePipelineState:ctx->pipeline_silu];
@@ -538,7 +647,7 @@ void ggml_metal_graph_compute(
case GGML_UNARY_OP_RELU:
{
if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoder];
+ encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
[encoder setComputePipelineState:ctx->pipeline_relu];
@@ -552,7 +661,7 @@ void ggml_metal_graph_compute(
case GGML_UNARY_OP_GELU:
{
if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoder];
+ encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
[encoder setComputePipelineState:ctx->pipeline_gelu];
@@ -572,7 +681,7 @@ void ggml_metal_graph_compute(
case GGML_OP_SOFT_MAX:
{
if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoder];
+ encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int nth = 32;
@@ -590,7 +699,7 @@ void ggml_metal_graph_compute(
case GGML_OP_DIAG_MASK_INF:
{
if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoder];
+ encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int n_past = ((int32_t *)(dst->op_params))[0];
@@ -653,7 +762,7 @@ void ggml_metal_graph_compute(
}
} else {
if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoder];
+ encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
int nth0 = 32;
@@ -780,7 +889,7 @@ void ggml_metal_graph_compute(
case GGML_OP_GET_ROWS:
{
if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoder];
+ encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
switch (src0->type) {
@@ -809,7 +918,7 @@ void ggml_metal_graph_compute(
case GGML_OP_RMS_NORM:
{
if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoder];
+ encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
float eps;
@@ -832,7 +941,7 @@ void ggml_metal_graph_compute(
case GGML_OP_NORM:
{
if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoder];
+ encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const float eps = 1e-5f;
@@ -854,7 +963,7 @@ void ggml_metal_graph_compute(
case GGML_OP_ALIBI:
{
if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoder];
+ encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
GGML_ASSERT((src0t == GGML_TYPE_F32));
@@ -897,7 +1006,7 @@ void ggml_metal_graph_compute(
case GGML_OP_ROPE:
{
if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoder];
+ encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int n_past = ((int32_t *) dst->op_params)[0];
@@ -941,7 +1050,7 @@ void ggml_metal_graph_compute(
case GGML_OP_CONT:
{
if (encoder == nil) {
- encoder = [command_buffer computeCommandEncoder];
+ encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int nth = 32;
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 987376d56..696b33ce7 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -387,87 +387,90 @@ kernel void kernel_rms_norm(
}
}
-// function for calculate inner product between a q4_0 block and 32 floats (yl), sumy is SUM(yl[i])
-float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl) {
+// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
+// il indicates where the q4 quants begin (0 or QK4_0/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) {
float d = qb_curr->d;
- float4 acc = 0.f;
- device uint16_t * qs = ((device uint16_t *)qb_curr + 1);
- for (int i = 0; i < 16; i+=2) {
- acc[0] += yl[i] * (qs[i / 2] & 0x000F);
- acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
- acc[2] += yl[i + 1] * (qs[i / 2] & 0x0F00);
- acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
+ float2 acc = 0.f;
+ device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2);
+ for (int i = 0; i < 8; i+=2) {
+ acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
+ + yl[i + 1] * (qs[i / 2] & 0x0F00);
+ acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
+ + yl[i + 9] * (qs[i / 2] & 0xF000);
}
- return d * (sumy * -8.f + acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f);
+ return d * (sumy * -8.f + acc[0] + acc[1]);
}
-// function for calculate inner product between a q4_1 block and 32 floats (yl), sumy is SUM(yl[i])
-float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl) {
+// function for calculate inner product between half a q4_1 block and 16 floats (yl), sumy is SUM(yl[i])
+// il indicates where the q4 quants begin (0 or QK4_0/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) {
float d = qb_curr->d;
float m = qb_curr->m;
- float4 acc = 0.f;
- device uint16_t * qs = ((device uint16_t *)qb_curr + 2);
- for (int i = 0; i < 16; i+=2) {
- acc[0] += yl[i] * (qs[i / 2] & 0x000F);
- acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
- acc[2] += yl[i + 1] * (qs[i / 2] & 0x0F00);
- acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
+ device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
+ float2 acc = 0.f;
+ for (int i = 0; i < 8; i+=2) {
+ acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
+ + yl[i + 1] * (qs[i / 2] & 0x0F00);
+ acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
+ + yl[i + 9] * (qs[i / 2] & 0xF000);
}
- return d * (acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f) + sumy * m;
+ return d * (acc[0] + acc[1]) + sumy * m;
}
// putting them in the kernel cause a significant performance penalty
#define N_DST 4 // each SIMD group works on 4 rows
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
-template
+//Note: This is a template, but strictly speaking it only applies to
+// quantizations where the block size is 32. It also does not
+// giard against the number of rows not being divisible by
+// N_DST, so this is another explicit assumption of the implementation.
+template
void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01,
uint2 tgpig, uint tiisg, uint sgitg) {
const int nb = ne00/QK4_0;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
- device const block_q_type * x = (device const block_q_type *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
+ const int first_row = (r0 * nsg + sgitg) * nr;
+ device const block_q_type * x = (device const block_q_type *) src0 + first_row * nb;
device const float * y = (device const float *) src1 + r1*ne10;
- float4 y_curr[8]; // src1 vector cache
- float sumf[N_DST]={0.f}, all_sum;
- thread float * yl=(thread float *)y_curr;
+ float yl[16]; // src1 vector cache
+ float sumf[nr]={0.f};
- // each thread in a SIMD group deals with 1 block.
- for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
+ const int ix = tiisg/2;
+ const int il = 8*(tiisg%2);
+
+ device const float * yb = y + ix * QK4_0 + il;
+
+ // each thread in a SIMD group deals with half a block.
+ for (int ib = ix; ib < nb; ib += nw/2) {
float sumy = 0;
- for (int i = 0; i < QK4_0 / 4; i++) {
- y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0)) + i);
- sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
+ for (int i = 0; i < 8; i += 2) {
+ sumy += yb[i] + yb[i+1];
+ yl[i+0] = yb[i+ 0];
+ yl[i+1] = yb[i+ 1]/256.f;
+ sumy += yb[i+16] + yb[i+17];
+ yl[i+8] = yb[i+16]/16.f;
+ yl[i+9] = yb[i+17]/4096.f;
}
- for (int row = 0; row < N_DST; row++) {
- sumf[row] += block_q_n_dot_y(x+(tiisg + row * nb + column * N_SIMDWIDTH), sumy, yl);
+ for (int row = 0; row < nr; row++) {
+ sumf[row] += block_q_n_dot_y(x+ib+row*nb, sumy, yl, il);
}
+
+ yb += QK4_0 * 16;
}
- // from now loads two rows every time and 16 blocks per row
- int ir = tiisg / (N_SIMDWIDTH / 2);
- int ib = tiisg % (N_SIMDWIDTH / 2);
- for (int ind = 0; ind < (nb % N_SIMDWIDTH + N_SIMDWIDTH / 2 - 1)/(N_SIMDWIDTH / 2); ind++) {
- int nb_start = (nb / N_SIMDWIDTH) * N_SIMDWIDTH + ind * (N_SIMDWIDTH / 2); //where the left blocks start
- float sumy = 0;
- for (int i = 0; i < QK4_0 / 4; i++) {
- y_curr[i] = *((device float4 *)(y + (nb_start + ib) * QK4_0) + i);
- sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
- }
-
- for (int row = 0; row < N_DST; row+=2) {
- if (nb_start + ib < nb) {
- sumf[row + ir] += block_q_n_dot_y(x + (nb_start + ib + (row + ir) * nb), sumy, yl);
- }
- }
- }
-
- for (int row = 0; row < N_DST; ++row) {
- all_sum = simd_sum(sumf[row]);
- if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
- dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
+ for (int row = 0; row < nr; ++row) {
+ const float tot = simd_sum(sumf[row]);
+ if (tiisg == 0 && first_row + row < ne01) {
+ dst[r1*ne0 + first_row + row] = tot;
}
}
}
@@ -483,7 +486,7 @@ kernel void kernel_mul_mat_q4_0_f32(
uint2 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
- mul_vec_q_n_f32(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
+ mul_vec_q_n_f32(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
}
kernel void kernel_mul_mat_q4_1_f32(
@@ -497,7 +500,7 @@ kernel void kernel_mul_mat_q4_1_f32(
uint2 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
- mul_vec_q_n_f32(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
+ mul_vec_q_n_f32(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
}
kernel void kernel_mul_mat_f16_f32(
diff --git a/ggml.c b/ggml.c
index 11226c834..b77f99267 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4071,8 +4071,8 @@ bool ggml_is_numa(void) {
////////////////////////////////////////////////////////////////////////////////
void ggml_print_object(const struct ggml_object * obj) {
- GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n",
- obj->offs, obj->size, (const void *) obj->next);
+ GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
+ obj->type, obj->offs, obj->size, (const void *) obj->next);
}
void ggml_print_objects(const struct ggml_context * ctx) {
@@ -4212,7 +4212,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
}
size_t ggml_tensor_overhead(void) {
- return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+ return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
}
bool ggml_is_transposed(const struct ggml_tensor * tensor) {
@@ -4229,6 +4229,15 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
+static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+ return
+ tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+ tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
+ tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
bool ggml_is_permuted(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
@@ -4374,7 +4383,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
return NULL;
}
- const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1);
+ const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
*ctx = (struct ggml_context) {
/*.mem_size =*/ mem_size,
@@ -4463,12 +4472,14 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
struct ggml_object * obj = ctx->objects_begin;
while (obj != NULL) {
- struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
+ if (obj->type == GGML_OBJECT_TENSOR) {
+ struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
- const size_t size = ggml_nbytes(tensor);
+ const size_t size = ggml_nbytes(tensor);
- if (max_size < size) {
- max_size = size;
+ if (max_size < size) {
+ max_size = size;
+ }
}
obj = obj->next;
@@ -4500,12 +4511,7 @@ static void ggml_scratch_load(struct ggml_context * ctx) {
////////////////////////////////////////////////////////////////////////////////
-static struct ggml_tensor * ggml_new_tensor_impl(
- struct ggml_context * ctx,
- enum ggml_type type,
- int n_dims,
- const int64_t* ne,
- void* data) {
+static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
// always insert objects at the end of the context's memory pool
struct ggml_object * obj_cur = ctx->objects_end;
@@ -4513,63 +4519,28 @@ static struct ggml_tensor * ggml_new_tensor_impl(
const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
const size_t cur_end = cur_offs + cur_size;
- size_t size_needed = 0;
-
- if (data == NULL && !ctx->no_alloc) {
- size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
- for (int i = 1; i < n_dims; i++) {
- size_needed *= ne[i];
- }
- // align to GGML_MEM_ALIGN
- size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;
- }
+ // align to GGML_MEM_ALIGN
+ size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
char * const mem_buffer = ctx->mem_buffer;
struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
- if (ctx->scratch.data == NULL || data != NULL) {
- size_needed += GGML_TENSOR_SIZE;
-
- if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
- GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
- __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
- assert(false);
- return NULL;
- }
-
- *obj_new = (struct ggml_object) {
- .offs = cur_end + GGML_OBJECT_SIZE,
- .size = size_needed,
- .next = NULL,
- };
- } else {
- if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
- GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
- __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
- assert(false);
- return NULL;
- }
-
- if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
- GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
- __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
- assert(false);
- return NULL;
- }
-
- data = (char * const) ctx->scratch.data + ctx->scratch.offs;
-
- *obj_new = (struct ggml_object) {
- .offs = cur_end + GGML_OBJECT_SIZE,
- .size = GGML_TENSOR_SIZE,
- .next = NULL,
- };
-
- //printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed);
-
- ctx->scratch.offs += size_needed;
+ if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
+ GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
+ __func__, cur_end + size_needed, ctx->mem_size);
+ assert(false);
+ return NULL;
}
+ *obj_new = (struct ggml_object) {
+ .offs = cur_end + GGML_OBJECT_SIZE,
+ .size = size_needed,
+ .next = NULL,
+ .type = type,
+ };
+
+ ggml_assert_aligned(mem_buffer + obj_new->offs);
+
if (obj_cur != NULL) {
obj_cur->next = obj_new;
} else {
@@ -4581,9 +4552,46 @@ static struct ggml_tensor * ggml_new_tensor_impl(
//printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
- struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs);
+ return obj_new;
+}
- ggml_assert_aligned(result);
+static struct ggml_tensor * ggml_new_tensor_impl(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int n_dims,
+ const int64_t* ne,
+ void* data) {
+
+ size_t data_size = 0;
+
+ if (data == NULL && !ctx->no_alloc) {
+ data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
+ for (int i = 1; i < n_dims; i++) {
+ data_size *= ne[i];
+ }
+ }
+
+ if (ctx->scratch.data != NULL && data == NULL) {
+ // allocate tensor data in the scratch buffer
+ if (ctx->scratch.offs + data_size > ctx->scratch.size) {
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+ __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
+ assert(false);
+ return NULL;
+ }
+
+ data = (char * const) ctx->scratch.data + ctx->scratch.offs;
+
+ ctx->scratch.offs += data_size;
+
+ data_size = 0;
+ }
+
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
+
+ // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
+
+ struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
*result = (struct ggml_tensor) {
/*.type =*/ type,
@@ -4974,11 +4982,6 @@ enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
}
-static void ggml_set_unary_op(struct ggml_tensor * tensor, enum ggml_unary_op op) {
- GGML_ASSERT(tensor->op = GGML_OP_UNARY);
- ggml_set_op_params_i32(tensor, 0, (int32_t) op);
-}
-
const char * ggml_get_name(const struct ggml_tensor * tensor) {
return tensor->name;
}
@@ -5017,9 +5020,11 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
char * const mem_buffer = ctx->mem_buffer;
while (obj != NULL) {
- struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
- if (strcmp(cur->name, name) == 0) {
- return cur;
+ if (obj->type == GGML_OBJECT_TENSOR) {
+ struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
+ if (strcmp(cur->name, name) == 0) {
+ return cur;
+ }
}
obj = obj->next;
@@ -7021,14 +7026,16 @@ struct ggml_tensor * ggml_flash_attn(
}
//struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, q->ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne);
+
+ int32_t t = masked ? 1 : 0;
+ ggml_set_op_params(result, &t, sizeof(t));
result->op = GGML_OP_FLASH_ATTN;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = q;
result->src[1] = k;
result->src[2] = v;
- result->src[3] = ggml_new_i32(ctx, masked ? 1 : 0);
return result;
}
@@ -7052,7 +7059,7 @@ struct ggml_tensor * ggml_flash_ff(
}
//struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, a->ne);
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne);
result->op = GGML_OP_FLASH_FF;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7118,13 +7125,15 @@ struct ggml_tensor * ggml_flash_attn_back(
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+ int32_t masked_i = masked ? 1 : 0;
+ ggml_set_op_params(result, &masked_i, sizeof(masked_i));
+
result->op = GGML_OP_FLASH_ATTN_BACK;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = q;
result->src[1] = k;
result->src[2] = v;
result->src[3] = d;
- result->src[4] = ggml_new_i32(ctx, masked ? 1 : 0);
return result;
}
@@ -7212,7 +7221,7 @@ static struct ggml_tensor * ggml_unary_impl(
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
- ggml_set_unary_op(result, op);
+ ggml_set_op_params_i32(result, 0, (int32_t) op);
result->op = GGML_OP_UNARY;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -9814,8 +9823,8 @@ static void ggml_compute_forward_gelu_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
- GGML_ASSERT(ggml_is_contiguous(src0));
- GGML_ASSERT(ggml_is_contiguous(dst));
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -9873,8 +9882,8 @@ static void ggml_compute_forward_gelu_quick_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
- GGML_ASSERT(ggml_is_contiguous(src0));
- GGML_ASSERT(ggml_is_contiguous(dst));
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -9932,8 +9941,8 @@ static void ggml_compute_forward_silu_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
- GGML_ASSERT(ggml_is_contiguous(src0));
- GGML_ASSERT(ggml_is_contiguous(dst));
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -9992,9 +10001,9 @@ static void ggml_compute_forward_silu_back_f32(
const struct ggml_tensor * src0,
const struct ggml_tensor * grad,
struct ggml_tensor * dst) {
- GGML_ASSERT(ggml_is_contiguous(grad));
- GGML_ASSERT(ggml_is_contiguous(src0));
- GGML_ASSERT(ggml_is_contiguous(dst));
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
+ GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_are_same_shape(src0, grad));
@@ -14764,7 +14773,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
} break;
case GGML_OP_FLASH_ATTN:
{
- const int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
+ const int32_t t = ggml_get_op_params_i32(tensor, 0);
GGML_ASSERT(t == 0 || t == 1);
const bool masked = t != 0;
ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
@@ -14775,7 +14784,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
} break;
case GGML_OP_FLASH_ATTN_BACK:
{
- int32_t t = ggml_get_i32_1d(tensor->src[4], 0);
+ int32_t t = ggml_get_op_params_i32(tensor, 0);
GGML_ASSERT(t == 0 || t == 1);
bool masked = t != 0;
ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
@@ -15393,7 +15402,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
struct ggml_tensor * flash_grad = NULL;
if (src0->grad || src1->grad || tensor->src[2]->grad) {
- int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
+ int32_t t = ggml_get_op_params_i32(tensor, 0);
GGML_ASSERT(t == 0 || t == 1);
bool masked = t != 0;
flash_grad =
@@ -15665,6 +15674,34 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
}
}
+static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small");
+
+static size_t hash(void * p) {
+ return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
+}
+
+static bool hash_insert(void * hash_table[], void * p) {
+ size_t h = hash(p);
+
+ // linear probing
+ size_t i = h;
+ while (hash_table[i] != NULL && hash_table[i] != p) {
+ i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
+ if (i == h) {
+ // hash table is full
+ GGML_ASSERT(false);
+ }
+ }
+
+ if (hash_table[i] == p) {
+ return true;
+ }
+
+ // insert
+ hash_table[i] = p;
+ return false;
+}
+
static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
if (node->grad == NULL) {
// this usually happens when we generate intermediate nodes from constants in the backward pass
@@ -15675,16 +15712,8 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
}
// check if already visited
- for (int i = 0; i < cgraph->n_nodes; i++) {
- if (cgraph->nodes[i] == node) {
- return;
- }
- }
-
- for (int i = 0; i < cgraph->n_leafs; i++) {
- if (cgraph->leafs[i] == node) {
- return;
- }
+ if (hash_insert(cgraph->visited_hash_table, node)) {
+ return;
}
for (int i = 0; i < GGML_MAX_SRC; ++i) {
@@ -15747,6 +15776,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
/*.nodes =*/ { NULL },
/*.grads =*/ { NULL },
/*.leafs =*/ { NULL },
+ /*.hash_table =*/ { NULL },
/*.perf_runs =*/ 0,
/*.perf_cycles =*/ 0,
/*.perf_time_us =*/ 0,
@@ -15788,13 +15818,42 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
if (node->is_param) {
GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
- ggml_build_forward_impl(&result, node->grad, true);
+ ggml_build_forward_expand(&result, node->grad);
}
}
return result;
}
+struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
+ struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
+
+ *cgraph = (struct ggml_cgraph) {
+ /*.n_nodes =*/ 0,
+ /*.n_leafs =*/ 0,
+ /*.nodes =*/ { NULL },
+ /*.grads =*/ { NULL },
+ /*.leafs =*/ { NULL },
+ /*.hash_table =*/ { NULL },
+ /*.perf_runs =*/ 0,
+ /*.perf_cycles =*/ 0,
+ /*.perf_time_us =*/ 0,
+ };
+
+ return cgraph;
+}
+
+struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+ struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
+ ggml_build_forward_impl(cgraph, tensor, false);
+ return cgraph;
+}
+
+size_t ggml_graph_overhead(void) {
+ return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
+}
+
//
// thread data
//
@@ -16510,10 +16569,9 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
- struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
- GGML_ASSERT(buf);
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
- cplan.work_data = buf->data;
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
ggml_graph_compute(cgraph, &cplan);
}
diff --git a/ggml.h b/ggml.h
index 1870b62e8..9919cce7c 100644
--- a/ggml.h
+++ b/ggml.h
@@ -208,6 +208,7 @@
#define GGML_UNUSED(x) (void)(x)
+#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
#define GGML_ASSERT(x) \
do { \
@@ -396,6 +397,12 @@ extern "C" {
GGML_UNARY_OP_SILU,
};
+ enum ggml_object_type {
+ GGML_OBJECT_TENSOR,
+ GGML_OBJECT_GRAPH,
+ GGML_OBJECT_WORK_BUFFER
+ };
+
// ggml object
struct ggml_object {
size_t offs;
@@ -403,7 +410,9 @@ extern "C" {
struct ggml_object * next;
- char padding[8];
+ enum ggml_object_type type;
+
+ char padding[4];
};
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
@@ -424,7 +433,7 @@ extern "C" {
enum ggml_op op;
// op params - allocated as int32_t for alignment
- int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)];
+ int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
bool is_param;
@@ -442,7 +451,7 @@ extern "C" {
void * extra; // extra things e.g. for ggml-cuda.cu
- char padding[8];
+ char padding[4];
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -463,6 +472,11 @@ extern "C" {
void * abort_callback_data;
};
+ // next prime after GGML_MAX_NODES
+ // #define GGML_GRAPH_HASHTABLE_SIZE 4099
+ // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
+ #define GGML_GRAPH_HASHTABLE_SIZE 8273
+
// computation graph
struct ggml_cgraph {
int n_nodes;
@@ -472,12 +486,16 @@ extern "C" {
struct ggml_tensor * grads[GGML_MAX_NODES];
struct ggml_tensor * leafs[GGML_MAX_NODES];
+ void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+
// performance
int perf_runs;
int64_t perf_cycles;
int64_t perf_time_us;
};
+ static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
+
// scratch buffer
struct ggml_scratch {
size_t offs;
@@ -1384,11 +1402,17 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * tensor);
+
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
+ // graph allocation in a context
+ GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx);
+ GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);
+ GGML_API size_t ggml_graph_overhead(void);
+
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
diff --git a/k_quants.c b/k_quants.c
index e790abf88..e792c57ac 100644
--- a/k_quants.c
+++ b/k_quants.c
@@ -1666,6 +1666,62 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
*s = hsum_float_8(acc) + summs;
+#elif defined __AVX__
+
+ const __m128i m3 = _mm_set1_epi8(3);
+
+ __m256 acc = _mm256_setzero_ps();
+
+ uint32_t ud, um;
+ const uint8_t * restrict db = (const uint8_t *)&ud;
+ const uint8_t * restrict mb = (const uint8_t *)&um;
+
+ float summs = 0;
+
+ // TODO: optimize this
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+ const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+
+ const uint8_t * restrict q2 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
+ ud = (sc[0] >> 0) & 0x0f0f0f0f;
+ um = (sc[0] >> 4) & 0x0f0f0f0f;
+
+ int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3];
+ summs += dmin * smin;
+
+ const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
+ const __m128i q2_0 = _mm_and_si128(q2bits, m3);
+ const __m128i q2_1 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
+ const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
+ const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
+
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+ const __m128i p0 = _mm_maddubs_epi16(q2_0, _mm256_extractf128_si256(q8_0, 0));
+ const __m128i p1 = _mm_maddubs_epi16(q2_1, _mm256_extractf128_si256(q8_0, 1));
+ const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
+ const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
+
+ const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+ const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+ const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+ const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
+
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2)), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3)), acc);
+ }
+
+ *s = hsum_float_8(acc) + summs;
+
#else
float sumf = 0;
@@ -2295,6 +2351,93 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
*s = hsum_float_8(acc);
+#elif defined __AVX__
+
+ const __m128i m3 = _mm_set1_epi8(3);
+ const __m128i m1 = _mm_set1_epi8(1);
+
+ __m256 acc = _mm256_setzero_ps();
+
+ uint64_t aux64;
+
+ uint16_t aux16[2];
+ const int8_t * aux8 = (const int8_t *)aux16;
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+
+ const uint8_t * restrict q3 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const uint16_t a = *(const uint16_t *)x[i].scales;
+ aux16[0] = a & 0x0f0f;
+ aux16[1] = (a >> 4) & 0x0f0f;
+
+ const __m128i scale_0 = _mm_set1_epi16(aux8[0] - 8);
+ const __m128i scale_1 = _mm_set1_epi16(aux8[2] - 8);
+ const __m128i scale_2 = _mm_set1_epi16(aux8[1] - 8);
+ const __m128i scale_3 = _mm_set1_epi16(aux8[3] - 8);
+
+ memcpy(&aux64, x[i].hmask, 8);
+
+ __m128i q3h_0 = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
+ __m128i q3h_1 = _mm_srli_epi16(q3h_0, 2);
+ __m128i q3h_2 = _mm_srli_epi16(q3h_0, 4);
+ __m128i q3h_3 = _mm_srli_epi16(q3h_0, 6);
+ q3h_0 = _mm_slli_epi16(_mm_andnot_si128(q3h_0, m1), 2);
+ q3h_1 = _mm_slli_epi16(_mm_andnot_si128(q3h_1, m1), 2);
+ q3h_2 = _mm_slli_epi16(_mm_andnot_si128(q3h_2, m1), 2);
+ q3h_3 = _mm_slli_epi16(_mm_andnot_si128(q3h_3, m1), 2);
+
+ // load low 2 bits
+ const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
+
+ // prepare low and high bits
+ const __m128i q3l_0 = _mm_and_si128(q3bits, m3);
+ const __m128i q3l_1 = _mm_and_si128(_mm_srli_epi16(q3bits, 2), m3);
+ const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits, 4), m3);
+ const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits, 6), m3);
+
+ // load Q8 quants
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+ // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm_maddubs_epi16,
+ // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
+ // and 2 if the high bit was set)
+ const __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, _mm256_extractf128_si256(q8_0, 0));
+ const __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, _mm256_extractf128_si256(q8_0, 1));
+ const __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, _mm256_extractf128_si256(q8_1, 0));
+ const __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, _mm256_extractf128_si256(q8_1, 1));
+
+ __m128i p16_0 = _mm_maddubs_epi16(q3l_0, _mm256_extractf128_si256(q8_0, 0));
+ __m128i p16_1 = _mm_maddubs_epi16(q3l_1, _mm256_extractf128_si256(q8_0, 1));
+ __m128i p16_2 = _mm_maddubs_epi16(q3l_2, _mm256_extractf128_si256(q8_1, 0));
+ __m128i p16_3 = _mm_maddubs_epi16(q3l_3, _mm256_extractf128_si256(q8_1, 1));
+
+ p16_0 = _mm_sub_epi16(p16_0, q8s_0);
+ p16_1 = _mm_sub_epi16(p16_1, q8s_1);
+ p16_2 = _mm_sub_epi16(p16_2, q8s_2);
+ p16_3 = _mm_sub_epi16(p16_3, q8s_3);
+
+ // multiply with scales
+ p16_0 = _mm_madd_epi16(scale_0, p16_0);
+ p16_1 = _mm_madd_epi16(scale_1, p16_1);
+ p16_2 = _mm_madd_epi16(scale_2, p16_2);
+ p16_3 = _mm_madd_epi16(scale_3, p16_3);
+
+ p16_0 = _mm_add_epi32(p16_0, p16_2);
+ p16_1 = _mm_add_epi32(p16_1, p16_3);
+ __m256i p16 = _mm256_set_m128i(p16_1, p16_0);
+
+ // multiply with block scale and accumulate
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
+
+ }
+
+ *s = hsum_float_8(acc);
+
#else
int8_t aux8[QK_K];
@@ -2781,6 +2924,60 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
*s = hsum_float_8(acc) - summs;
+#elif defined __AVX__
+
+ const __m128i m4 = _mm_set1_epi8(0xF);
+
+ __m256 acc = _mm256_setzero_ps();
+
+ float summs = 0;
+
+ uint16_t aux16[2];
+ const uint8_t * scales = (const uint8_t *)aux16;
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = ggml_fp16_to_fp32(x[i].d[0]) * y[i].d;
+ const float m = ggml_fp16_to_fp32(x[i].d[1]) * y[i].d;
+ const __m256 vd = _mm256_set1_ps(d);
+
+ const uint16_t * a = (const uint16_t *)x[i].scales;
+ aux16[0] = a[0] & 0x0f0f;
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+ summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
+
+ const uint8_t * restrict q4 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4);
+ const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0);
+ const __m128i q4bits_1 = _mm256_extractf128_si256(q4bits, 1);
+ const __m128i q4_0 = _mm_and_si128(q4bits_0, m4);
+ const __m128i q4_1 = _mm_and_si128(q4bits_1, m4);
+ const __m128i q4_2 = _mm_and_si128(_mm_srli_epi16(q4bits_0, 4), m4);
+ const __m128i q4_3 = _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4);
+
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+ const __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
+ const __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
+ const __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
+ const __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
+
+ const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
+ const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
+
+ const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
+ const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
+
+ }
+
+ *s = hsum_float_8(acc) - summs;
+
#else
uint8_t aux8[QK_K];
@@ -3295,6 +3492,63 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
*s = hsum_float_8(acc);
+#elif defined __AVX__
+
+ const __m128i m4 = _mm_set1_epi8(0xF);
+ const __m128i mone = _mm_set1_epi8(1);
+
+ __m256 acc = _mm256_setzero_ps();
+
+ for (int i = 0; i < nb; ++i) {
+
+ const uint8_t * restrict q5 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+
+ const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
+
+ const __m128i scale_0 = _mm_set1_epi16(x[i].scales[0]);
+ const __m128i scale_1 = _mm_set1_epi16(x[i].scales[1]);
+ const __m128i scale_2 = _mm_set1_epi16(x[i].scales[2]);
+ const __m128i scale_3 = _mm_set1_epi16(x[i].scales[3]);
+
+ int64_t aux64;
+ memcpy(&aux64, x[i].qh, 8);
+ const __m128i haux128_0 = _mm_set_epi64x(aux64 >> 1, aux64);
+ const __m128i haux128_1 = _mm_srli_epi16(haux128_0, 2);
+
+ const __m128i q5h_0 = _mm_slli_epi16(_mm_andnot_si128(haux128_0, mone), 4);
+ const __m128i q5h_1 = _mm_slli_epi16(_mm_andnot_si128(haux128_1, mone), 4);
+ const __m128i q5h_2 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_0, 4), mone), 4);
+ const __m128i q5h_3 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_1, 4), mone), 4);
+
+ const __m128i q5l_0 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 0), m4);
+ const __m128i q5l_1 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 1), m4);
+ const __m128i q5l_2 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 0), 4), m4);
+ const __m128i q5l_3 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 1), 4), m4);
+
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+ const __m128i p16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5l_0, _mm256_extractf128_si256(q8_0, 0)));
+ const __m128i p16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5l_1, _mm256_extractf128_si256(q8_0, 1)));
+ const __m128i p16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5l_2, _mm256_extractf128_si256(q8_1, 0)));
+ const __m128i p16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5l_3, _mm256_extractf128_si256(q8_1, 1)));
+ const __m128i s16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5h_0, _mm256_extractf128_si256(q8_0, 0)));
+ const __m128i s16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5h_1, _mm256_extractf128_si256(q8_0, 1)));
+ const __m128i s16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5h_2, _mm256_extractf128_si256(q8_1, 0)));
+ const __m128i s16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5h_3, _mm256_extractf128_si256(q8_1, 1)));
+
+ const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
+ const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
+
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
+
+ }
+
+ *s = hsum_float_8(acc);
+
#else
int8_t aux8[QK_K];
@@ -3857,6 +4111,77 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
*s = hsum_float_8(acc);
+#elif defined __AVX__
+
+ const __m128i m4 = _mm_set1_epi8(0xF);
+ const __m128i m2 = _mm_set1_epi8(3);
+ const __m128i m32s = _mm_set1_epi8(32);
+
+ __m256 acc = _mm256_setzero_ps();
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+
+ const uint8_t * restrict q4 = x[i].ql;
+ const uint8_t * restrict qh = x[i].qh;
+ const int8_t * restrict q8 = y[i].qs;
+
+ const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]);
+ const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]);
+ const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]);
+ const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]);
+
+ __m128i sumi_0 = _mm_setzero_si128();
+ __m128i sumi_1 = _mm_setzero_si128();
+
+ const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1);
+ const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3);
+
+ const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
+ const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
+
+ const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH, m2), 4);
+ const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 2), m2), 4);
+ const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 4), m2), 4);
+ const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 6), m2), 4);
+
+ const __m128i q4_0 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 0), m4), q4h_0);
+ const __m128i q4_1 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 1), m4), q4h_1);
+ const __m128i q4_2 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 0), 4), m4), q4h_2);
+ const __m128i q4_3 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 1), 4), m4), q4h_3);
+
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
+
+ __m128i q8s_0 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 0));
+ __m128i q8s_1 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 1));
+ __m128i q8s_2 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 0));
+ __m128i q8s_3 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 1));
+
+ __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
+ __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
+ __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
+ __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
+
+ p16_0 = _mm_sub_epi16(p16_0, q8s_0);
+ p16_1 = _mm_sub_epi16(p16_1, q8s_1);
+ p16_2 = _mm_sub_epi16(p16_2, q8s_2);
+ p16_3 = _mm_sub_epi16(p16_3, q8s_3);
+
+ p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
+ p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
+ p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
+ p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
+
+ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
+ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
+
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
+ }
+
+ *s = hsum_float_8(acc);
+
#else
int8_t aux8[QK_K];
diff --git a/llama.cpp b/llama.cpp
index b42b41008..024af99a5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -186,7 +186,7 @@ struct llama_hparams {
// LLaMAv2
// TODO: load from model data hparams
float f_ffn_mult = 1.0f;
- float f_rms_norm_eps = 1e-6f;
+ float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
float rope_freq_base = 10000.0f;
float rope_freq_scale = 1.0f;
@@ -870,7 +870,7 @@ struct llama_context_params llama_context_default_params() {
/*.n_ctx =*/ 512,
/*.n_batch =*/ 512,
/*.n_gqa =*/ 1,
- /*.rms_norm_eps =*/ 1e-6f,
+ /*.rms_norm_eps =*/ LLAMA_DEFAULT_RMS_EPS,
/*.gpu_layers =*/ 0,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr,
@@ -1424,7 +1424,7 @@ static bool llama_eval_internal(
struct ggml_context * ctx0 = ggml_init(params);
- ggml_cgraph gf = {};
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
@@ -1541,8 +1541,8 @@ static bool llama_eval_internal(
ggml_set_name(v, "v");
// important: storing RoPE-ed version of K in the KV cache!
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
}
struct ggml_tensor * Q =
@@ -1712,16 +1712,21 @@ static bool llama_eval_internal(
//cur = ggml_soft_max_inplace(ctx0, cur);
// run the computation
- ggml_build_forward_expand(&gf, cur);
+ ggml_build_forward_expand(gf, cur);
+
+ // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
#if GGML_USE_MPI
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
#endif
#ifdef GGML_USE_METAL
if (lctx.ctx_metal && N == 1) {
+ if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
+ ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
+ }
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
- ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+ ggml_metal_graph_compute(lctx.ctx_metal, gf);
ggml_metal_get_tensor (lctx.ctx_metal, cur);
} else {
// IMPORTANT:
@@ -1740,34 +1745,34 @@ static bool llama_eval_internal(
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
}
- ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+ ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
}
#else
- ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+ ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
#endif
#if GGML_USE_MPI
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
#endif
// update kv token count
lctx.kv_self.n = n_past + N;
- struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
if (cgraph_fname) {
- ggml_graph_export(&gf, cgraph_fname);
+ ggml_graph_export(gf, cgraph_fname);
}
#ifdef GGML_PERF
// print timing information per ggml operation (for debugging purposes)
// requires GGML_PERF to be defined
- ggml_graph_print(&gf);
+ ggml_graph_print(gf);
#endif
// plot the computation graph in dot format (for debugging purposes)
//if (n_past%100 == 0) {
- // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
+ // ggml_graph_dump_dot(gf, NULL, "llama.dot");
//}
// extract logits
@@ -3172,7 +3177,7 @@ struct llama_context * llama_new_context_with_model(
ctx->embedding.resize(hparams.n_embd);
}
- ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+ ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
diff --git a/llama.h b/llama.h
index 843b0bf5f..df46f9b9c 100644
--- a/llama.h
+++ b/llama.h
@@ -53,6 +53,10 @@
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif
+#ifndef LLAMA_DEFAULT_RMS_EPS
+#define LLAMA_DEFAULT_RMS_EPS 5e-6f
+#endif
+
#ifdef __cplusplus
extern "C" {
#endif
diff --git a/scripts/build-info.sh b/scripts/build-info.sh
index 507d7e153..ed0d6c56a 100755
--- a/scripts/build-info.sh
+++ b/scripts/build-info.sh
@@ -16,7 +16,8 @@ fi
echo "#ifndef BUILD_INFO_H"
echo "#define BUILD_INFO_H"
echo ""
-echo "#define BUILD_NUMBER $BUILD_NUMBER"
-echo "#define BUILD_COMMIT \"$BUILD_COMMIT\""
+echo "#define BUILD_NUMBER $BUILD_NUMBER" | tr -d '\n'
+echo ""
+echo "#define BUILD_COMMIT \"$BUILD_COMMIT\"" | tr -d '\n'
echo ""
echo "#endif // BUILD_INFO_H"