Merge branch 'master' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	Makefile
#	examples/quantize/quantize.cpp
#	ggml.c
#	pocs/vdot/vdot.cpp
#	scripts/build-info.cmake
#	scripts/build-info.h.in
#	scripts/build-info.sh
#	tests/test-opt.cpp
#	tests/test-quantize-fns.cpp
#	tests/test-quantize-perf.cpp
#	tests/test-sampling.cpp
Concedo 2023-09-16 12:14:19 +08:00
commit c96fb3984d
39 changed files with 1098 additions and 340 deletions

View file

@ -212,6 +212,7 @@ if (LLAMA_ALL_WARNINGS)
-Wextra
-Wpedantic
-Wcast-qual
-Wmissing-declarations
-Wno-unused-function
-Wno-multichar
)
@ -230,7 +231,7 @@ if (LLAMA_ALL_WARNINGS)
endif()
if (MSVC)
if (WIN32)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
if (BUILD_SHARED_LIBS)

View file

@ -78,7 +78,7 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
void process_escapes(std::string& input) {
static void process_escapes(std::string& input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;

View file

@ -3,6 +3,7 @@
#pragma once
#include "llama.h"
#include "build-info.h"
#define LOG_NO_FILE_LINE_FUNCTION
#include "log.h"
@ -20,8 +21,13 @@
#define DIRECTORY_SEPARATOR '/'
#endif // _WIN32
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", ##__VA_ARGS__); exit(1); } while (0)
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
#define print_build_info() do { \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
} while(0)
//
// CLI argument parsing

View file

@ -158,7 +158,7 @@ namespace console {
}
}
char32_t getchar32() {
static char32_t getchar32() {
#if defined(_WIN32)
HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
wchar_t high_surrogate = 0;
@ -212,7 +212,7 @@ namespace console {
#endif
}
void pop_cursor() {
static void pop_cursor() {
#if defined(_WIN32)
if (hConsole != NULL) {
CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
@ -233,7 +233,7 @@ namespace console {
putc('\b', out);
}
int estimateWidth(char32_t codepoint) {
static int estimateWidth(char32_t codepoint) {
#if defined(_WIN32)
(void)codepoint;
return 1;
@ -242,7 +242,7 @@ namespace console {
#endif
}
int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
static int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
#if defined(_WIN32)
CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
@ -303,7 +303,7 @@ namespace console {
#endif
}
void replace_last(char ch) {
static void replace_last(char ch) {
#if defined(_WIN32)
pop_cursor();
put_codepoint(&ch, 1, 1);
@ -312,7 +312,7 @@ namespace console {
#endif
}
void append_utf8(char32_t ch, std::string & out) {
static void append_utf8(char32_t ch, std::string & out) {
if (ch <= 0x7F) {
out.push_back(static_cast<unsigned char>(ch));
} else if (ch <= 0x7FF) {
@ -333,7 +333,7 @@ namespace console {
}
// Helper function to remove the last UTF-8 character from a string
void pop_back_utf8_char(std::string & line) {
static void pop_back_utf8_char(std::string & line) {
if (line.empty()) {
return;
}
@ -349,7 +349,7 @@ namespace console {
line.erase(pos);
}
bool readline_advanced(std::string & line, bool multiline_input) {
static bool readline_advanced(std::string & line, bool multiline_input) {
if (out != stdout) {
fflush(stdout);
}
@ -452,7 +452,7 @@ namespace console {
return has_more;
}
bool readline_simple(std::string & line, bool multiline_input) {
static bool readline_simple(std::string & line, bool multiline_input) {
#if defined(_WIN32)
std::wstring wline;
if (!std::getline(std::wcin, wline)) {

View file

@ -9,7 +9,7 @@
namespace grammar_parser {
// NOTE: assumes valid utf8 (but checks for overrun)
// copied from llama.cpp
std::pair<uint32_t, const char *> decode_utf8(const char * src) {
static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t first_byte = static_cast<uint8_t>(*src);
uint8_t highbits = first_byte >> 4;
@ -24,19 +24,19 @@ namespace grammar_parser {
return std::make_pair(value, pos);
}
uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
return result.first->second;
}
uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
return next_id;
}
void add_rule(
static void add_rule(
parse_state & state,
uint32_t rule_id,
const std::vector<llama_grammar_element> & rule) {
@ -46,11 +46,11 @@ namespace grammar_parser {
state.rules[rule_id] = rule;
}
bool is_word_char(char c) {
static bool is_word_char(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
}
std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
const char * pos = src;
const char * end = src + size;
uint32_t value = 0;
@ -73,7 +73,7 @@ namespace grammar_parser {
return std::make_pair(value, pos);
}
const char * parse_space(const char * src, bool newline_ok) {
static const char * parse_space(const char * src, bool newline_ok) {
const char * pos = src;
while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
(newline_ok && (*pos == '\r' || *pos == '\n'))) {
@ -88,7 +88,7 @@ namespace grammar_parser {
return pos;
}
const char * parse_name(const char * src) {
static const char * parse_name(const char * src) {
const char * pos = src;
while (is_word_char(*pos)) {
pos++;
@ -99,7 +99,7 @@ namespace grammar_parser {
return pos;
}
std::pair<uint32_t, const char *> parse_char(const char * src) {
static std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') {
switch (src[1]) {
case 'x': return parse_hex(src + 2, 2);
@ -129,7 +129,7 @@ namespace grammar_parser {
uint32_t rule_id,
bool is_nested);
const char * parse_sequence(
static const char * parse_sequence(
parse_state & state,
const char * src,
const std::string & rule_name,
@ -247,7 +247,7 @@ namespace grammar_parser {
return pos;
}
const char * parse_rule(parse_state & state, const char * src) {
static const char * parse_rule(parse_state & state, const char * src) {
const char * name_end = parse_name(src);
const char * pos = parse_space(name_end, false);
size_t name_len = name_end - src;
@ -285,7 +285,7 @@ namespace grammar_parser {
}
}
void print_grammar_char(FILE * file, uint32_t c) {
static void print_grammar_char(FILE * file, uint32_t c) {
if (0x20 <= c && c <= 0x7f) {
fprintf(file, "%c", static_cast<char>(c));
} else {
@ -294,7 +294,7 @@ namespace grammar_parser {
}
}
bool is_char_element(llama_grammar_element elem) {
static bool is_char_element(llama_grammar_element elem) {
switch (elem.type) {
case LLAMA_GRETYPE_CHAR: return true;
case LLAMA_GRETYPE_CHAR_NOT: return true;
@ -304,7 +304,7 @@ namespace grammar_parser {
}
}
void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
for (auto elem : rule) {
switch (elem.type) {
case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
@ -334,7 +334,7 @@ namespace grammar_parser {
fprintf(file, "\n");
}
void print_rule(
static void print_rule(
FILE * file,
uint32_t rule_id,
const std::vector<llama_grammar_element> & rule,

View file

@ -57,10 +57,22 @@ def count_model_parts(dir_model: str) -> int:
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file, or model file itself (*.bin)",
)
parser.add_argument(
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
return parser.parse_args()
args = parse_args()

View file

@ -55,10 +55,22 @@ def count_model_parts(dir_model: Path) -> int:
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file, or model file itself (*.bin)",
)
parser.add_argument(
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
return parser.parse_args()
args = parse_args()

View file

@ -56,10 +56,22 @@ def count_model_parts(dir_model: Path) -> int:
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file, or model file itself (*.bin)",
)
parser.add_argument(
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
return parser.parse_args()
args = parse_args()

convert-starcoder-hf-to-gguf.py (new executable file, 248 lines)
View file

@ -0,0 +1,248 @@
#!/usr/bin/env python3
# HF starcoder --> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
def bytes_to_unicode():
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
return dict(zip(bs, (chr(n) for n in cs)))
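
For reference, a minimal sketch (not part of the converter itself) of how the byte-to-unicode table defined above and its inverse are used: tokenizer.json stores token strings in the mapped character space, and byte_decoder recovers the raw bytes, which is exactly what the vocab loop further below does. The example token is illustrative.

    # Illustrative round trip through the GPT-2 style byte<->unicode mapping above.
    byte_encoder = bytes_to_unicode()                       # byte value -> printable unicode char
    byte_decoder = {v: k for k, v in byte_encoder.items()}  # printable unicode char -> byte value

    raw = b" hello"                                         # raw token bytes, leading space included
    mapped = "".join(byte_encoder[b] for b in raw)          # "Ġhello" -- the form stored in tokenizer.json
    assert bytes(byte_decoder[c] for c in mapped) == raw    # lossless round trip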
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.STARCODER
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["n_layer"]
gguf_writer.add_name("StarCoder")
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []
tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
sys.exit(1)
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
print("gguf: get gpt2 tokenizer vocab")
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
for i in range(vocab_size):
if i in reverse_vocab:
try:
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
except KeyError:
text = bytearray()
for c in reverse_vocab[i]:
if ord(c) < 256: # single byte character
text.append(byte_decoder[ord(c)])
else: # multibyte special token character
text.extend(c.encode('utf-8'))
else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(pad_token)
tokens.append(text)
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL) # dummy
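
To make the padding behaviour described in the comments above concrete, a small self-contained sketch with invented numbers (not taken from any real model): when config.json reports a larger vocab_size than tokenizer.json actually defines, the missing indices are filled with placeholder tokens so the token list matches the embedding size.

    # Illustrative only: padding missing vocab slots (counts are invented).
    vocab_size     = 49280                                   # reported by config.json
    defined_tokens = {i: f"token{i}" for i in range(49152)}  # entries tokenizer.json actually defines
    padded = [defined_tokens.get(i, f"[PAD{i}]") for i in range(vocab_size)]
    assert len(padded) == vocab_size                         # token list now matches the model's vocab size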
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
head_dim = hparams["n_embd"] // n_head
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(dir_model / part_name, map_location="cpu")
for name in model_part.keys():
data = model_part[name]
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

View file

@ -9,12 +9,12 @@
#endif
#ifdef LLAMA_DEFAULT_RMS_EPS
static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
#else
static const float rms_norm_eps = 5e-6f;
constexpr float rms_norm_eps = 5e-6f;
#endif
float frand() {
static float frand() {
return (float)rand()/(float)RAND_MAX;
}
@ -25,19 +25,21 @@ struct random_normal_distribution {
float max;
};
void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
static void init_random_normal_distribution(
struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
) {
rnd->gen = std::mt19937(seed);
rnd->nd = std::normal_distribution<float>{mean, std};
rnd->min = min;
rnd->max = max;
}
float frand_normal(struct random_normal_distribution * rnd) {
static float frand_normal(struct random_normal_distribution * rnd) {
const float r = rnd->nd(rnd->gen);
return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
}
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
if (plan.work_size > 0) {
@ -48,13 +50,9 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
ggml_graph_compute(graph, &plan);
}
struct ggml_tensor * randomize_tensor(
struct ggml_tensor * tensor,
int ndims,
const int64_t ne[],
float fmin,
float fmax) {
static struct ggml_tensor * randomize_tensor(
struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
) {
switch (ndims) {
case 1:
for (int i0 = 0; i0 < ne[0]; i0++) {
@ -95,11 +93,9 @@ struct ggml_tensor * randomize_tensor(
return tensor;
}
struct ggml_tensor * randomize_tensor_normal(
struct ggml_tensor * tensor,
int ndims,
const int64_t ne[],
struct random_normal_distribution * rnd) {
static struct ggml_tensor * randomize_tensor_normal(
struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
) {
float scale = 1.0; // xavier
switch (ndims) {
case 1:
@ -159,7 +155,7 @@ struct llama_hparams {
}
};
uint32_t get_n_ff(const struct llama_hparams* hparams) {
static uint32_t get_n_ff(const struct llama_hparams* hparams) {
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
return n_ff;
}
@ -260,7 +256,7 @@ struct llama_model_lora {
std::vector<llama_layer_lora> layers;
};
void init_model(struct llama_model * model) {
static void init_model(struct llama_model * model) {
const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd;
@ -297,7 +293,7 @@ void init_model(struct llama_model * model) {
}
void init_model_lora(struct llama_model_lora * model) {
static void init_model_lora(struct llama_model_lora * model) {
const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd;
@ -340,7 +336,7 @@ void init_model_lora(struct llama_model_lora * model) {
}
}
void set_param_model(struct llama_model * model) {
static void set_param_model(struct llama_model * model) {
const auto& hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer;
@ -366,7 +362,7 @@ void set_param_model(struct llama_model * model) {
}
}
void set_param_model_lora(struct llama_model_lora * model) {
static void set_param_model_lora(struct llama_model_lora * model) {
const auto& hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer;
@ -397,7 +393,7 @@ void set_param_model_lora(struct llama_model_lora * model) {
}
}
void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
const auto & hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer;
@ -426,7 +422,9 @@ void randomize_model(struct llama_model * model, int seed, float mean, float std
}
void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) {
static void randomize_model_lora(
struct llama_model_lora * model, int seed, float mean, float std, float min, float max
) {
const auto & hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer;
@ -459,7 +457,7 @@ void randomize_model_lora(struct llama_model_lora * model, int seed, float mean,
}
}
bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
const auto & hparams = model->hparams;
const uint32_t n_ctx = hparams.n_ctx;
@ -495,7 +493,7 @@ bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int
return true;
}
bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
const auto & hparams = model->hparams;
const uint32_t n_ctx = hparams.n_ctx;
@ -531,15 +529,15 @@ bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora *
return true;
}
struct ggml_tensor * forward(
struct llama_model * model,
struct llama_kv_cache * cache,
struct ggml_context * ctx0,
struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input,
const int n_tokens,
const int n_past) {
static struct ggml_tensor * forward(
struct llama_model * model,
struct llama_kv_cache * cache,
struct ggml_context * ctx0,
struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input,
const int n_tokens,
const int n_past
) {
const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache;
@ -756,25 +754,25 @@ struct ggml_tensor * forward(
return inpL;
}
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
GGML_ASSERT(tensor->n_dims == 1);
GGML_ASSERT(tensor->ne[0] == ne0);
}
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
GGML_ASSERT(tensor->n_dims == 2);
GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1);
}
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
GGML_ASSERT(tensor->n_dims == 3);
GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1);
GGML_ASSERT(tensor->ne[2] == ne2);
}
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
GGML_ASSERT(tensor->n_dims == 4);
GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1);
@ -782,16 +780,16 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
GGML_ASSERT(tensor->ne[3] == ne3);
}
struct ggml_tensor * forward_batch(
struct llama_model * model,
struct llama_kv_cache * cache,
struct ggml_context * ctx0,
struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input,
const int n_tokens,
const int n_past,
const int n_batch) {
static struct ggml_tensor * forward_batch(
struct llama_model * model,
struct llama_kv_cache * cache,
struct ggml_context * ctx0,
struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input,
const int n_tokens,
const int n_past,
const int n_batch
) {
const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache;
@ -1073,16 +1071,15 @@ struct ggml_tensor * forward_batch(
return inpL;
}
struct ggml_tensor * forward_lora(
struct llama_model_lora * model,
struct llama_kv_cache * cache,
struct ggml_context * ctx0,
struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input,
const int n_tokens,
const int n_past) {
static struct ggml_tensor * forward_lora(
struct llama_model_lora * model,
struct llama_kv_cache * cache,
struct ggml_context * ctx0,
struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input,
const int n_tokens,
const int n_past
) {
const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache;
@ -1328,7 +1325,7 @@ struct ggml_tensor * forward_lora(
return inpL;
}
void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
assert(logits->n_dims == 2);
assert(probs->n_dims == 2);
assert(best_samples->n_dims == 1);
@ -1359,7 +1356,10 @@ void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, str
}
}
void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
static void sample_softmax_batch(
struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
struct ggml_tensor * best_samples
) {
GGML_ASSERT(best_samples->n_dims == 2);
GGML_ASSERT(logits->n_dims == 3);
GGML_ASSERT(probs->n_dims == 3);
@ -1393,7 +1393,7 @@ void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits
}
}
void print_row(struct ggml_tensor * probs, int i) {
static void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
printf(" %.2f", p);
@ -1401,7 +1401,7 @@ void print_row(struct ggml_tensor * probs, int i) {
printf("\n");
}
void print_matrix(struct ggml_tensor * probs) {
static void print_matrix(struct ggml_tensor * probs) {
assert(probs->n_dims == 2);
for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) {
@ -1412,7 +1412,7 @@ void print_matrix(struct ggml_tensor * probs) {
}
}
void print_token(int token, int n_vocab) {
static void print_token(int token, int n_vocab) {
for (int k = 0; k < token; ++k) {
printf(" ");
}
@ -1423,14 +1423,14 @@ void print_token(int token, int n_vocab) {
printf("\n");
}
void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
for (int i=0; i<tokens->ne[0]; ++i) {
int token = ggml_get_i32_1d(tokens, i);
print_token(token, n_vocab);
}
}
void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
int n_tokens = tokens_input->ne[0];
int n_vocab = targets->ne[0];
float randomness = 0.0f;
@ -1451,7 +1451,9 @@ void get_example_targets(int example_id, struct ggml_tensor * tokens_input, stru
}
}
void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
static void get_example_targets_batch(
struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
) {
GGML_ASSERT(tokens_input->n_dims == 2);
GGML_ASSERT( targets->n_dims == 3);
int n_tokens = tokens_input->ne[0];
@ -1474,7 +1476,7 @@ void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct
}
}
void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
int n_tokens = tokens_input->ne[0];
int n_vocab = targets->ne[0];
for (int i=0; i<n_tokens-n_shift; ++i) {
@ -1485,12 +1487,16 @@ void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * tar
}
}
struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
static struct ggml_tensor * square_error_loss(
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
) {
// todo: instead of a-b: a[1:]-b[:-1]
return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
}
struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
static struct ggml_tensor * cross_entropy_loss(
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
) {
const float eps = 1e-3f;
return
ggml_sum(ctx,

View file

@ -3,6 +3,3 @@ add_executable(${TARGET} beam-search.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View file

@ -1,6 +1,5 @@
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <cassert>
#include <cinttypes>
@ -30,7 +29,8 @@ struct ostream_beam_view {
llama_context * ctx;
llama_beam_view beam_view;
};
std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
@ -46,7 +46,7 @@ struct beam_search_callback_data {
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) {
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
}
@ -56,7 +56,7 @@ bool is_at_eob(const beam_search_callback_data & callback_data, const llama_toke
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {

View file

@ -1,7 +1,8 @@
set(TARGET benchmark)
add_executable(${TARGET} benchmark-matmult.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)

View file

@ -1,5 +1,5 @@
#include "common.h"
#include "ggml.h"
#include "build-info.h"
#include <locale.h>
#include <assert.h>
@ -99,7 +99,7 @@ int main(int argc, char ** argv) {
exit(1);
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
print_build_info();
printf("Starting Test\n");
// create the ggml context

View file

@ -115,7 +115,7 @@ struct TransformerWeights {
}
};
void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
// we calloc instead of malloc to keep valgrind happy
w->token_embedding_table = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
@ -158,7 +158,7 @@ void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
}
}
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
@ -189,7 +189,7 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar
return 0;
}
void print_sample_weights(TransformerWeights *w){
static void print_sample_weights(TransformerWeights *w){
printf("----- Quick print of first of the weight vales of all the variables\n");
printf("%f\n", w->token_embedding_table[0]);
printf("%f\n", w->rms_att_weight[0]);
@ -324,7 +324,7 @@ struct train_params {
int mem_compute1_gb;
};
void print_params(struct my_llama_hparams * params) {
static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
printf("%s: n_embd: %d\n", __func__, params->n_embd);
@ -335,7 +335,7 @@ void print_params(struct my_llama_hparams * params) {
printf("%s: n_rot: %d\n", __func__, params->n_rot);
}
void init_model(struct my_llama_model * model) {
static void init_model(struct my_llama_model * model) {
const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd;
@ -408,17 +408,17 @@ void init_model(struct my_llama_model * model) {
}
}
float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr;
}
int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr;
}
void print_row(struct ggml_tensor * probs, int i) {
static void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
printf(" %f", p);
@ -426,7 +426,7 @@ void print_row(struct ggml_tensor * probs, int i) {
printf("\n");
}
void print_matrix(struct ggml_tensor * probs) {
static void print_matrix(struct ggml_tensor * probs) {
assert(probs->n_dims == 2);
for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) {
@ -531,7 +531,7 @@ struct llama_file {
}
};
bool is_ggml_file(const char *filename) {
static bool is_ggml_file(const char * filename) {
llama_file file(filename, "rb");
if (file.size < 4) {
return false;
@ -540,7 +540,7 @@ bool is_ggml_file(const char *filename) {
return magic == GGUF_MAGIC;
}
static std::string llama_escape_whitespaces(const std::string& text) {
static std::string llama_escape_whitespaces(const std::string & text) {
std::ostringstream out;
for (char c : text) {
if (c == ' ') out << "\xe2\x96\x81";
@ -549,7 +549,7 @@ static std::string llama_escape_whitespaces(const std::string& text) {
return out.str();
}
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
if (is_ggml_file(filename)) {
struct ggml_context * ctx_data = NULL;
@ -637,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
}
}
void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int ct;
switch (gg_weights->n_dims){
case 1:
@ -673,7 +673,9 @@ void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * kar
}
}
void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
static void save_as_llama_model(
struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
) {
// convert AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor
@ -785,7 +787,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
gguf_free(ctx);
}
struct train_params get_default_train_params() {
static struct train_params get_default_train_params() {
struct train_params params;
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
params.fn_llama2c_output_model = "ak_llama_model.bin";
@ -835,7 +837,7 @@ struct train_params get_default_train_params() {
return params;
}
void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
@ -846,7 +848,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
fprintf(stderr, "\n");
}
bool params_parse(int argc, char ** argv, struct train_params * params) {
static bool params_parse(int argc, char ** argv, struct train_params * params) {
bool invalid_param = false;
bool reqd_param_found = false;
std::string arg;
@ -901,7 +903,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
return true;
}
std::string basename(const std::string &path) {
static std::string basename(const std::string &path) {
size_t pos = path.find_last_of("/\\");
if (pos == std::string::npos) {
return path;

View file

@ -1,3 +1,4 @@
#include "common.h"
#include "embd-input.h"
#include <cassert>
@ -22,7 +23,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
return nullptr;
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = uint32_t(time(NULL));

View file

@ -3,7 +3,6 @@
#include "common.h"
#include "llama.h"
#include "build-info.h"
extern "C" {

View file

@ -1,6 +1,5 @@
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <ctime>
@ -17,7 +16,7 @@ int main(int argc, char ** argv) {
params.embedding = true;
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);

View file

@ -13,14 +13,14 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
template<typename T>
template <typename T>
static std::string to_string(const T & val) {
std::stringstream ss;
ss << val;
return ss.str();
}
bool gguf_ex_write(const std::string & fname) {
static bool gguf_ex_write(const std::string & fname) {
struct gguf_context * ctx = gguf_init_empty();
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
@ -85,7 +85,7 @@ bool gguf_ex_write(const std::string & fname) {
}
// just read tensor info
bool gguf_ex_read_0(const std::string & fname) {
static bool gguf_ex_read_0(const std::string & fname) {
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ NULL,
@ -143,7 +143,7 @@ bool gguf_ex_read_0(const std::string & fname) {
}
// read and create ggml_context containing the tensors and their data
bool gguf_ex_read_1(const std::string & fname) {
static bool gguf_ex_read_1(const std::string & fname) {
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {

View file

@ -74,14 +74,6 @@ static T stdev(const std::vector<T> & v) {
return stdev;
}
static bool ggml_cpu_has_metal() {
#if defined(GGML_USE_METAL)
return true;
#else
return false;
#endif
}
static std::string get_cpu_info() {
std::string id;
#ifdef __linux__

View file

@ -41,7 +41,8 @@ static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;
void write_logfile(
static void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model,
const std::vector<llama_token> & input_tokens, const std::string & output,
const std::vector<llama_token> & output_tokens
@ -86,7 +87,7 @@ void write_logfile(
}
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
void sigint_handler(int signo) {
static void sigint_handler(int signo) {
if (signo == SIGINT) {
if (!is_interacting) {
is_interacting = true;
@ -148,6 +149,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);

View file

@ -1,6 +1,5 @@
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <cmath>
#include <cstdio>
@ -28,9 +27,10 @@ struct results_log_softmax {
float prob;
};
void write_logfile(const llama_context * ctx, const gpt_params & params,
const llama_model * model, const struct results_perplexity & results) {
static void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model,
const struct results_perplexity & results
) {
if (params.logdir.empty()) {
return;
}
@ -76,7 +76,7 @@ void write_logfile(const llama_context * ctx, const gpt_params & params,
fclose(logfile);
}
std::vector<float> softmax(const std::vector<float>& logits) {
static std::vector<float> softmax(const std::vector<float>& logits) {
std::vector<float> probs(logits.size());
float max_logit = logits[0];
for (float v : logits) max_logit = std::max(max_logit, v);
@ -92,7 +92,7 @@ std::vector<float> softmax(const std::vector<float>& logits) {
return probs;
}
results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
float max_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
double sum_exp = 0.0;
@ -100,9 +100,10 @@ results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
}
void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
double & nll, double & nll2, float * logit_history, float * prob_history) {
static void process_logits(
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
double & nll, double & nll2, float * logit_history, float * prob_history
) {
std::mutex mutex;
int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
@ -130,7 +131,7 @@ void process_logits(int n_vocab, const float * logits, const int * tokens, int n
}
results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
@ -260,8 +261,7 @@ results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params)
return {tokens, std::exp(nll / count), logit_history, prob_history};
}
results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
static results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
if (params.ppl_stride > 0) {
return perplexity_v2(ctx, params);
}
@ -400,8 +400,9 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
return {tokens, ppl, logit_history, prob_history};
}
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
int n_vocab, int n_thread) {
static std::vector<float> hellaswag_evaluate_tokens(
llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch, int n_vocab, int n_thread
) {
std::vector<float> result;
result.reserve(tokens.size() * n_vocab);
size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
@ -421,7 +422,7 @@ std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vec
return result;
}
void hellaswag_score(llama_context * ctx, const gpt_params & params) {
static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// Calculates hellaswag score (acc_norm) from prompt
//
// Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
@ -668,7 +669,7 @@ int main(int argc, char ** argv) {
params.n_ctx += params.ppl_stride/2;
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);

View file

@ -2,4 +2,5 @@ set(TARGET quantize-stats)
add_executable(${TARGET} quantize-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,7 +1,6 @@
#include "ggml.h"
#include "build-info.h"
#define LLAMA_API_INTERNAL
#include "common.h"
#include "ggml.h"
#include "llama.h"
#include <algorithm>
@ -34,8 +33,8 @@ struct quantize_stats_params {
std::vector<enum ggml_type> include_types;
};
const size_t HISTOGRAM_BUCKETS = 150;
const double HISTOGRAM_RANGE = 0.03;
constexpr size_t HISTOGRAM_BUCKETS = 150;
constexpr double HISTOGRAM_RANGE = 0.03;
struct error_stats {
size_t num_samples;
@ -44,8 +43,7 @@ struct error_stats {
uint64_t error_histogram[HISTOGRAM_BUCKETS];
};
void quantize_stats_print_usage(int /*argc*/, char ** argv) {
static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
quantize_stats_params params;
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
@ -71,7 +69,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
}
// Check if a layer is included/excluded by command line
bool layer_included(const quantize_stats_params & params, const std::string & layer) {
static bool layer_included(const quantize_stats_params & params, const std::string & layer) {
for (const auto& excluded : params.exclude_layers) {
if (std::regex_search(layer, std::regex(excluded))) {
return false;
@ -86,7 +84,7 @@ bool layer_included(const quantize_stats_params & params, const std::string & la
}
// Update error statistics given vectors with the before/after result of quantization
void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
for (int64_t i = 0; i < nelements; i++) {
double diff = input[i] - output[i];
stats.total_error += diff * diff;
@ -96,14 +94,14 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou
stats.num_samples += nelements;
}
void combine_error_stats(error_stats & into, const error_stats & from) {
static void combine_error_stats(error_stats & into, const error_stats & from) {
into.num_samples += from.num_samples;
into.total_error += from.total_error;
if (from.max_error > into.max_error) into.max_error = from.max_error;
for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
}
double find_quantile(const error_stats & stats, double quantile) {
static double find_quantile(const error_stats & stats, double quantile) {
double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
double accum = 0;
@ -116,7 +114,7 @@ double find_quantile(const error_stats & stats, double quantile) {
return INFINITY;
}
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
double median = find_quantile(stats, .5);
double pct95 = find_quantile(stats, .95);
@ -143,17 +141,10 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
void test_roundtrip_on_chunk(
const ggml_tensor * layer,
int64_t offset,
int64_t chunk_size,
const ggml_type_traits_t & qfns,
bool use_reference,
float * input_scratch,
char * quantized_scratch,
float * output_scratch,
error_stats & stats) {
static void test_roundtrip_on_chunk(
const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
) {
if (layer->type == GGML_TYPE_F16) {
for (int i = 0; i < chunk_size; i++) {
input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
@ -174,18 +165,11 @@ void test_roundtrip_on_chunk(
// Run quantization function for a single layer and update error stats
void test_roundtrip_on_layer(
std::string & name,
bool print_layer_stats,
const ggml_type_traits_t & qfns,
bool use_reference,
const ggml_tensor * layer,
std::vector<float> & input_scratch,
std::vector<char> & quantized_scratch,
std::vector<float> & output_scratch,
error_stats & total_error,
int max_thread = 0) {
static void test_roundtrip_on_layer(
std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
) {
assert(tensor_is_contiguous(layer));
error_stats layer_error {};
uint64_t nelements = ggml_nelements(layer);
@ -314,7 +298,7 @@ int main(int argc, char ** argv) {
return 1;
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
print_build_info();
// load the model
fprintf(stderr, "Loading model\n");

View file

@ -2,6 +2,7 @@ set(TARGET quantize)
add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)

View file

@ -1,3 +1,4 @@
#include "common.h"
#include "llama.h"
#include <cstdio>
@ -38,7 +39,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
};
bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
std::string ftype_str;
for (auto ch : ftype_str_in) {
@ -70,7 +71,7 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
// usage:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
void usage(const char * executable) {
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
@ -159,6 +160,8 @@ int main(int argc, char ** argv) {
}
}
print_build_info();
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
if (params.nthread > 0) {
fprintf(stderr, " using %d threads", params.nthread);

View file

@ -1,6 +1,5 @@
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <vector>
#include <cstdio>
@ -17,7 +16,7 @@ int main(int argc, char ** argv) {
return 1;
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
print_build_info();
if (params.n_predict < 0) {
params.n_predict = 16;

View file

@ -1083,8 +1083,9 @@ static json format_final_response(llama_server_context &llama, const std::string
return res;
}
static json format_partial_response(llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs)
{
static json format_partial_response(
llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs
) {
json res = json{
{"content", content},
{"stop", false},
@ -1215,7 +1216,7 @@ static void log_server_request(const Request &req, const Response &res)
});
}
bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
}
@ -1225,7 +1226,7 @@ bool is_at_eob(llama_server_context & server_context, const llama_token * tokens
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
static void beam_search_callback(void *callback_data, llama_beams_state beams_state) {
auto & llama = *static_cast<llama_server_context*>(callback_data);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
@ -1258,7 +1259,8 @@ struct token_translator {
std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); }
};
void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama)
{
auto & gtps = llama.generated_token_probs;
auto translator = token_translator{llama.ctx};
auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
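Aside (not part of the diff): the helper above sums the lengths of the translated tokens, presumably so the output string can reserve capacity once before appending. A minimal stand-alone sketch of that reserve-then-append pattern, using plain std::string pieces rather than the server's own types:
#include <numeric>
#include <string>
#include <vector>
// Sketch only: join pieces into one string, reserving the exact size up front.
static std::string join_reserved(const std::vector<std::string> & pieces) {
    const size_t total = std::accumulate(pieces.begin(), pieces.end(), size_t(0),
        [](size_t sum, const std::string & s) { return sum + s.size(); });
    std::string out;
    out.reserve(total);           // single allocation instead of repeated growth
    for (const auto & s : pieces) {
        out += s;
    }
    return out;
}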

View file

@ -3,6 +3,3 @@ add_executable(${TARGET} simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View file

@ -1,5 +1,3 @@
#include "build-info.h"
#include "common.h"
#include "llama.h"

View file

@ -131,6 +131,10 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
}
static bool ggml_is_view(struct ggml_tensor * t) {
return t->view_src != NULL;
}
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
#ifdef GGML_ALLOCATOR_DEBUG
GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
@ -338,8 +342,8 @@ static void free_vmem(void * base_addr, size_t size) {
// allocate uncommitted virtual memory to measure the size of the graph
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
// 1TB for 64-bit, 1GB for 32-bit
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
// 128GB for 64-bit, 1GB for 32-bit
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
do {
*base_addr = alloc_vmem(*size);
if (*base_addr != NULL) {
@ -399,10 +403,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
//////////// compute graph allocator
static bool ggml_is_view(struct ggml_tensor * t) {
return t->view_src != NULL;
}
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
if (a->type != b->type) {
return false;
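Aside (not part of the diff): the measurement allocator above only needs an address range large enough for any realistic graph, since the memory is never committed. A quick stand-alone check that the new shift constant really is 128 GiB (and 1 GiB on 32-bit), treating GiB as 2^30 bytes:
#include <cstdint>
#include <cstdio>
int main() {
    const uint64_t size32 = 1ULL << 30; // 32-bit pointers
    const uint64_t size64 = 1ULL << 37; // 64-bit pointers
    std::printf("32-bit: %llu bytes = %llu GiB\n",
                (unsigned long long) size32, (unsigned long long)(size32 >> 30)); // 1
    std::printf("64-bit: %llu bytes = %llu GiB\n",
                (unsigned long long) size64, (unsigned long long)(size64 >> 30)); // 128
    return 0;
}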

View file

@ -78,6 +78,7 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
GGML_METAL_DECL_KERNEL(rms_norm);
GGML_METAL_DECL_KERNEL(norm);
GGML_METAL_DECL_KERNEL(mul_mat_f32_f32);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
@ -89,6 +90,7 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
@ -237,6 +239,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
GGML_METAL_ADD_KERNEL(rms_norm);
GGML_METAL_ADD_KERNEL(norm);
GGML_METAL_ADD_KERNEL(mul_mat_f32_f32);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
@ -248,6 +251,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
@ -309,6 +313,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
GGML_METAL_DEL_KERNEL(rms_norm);
GGML_METAL_DEL_KERNEL(norm);
GGML_METAL_DEL_KERNEL(mul_mat_f32_f32);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
@ -320,6 +325,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
@ -885,6 +891,7 @@ void ggml_metal_graph_compute(
ne00%32 == 0 &&
ne11 > 1) {
switch (src0->type) {
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break;
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
@ -919,6 +926,11 @@ void ggml_metal_graph_compute(
// use custom matrix x vector kernel
switch (src0t) {
case GGML_TYPE_F32:
{
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32];
nrows = 4;
} break;
case GGML_TYPE_F16:
{
nth0 = 32;

View file

@ -118,7 +118,7 @@ kernel void kernel_soft_max(
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
// parallel max
float lmax = psrc0[tpitg[0]];
float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY;
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
lmax = MAX(lmax, psrc0[i00]);
}
@ -158,7 +158,7 @@ kernel void kernel_soft_max_4(
device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
// parallel max
float4 lmax4 = psrc4[tpitg[0]];
float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY;
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
lmax4 = fmax(lmax4, psrc4[i00]);
}
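Aside (not part of the diff): the guard above matters when a row is shorter than the threadgroup, because a thread whose first index is already past ne00 must start its running maximum at -INFINITY instead of reading out of bounds. A small host-side C++ sketch of the same strided-max reduction, with plain loops standing in for the Metal threads (names are illustrative only):
#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>
// Sketch: "thread" t scans indices t, t+ntg, t+2*ntg, ...; out-of-range threads
// contribute -INFINITY, and the partial maxima are reduced at the end.
static float strided_max(const std::vector<float> & row, int ntg) {
    std::vector<float> lmax(ntg, -INFINITY);
    const int ne00 = (int) row.size();
    for (int t = 0; t < ntg; ++t) {
        for (int i = t; i < ne00; i += ntg) {
            lmax[t] = std::max(lmax[t], row[i]);
        }
    }
    return *std::max_element(lmax.begin(), lmax.end());
}
int main() {
    std::vector<float> row = {0.5f, -2.0f, 3.0f};
    assert(strided_max(row, 32) == 3.0f); // more "threads" than elements is now safe
    return 0;
}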
@ -523,6 +523,79 @@ kernel void kernel_mul_mat_q8_0_f32(
}
}
#define N_F32_F32 4
kernel void kernel_mul_mat_f32_f32(
device const char * src0,
device const char * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
const int64_t r0 = tgpig.x;
const int64_t rb = tgpig.y*N_F32_F32;
const int64_t im = tgpig.z;
device const float * x = (device const float *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
if (ne00 < 128) {
for (int row = 0; row < N_F32_F32; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
float sumf = 0;
for (int i = tiisg; i < ne00; i += 32) {
sumf += (float) x[i] * (float) y[i];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
} else {
device const float4 * x4 = (device const float4 *)x;
for (int row = 0; row < N_F32_F32; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
device const float4 * y4 = (device const float4 *) y;
float sumf = 0;
for (int i = tiisg; i < ne00/4; i += 32) {
for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
}
}
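Aside (not part of the diff): for longer rows the new kernel accumulates four elements at a time and lets lane 0 add the leftover elements after the simd reduction. A scalar C++ sketch of that vectorized-body-plus-tail dot product (no Metal types; an unrolled-by-4 loop stands in for float4):
#include <cstddef>
// Sketch: dot product accumulated in groups of 4 with a scalar tail,
// mirroring the float4 body plus leftover handling in the kernel.
static float dot_with_tail(const float * x, const float * y, size_t n) {
    float sum = 0.0f;
    size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        for (int k = 0; k < 4; ++k) {
            sum += x[i + k] * y[i + k];
        }
    }
    for (; i < n; ++i) { // tail: indices 4*(n/4) .. n-1
        sum += x[i] * y[i];
    }
    return sum;
}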
kernel void kernel_mul_mat_f16_f32_1row(
device const char * src0,
device const char * src1,
@ -1399,13 +1472,13 @@ kernel void kernel_mul_mat_q4_K_f32(
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01[[buffer(4)]],
constant int64_t & ne02[[buffer(5)]],
constant int64_t & ne10[[buffer(9)]],
constant int64_t & ne12[[buffer(11)]],
constant int64_t & ne0[[buffer(15)]],
constant int64_t & ne1[[buffer(16)]],
constant uint & gqa[[buffer(17)]],
constant int64_t & ne01 [[buffer(4)]],
constant int64_t & ne02 [[buffer(5)]],
constant int64_t & ne10 [[buffer(9)]],
constant int64_t & ne12 [[buffer(11)]],
constant int64_t & ne0 [[buffer(15)]],
constant int64_t & ne1 [[buffer(16)]],
constant uint & gqa [[buffer(17)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
@ -2012,7 +2085,6 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
for (int i = 0; i < 16; ++i) {
reg[i/4][i%4] = dl * (q[i] & mask) - ml;
}
}
template <typename type4x4>
@ -2269,6 +2341,7 @@ typedef void (mat_mm_t)(
constant uint & gqa,
threadgroup uchar *, uint3, uint, uint);
template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm<float4x4, 1, dequantize_f32>;
template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>;
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;
template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>;

ggml.c (90 changed lines)
View file

@ -17290,14 +17290,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
} else {
// wait for other threads to finish
const int last = node_n;
do {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_METAL)
//apple does nothing
#else
while (true) {
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
// depending on the workload and the operating system.
// since it is not clear what is the best approach, it should potentially become user-configurable
// ref: https://github.com/ggerganov/ggml/issues/291
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
sched_yield();
#endif
#endif
node_n = atomic_load(&state->shared->node_n);
} while (node_n == last);
if (node_n != last) break;
};
}
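Aside (not part of the diff): the rewritten wait is a spin on an atomic counter, with the yield kept behind build flags because its effect varies by workload and OS. A minimal C++ sketch of that pattern using std::atomic in place of ggml's C11 atomics:
#include <atomic>
#include <thread>
// Sketch: spin until the shared node counter moves past `last`,
// optionally yielding the time slice on each iteration.
static int wait_for_new_node(const std::atomic<int> & node_n, int last, bool do_yield) {
    int n;
    while ((n = node_n.load(std::memory_order_acquire)) == last) {
        if (do_yield) {
            std::this_thread::yield(); // may help or hurt depending on OS/scheduler
        }
    }
    return n;
}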
// check if we should stop
@ -18348,7 +18352,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
for (int i = 0; i < cgraph->n_leafs; i++) {
struct ggml_tensor * node = cgraph->leafs[i];
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
i,
node->ne[0], node->ne[1],
ggml_op_name(node->op),
@ -20111,27 +20115,27 @@ const char * gguf_type_name(enum gguf_type type) {
return GGUF_TYPE_NAME[type];
}
int gguf_get_version(struct gguf_context * ctx) {
int gguf_get_version(const struct gguf_context * ctx) {
return ctx->header.version;
}
size_t gguf_get_alignment(struct gguf_context * ctx) {
size_t gguf_get_alignment(const struct gguf_context * ctx) {
return ctx->alignment;
}
size_t gguf_get_data_offset(struct gguf_context * ctx) {
size_t gguf_get_data_offset(const struct gguf_context * ctx) {
return ctx->offset;
}
void * gguf_get_data(struct gguf_context * ctx) {
void * gguf_get_data(const struct gguf_context * ctx) {
return ctx->data;
}
int gguf_get_n_kv(struct gguf_context * ctx) {
int gguf_get_n_kv(const struct gguf_context * ctx) {
return ctx->header.n_kv;
}
int gguf_find_key(struct gguf_context * ctx, const char * key) {
int gguf_find_key(const struct gguf_context * ctx, const char * key) {
// return -1 if key not found
int keyfound = -1;
@ -20147,85 +20151,85 @@ int gguf_find_key(struct gguf_context * ctx, const char * key) {
return keyfound;
}
const char * gguf_get_key(struct gguf_context * ctx, int i) {
const char * gguf_get_key(const struct gguf_context * ctx, int i) {
return ctx->kv[i].key.data;
}
enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int i) {
return ctx->kv[i].type;
}
enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.arr.type;
}
const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
const void * gguf_get_arr_data(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.arr.data;
}
const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
struct gguf_kv * kv = &ctx->kv[key_id];
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
return str->data;
}
int gguf_get_arr_n(struct gguf_context * ctx, int i) {
int gguf_get_arr_n(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.arr.n;
}
uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.uint8;
}
int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
int8_t gguf_get_val_i8(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.int8;
}
uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.uint16;
}
int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
int16_t gguf_get_val_i16(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.int16;
}
uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.uint32;
}
int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
int32_t gguf_get_val_i32(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.int32;
}
float gguf_get_val_f32(struct gguf_context * ctx, int i) {
float gguf_get_val_f32(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.float32;
}
uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.uint64;
}
int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
int64_t gguf_get_val_i64(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.int64;
}
double gguf_get_val_f64(struct gguf_context * ctx, int i) {
double gguf_get_val_f64(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.float64;
}
bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
bool gguf_get_val_bool(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.bool_;
}
const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
const char * gguf_get_val_str (const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.str.data;
}
int gguf_get_n_tensors(struct gguf_context * ctx) {
int gguf_get_n_tensors(const struct gguf_context * ctx) {
return ctx->header.n_tensors;
}
int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
int gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
// return -1 if tensor not found
int tensorfound = -1;
@ -20241,11 +20245,11 @@ int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
return tensorfound;
}
size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
return ctx->infos[i].offset;
}
char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
return ctx->infos[i].name.data;
}
@ -20528,7 +20532,7 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si
buf->offset += el_size;
}
static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
// write header
gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
@ -20643,7 +20647,7 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
}
}
void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
FILE * file = fopen(fname, "wb");
if (!file) {
GGML_ASSERT(false && "failed to open file for writing");
@ -20660,7 +20664,7 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only
fclose(file);
}
size_t gguf_get_meta_size(struct gguf_context * ctx) {
size_t gguf_get_meta_size(const struct gguf_context * ctx) {
// no allocs - only compute size
struct gguf_buf buf = gguf_buf_init(0);
@ -20669,7 +20673,7 @@ size_t gguf_get_meta_size(struct gguf_context * ctx) {
return buf.offset;
}
void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
struct gguf_buf buf = gguf_buf_init(16*1024);
gguf_write_to_buf(ctx, &buf, true);
@ -20745,6 +20749,14 @@ int ggml_cpu_has_arm_fma(void) {
#endif
}
int ggml_cpu_has_metal(void) {
#if defined(GGML_USE_METAL)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_f16c(void) {
#if defined(__F16C__)
return 1;

ggml.h (74 changed lines)
View file

@ -195,6 +195,14 @@
# define GGML_DEPRECATED(func, hint) func
#endif
#ifndef __GNUC__
# define GGML_ATTRIBUTE_FORMAT(...)
#elif defined(__MINGW32__)
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
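Aside (not part of the diff): the macro lets GCC/Clang type-check printf-style arguments, and the MinGW branch selects the gnu_printf archetype so GNU-style conversions are checked correctly. A hedged example of applying it to one's own variadic helper (my_log is purely illustrative and assumes ggml.h, which defines the macro, is included):
#include "ggml.h"
#include <cstdarg>
#include <cstdio>
GGML_ATTRIBUTE_FORMAT(1, 2)              // format string is arg 1, varargs start at arg 2
static void my_log(const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    vfprintf(stderr, fmt, args);
    va_end(args);
}
// my_log("%d tensors\n", "oops");       // now rejected at compile time with -Wformat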
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
@ -270,7 +278,7 @@ extern "C" {
#if defined(__ARM_NEON) && defined(__CUDACC__)
typedef half ggml_fp16_t;
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
#elif defined(__ARM_NEON)
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t;
@ -685,6 +693,7 @@ extern "C" {
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
GGML_ATTRIBUTE_FORMAT(2, 3)
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
//
@ -1866,39 +1875,39 @@ extern "C" {
GGML_API const char * gguf_type_name(enum gguf_type type);
GGML_API int gguf_get_version (struct gguf_context * ctx);
GGML_API size_t gguf_get_alignment (struct gguf_context * ctx);
GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
GGML_API void * gguf_get_data (struct gguf_context * ctx);
GGML_API int gguf_get_version (const struct gguf_context * ctx);
GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
GGML_API void * gguf_get_data (const struct gguf_context * ctx);
GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
// results are undefined if the wrong type is used for the key
GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i);
GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i);
GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i);
GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i);
GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i);
GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i);
GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i);
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i);
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i);
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i);
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i);
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i);
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i);
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i);
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i);
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i);
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i);
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i);
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i);
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
GGML_API int gguf_get_n_tensors (struct gguf_context * ctx);
GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name);
GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i);
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
// overrides existing values or adds a new one
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
@ -1943,11 +1952,11 @@ extern "C" {
//
// write the entire context to a binary file
GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
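Aside (not part of the diff): with the getters const-qualified, read-only GGUF inspection can hold a const context throughout. A small sketch under the assumption that gguf_init_from_file, gguf_init_params and gguf_free are declared elsewhere in this header as in upstream ggml, and that the file carries the string-typed general.architecture key:
#include "ggml.h"
#include <cstdio>
static void dump_gguf(const char * fname) {
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    if (!ctx) {
        fprintf(stderr, "failed to load %s\n", fname);
        return;
    }
    const struct gguf_context * cctx = ctx; // the const API is enough for reading
    printf("version : %d\n", gguf_get_version(cctx));
    printf("tensors : %d\n", gguf_get_n_tensors(cctx));
    const int key = gguf_find_key(cctx, "general.architecture");
    if (key >= 0) {
        printf("arch    : %s\n", gguf_get_val_str(cctx, key));
    }
    gguf_free(ctx); // freeing still takes the non-const handle
}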
//
// system info
@ -1961,6 +1970,7 @@ extern "C" {
GGML_API int ggml_cpu_has_fma (void);
GGML_API int ggml_cpu_has_neon (void);
GGML_API int ggml_cpu_has_arm_fma (void);
GGML_API int ggml_cpu_has_metal (void);
GGML_API int ggml_cpu_has_f16c (void);
GGML_API int ggml_cpu_has_fp16_va (void);
GGML_API int ggml_cpu_has_wasm_simd (void);
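Aside (not part of the diff): ggml_cpu_has_metal joins the existing compile-time feature probes, each returning 0 or 1. A brief capability printout using only functions declared in this header:
#include "ggml.h"
#include <cstdio>
static void print_some_features(void) {
    printf("AVX = %d | NEON = %d | METAL = %d | F16C = %d\n",
           ggml_cpu_has_avx(), ggml_cpu_has_neon(),
           ggml_cpu_has_metal(), ggml_cpu_has_f16c());
}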

View file

@ -77,13 +77,14 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
class MODEL_ARCH(IntEnum):
LLAMA : int = auto()
FALCON : int = auto()
BAICHUAN:int = auto()
GPT2 : int = auto()
GPTJ : int = auto()
GPTNEOX: int = auto()
MPT : int = auto()
LLAMA : int = auto()
FALCON : int = auto()
BAICHUAN : int = auto()
GPT2 : int = auto()
GPTJ : int = auto()
GPTNEOX : int = auto()
MPT : int = auto()
STARCODER : int = auto()
class MODEL_TENSOR(IntEnum):
@ -107,13 +108,14 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.LLAMA: "llama",
MODEL_ARCH.FALCON: "falcon",
MODEL_ARCH.BAICHUAN:"baichuan",
MODEL_ARCH.GPT2: "gpt2",
MODEL_ARCH.GPTJ: "gptj",
MODEL_ARCH.GPTNEOX: "gptneox",
MODEL_ARCH.MPT: "mpt",
MODEL_ARCH.LLAMA: "llama",
MODEL_ARCH.FALCON: "falcon",
MODEL_ARCH.BAICHUAN: "baichuan",
MODEL_ARCH.GPT2: "gpt2",
MODEL_ARCH.GPTJ: "gptj",
MODEL_ARCH.GPTNEOX: "gptneox",
MODEL_ARCH.MPT: "mpt",
MODEL_ARCH.STARCODER: "starcoder",
}
MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
@ -171,6 +173,18 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.STARCODER: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.POS_EMBD: "position_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.GPT2: {
# TODO
},

llama.cpp (387 changed lines)
View file

@ -1,3 +1,4 @@
#define LLAMA_API_INTERNAL
#include "llama.h"
#include "ggml.h"
@ -109,7 +110,7 @@ static size_t utf8_len(char src) {
return lookup[highbits];
}
void replace_all(std::string & s, const std::string & search, const std::string & replace) {
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
std::string result;
for (size_t pos = 0; ; pos += search.length()) {
auto new_pos = s.find(search, pos);
@ -161,17 +162,19 @@ enum llm_arch {
LLM_ARCH_GPTJ,
LLM_ARCH_GPTNEOX,
LLM_ARCH_MPT,
LLM_ARCH_STARCODER,
LLM_ARCH_UNKNOWN,
};
static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" },
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GPT2, "gpt2" },
{ LLM_ARCH_GPTJ, "gptj" },
{ LLM_ARCH_GPTNEOX, "gptneox" },
{ LLM_ARCH_MPT, "mpt" },
{ LLM_ARCH_BAICHUAN,"baichuan" },
{ LLM_ARCH_LLAMA, "llama" },
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GPT2, "gpt2" },
{ LLM_ARCH_GPTJ, "gptj" },
{ LLM_ARCH_GPTNEOX, "gptneox" },
{ LLM_ARCH_MPT, "mpt" },
{ LLM_ARCH_BAICHUAN, "baichuan" },
{ LLM_ARCH_STARCODER, "starcoder" },
};
enum llm_kv {
@ -377,6 +380,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
},
},
{
LLM_ARCH_STARCODER,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_POS_EMBD, "position_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
},
},
{
LLM_ARCH_UNKNOWN,
{
@ -900,9 +918,11 @@ static llama_state g_state;
// available llama models
enum e_model {
MODEL_UNKNOWN,
MODEL_1B,
MODEL_3B,
MODEL_7B,
MODEL_13B,
MODEL_15B,
MODEL_30B,
MODEL_34B,
MODEL_40B,
@ -971,13 +991,22 @@ struct llama_layer {
struct ggml_tensor * wo;
struct ggml_tensor * wqkv;
// attention bias
struct ggml_tensor * bo;
struct ggml_tensor * bqkv;
// normalization
struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b;
// ff
struct ggml_tensor * w1; // ffn_gate
struct ggml_tensor * w2; // ffn_down
struct ggml_tensor * w3; // ffn_up
// ff bias
struct ggml_tensor * b2; // ffn_down
struct ggml_tensor * b3; // ffn_up
};
struct llama_kv_cache {
@ -1055,6 +1084,7 @@ struct llama_model {
llama_vocab vocab;
struct ggml_tensor * tok_embeddings;
struct ggml_tensor * pos_embeddings;
struct ggml_tensor * output_norm;
struct ggml_tensor * output_norm_b;
@ -1566,7 +1596,7 @@ struct llama_model_loader {
// load LLaMA models
//
std::string llama_model_ftype_name(enum llama_ftype ftype) {
static std::string llama_model_ftype_name(enum llama_ftype ftype) {
if (ftype & LLAMA_FTYPE_GUESSED) {
return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
}
@ -1599,9 +1629,11 @@ std::string llama_model_ftype_name(enum llama_ftype ftype) {
static const char * llama_model_type_name(e_model type) {
switch (type) {
case MODEL_1B: return "1B";
case MODEL_3B: return "3B";
case MODEL_7B: return "7B";
case MODEL_13B: return "13B";
case MODEL_15B: return "15B";
case MODEL_30B: return "30B";
case MODEL_34B: return "34B";
case MODEL_40B: return "40B";
@ -1719,6 +1751,17 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_STARCODER:
{
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
switch (hparams.n_layer) {
case 24: model.type = e_model::MODEL_1B; break;
case 36: model.type = e_model::MODEL_3B; break;
case 42: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_15B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
default: (void)0;
};
@ -2172,6 +2215,85 @@ static void llm_load_tensors(
}
}
} break;
case LLM_ARCH_STARCODER:
{
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
// output
{
ggml_backend backend_norm;
ggml_backend backend_output;
if (n_gpu_layers > int(n_layer)) {
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
#else
backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
#endif // _WIN32
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
} else {
backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU;
}
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
if (backend_norm == GGML_BACKEND_GPU) {
vram_weights += ggml_nbytes(model.output_norm);
vram_weights += ggml_nbytes(model.output_norm_b);
}
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
vram_weights += ggml_nbytes(model.output);
}
}
const uint32_t n_ff = hparams.n_ff;
const int i_gpu_start = n_layer - n_gpu_layers;
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
if (backend == GGML_BACKEND_GPU) {
vram_weights +=
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) +
ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3);
}
}
} break;
default:
throw std::runtime_error("unknown architecture");
};
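Aside (not part of the diff): StarCoder uses multi-query attention, so the fused wqkv tensor above packs a full-width Q projection with narrow K and V projections, giving n_embd + 2*n_embd_gqa output columns. A tiny arithmetic sketch; the concrete numbers are illustrative placeholders, not values read from a model file:
#include <cstdint>
#include <cstdio>
int main() {
    // illustrative dimensions only
    const int64_t n_embd     = 6144; // hidden size
    const int64_t n_embd_gqa = 128;  // K/V width with a single KV head (MQA)
    printf("wqkv shape: {%lld, %lld}\n",
           (long long) n_embd, (long long)(n_embd + 2*n_embd_gqa)); // {6144, 6400}
    return 0;
}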
@ -3315,6 +3437,235 @@ static struct ggml_cgraph * llm_build_falcon(
return gf;
}
static struct ggml_cgraph * llm_build_starcoder(
llama_context & lctx,
const llama_token * tokens,
const float * embd,
int n_tokens,
int n_past) {
GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
const int N = n_tokens;
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & kv_self = lctx.kv_self;
GGML_ASSERT(!!kv_self.ctx);
const int64_t n_embd = hparams.n_embd;
const int64_t n_layer = hparams.n_layer;
const int64_t n_ctx = hparams.n_ctx;
const int64_t n_head = hparams.n_head;
const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_head = hparams.n_embd_head();
const int64_t n_embd_gqa = hparams.n_embd_gqa();
GGML_ASSERT(n_embd_head == hparams.n_rot);
const float norm_eps = hparams.f_norm_eps;
auto & buf_compute = lctx.buf_compute;
struct ggml_init_params params = {
/*.mem_size =*/ buf_compute.size,
/*.mem_buffer =*/ buf_compute.data,
/*.no_alloc =*/ false,
};
params.no_alloc = true;
struct ggml_context * ctx0 = ggml_init(params);
ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_tensor * cur;
struct ggml_tensor * token;
struct ggml_tensor * position;
struct ggml_tensor * inpL;
if (tokens) {
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
ggml_allocr_alloc(lctx.alloc, inp_tokens);
if (!ggml_allocr_is_measure(lctx.alloc)) {
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
}
ggml_set_name(inp_tokens, "inp_tokens");
token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
} else {
#ifdef GGML_USE_MPI
GGML_ASSERT(false && "not implemented");
#endif
token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
ggml_allocr_alloc(lctx.alloc, token);
if (!ggml_allocr_is_measure(lctx.alloc)) {
memcpy(token->data, embd, N * n_embd * ggml_element_size(inpL));
}
}
{
// Compute position embeddings.
struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
ggml_allocr_alloc(lctx.alloc, inp_positions);
if (!ggml_allocr_is_measure(lctx.alloc)) {
for (int i = 0; i < N; ++i) {
((int32_t *) inp_positions->data)[i] = n_past + i;
}
}
ggml_set_name(inp_positions, "inp_positions");
position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
}
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
ggml_allocr_alloc(lctx.alloc, KQ_scale);
if (!ggml_allocr_is_measure(lctx.alloc)) {
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
}
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
inpL = ggml_add(ctx0, token, position);
ggml_set_name(inpL, "inpL");
for (int il = 0; il < n_layer; ++il) {
{
// Norm
cur = ggml_norm(ctx0, inpL, norm_eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
}
{
// Self Attention
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd);
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
struct ggml_tensor * Qcur = tmpq;
struct ggml_tensor * Kcur = tmpk;
{
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
ggml_set_name(Vcur, "Vcur");
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
ggml_set_name(k, "k");
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
}
struct ggml_tensor * Q =
ggml_permute(ctx0,
ggml_cpy(ctx0,
Qcur,
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, N)),
0, 2, 1, 3);
ggml_set_name(Q, "Q");
struct ggml_tensor * K =
ggml_view_3d(ctx0, kv_self.k,
n_embd_head, n_past + N, n_head_kv,
ggml_element_size(kv_self.k)*n_embd_gqa,
ggml_element_size(kv_self.k)*n_embd_head,
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
ggml_set_name(K, "K");
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
ggml_set_name(KQ, "KQ");
// KQ_scaled = KQ / sqrt(n_embd_head)
// KQ_scaled shape [n_past + N, N, n_head, 1]
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
ggml_set_name(KQ_scaled, "KQ_scaled");
// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
ggml_set_name(KQ_masked, "KQ_masked");
// KQ = soft_max(KQ_masked)
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
ggml_set_name(KQ_soft_max, "KQ_soft_max");
// split cached V into n_head heads
struct ggml_tensor * V =
ggml_view_3d(ctx0, kv_self.v,
n_past + N, n_embd_head, n_head_kv,
ggml_element_size(kv_self.v)*n_ctx,
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
ggml_set_name(V, "V");
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
ggml_set_name(KQV, "KQV");
// KQV_merged = KQV.permute(0, 2, 1, 3)
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
ggml_set_name(KQV_merged, "KQV_merged");
// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ggml_cpy(ctx0,
KQV_merged,
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
ggml_set_name(cur, "KQV_merged_contiguous");
}
// Projection
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
// Add the input
cur = ggml_add(ctx0, cur, inpL);
struct ggml_tensor * inpFF = cur;
// FF
{
// Norm
{
cur = ggml_norm(ctx0, inpFF, norm_eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
}
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
// GELU activation
cur = ggml_gelu(ctx0, cur);
// Projection
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
}
inpL = ggml_add(ctx0, cur, inpFF);
}
// Output Norm
{
cur = ggml_norm(ctx0, inpL, norm_eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
}
ggml_set_name(cur, "result_norm");
cur = ggml_mul_mat(ctx0, model.output, cur);
ggml_set_name(cur, "result_output");
ggml_build_forward_expand(gf, cur);
ggml_free(ctx0);
return gf;
}
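Aside (not part of the diff): the feed-forward block above is the usual up-projection, GELU, down-projection with biases. A minimal scalar sketch of the tanh-based GELU approximation, with constants matching the approximation ggml uses as far as I can tell:
#include <cmath>
#include <cstdio>
// gelu(x) ~ 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3)))
static float gelu_tanh(float x) {
    const float sqrt_2_over_pi = 0.79788456080286535588f;
    return 0.5f*x*(1.0f + std::tanh(sqrt_2_over_pi*x*(1.0f + 0.044715f*x*x)));
}
int main() {
    printf("gelu(1.0) ~ %f\n", gelu_tanh(1.0f)); // ~0.841
    return 0;
}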
static struct ggml_cgraph * llama_build_graph(
llama_context & lctx,
const llama_token * tokens,
@ -3338,6 +3689,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
} break;
case LLM_ARCH_STARCODER:
{
result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past);
} break;
default:
GGML_ASSERT(false);
};
@ -3951,7 +4306,7 @@ struct llama_grammar_candidate {
// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
const char * src,
llama_partial_utf8 partial_start) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@ -5552,7 +5907,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
// TODO: after the GGUF PR, this likely won't work and needs to be updated
int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
static int llama_apply_lora_from_file_internal(
const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
) {
LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
const int64_t t_start_lora_us = ggml_time_us();
@ -6099,7 +6456,7 @@ struct llama_context * llama_new_context_with_model(
return ctx;
}
struct llama_context * llama_init_from_file(
static struct llama_context * llama_init_from_file(
const char * path_model,
struct llama_context_params params) {
struct llama_model * model = llama_load_model_from_file(path_model, params);
@ -6304,7 +6661,7 @@ struct llama_data_file_context : llama_data_context {
* llama_copy_state_data(ctx, &data_ctx);
*
*/
void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
// copy rng
{
std::stringstream rng_ss;
@ -6842,7 +7199,9 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
}
// For internal test use
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
) {
return ctx->model.tensors_by_name;
}

View file

@ -540,7 +540,9 @@ extern "C" {
struct ggml_tensor;
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
);
#endif // LLAMA_API_INTERNAL

View file

@ -13,7 +13,7 @@
typedef int codepoint;
std::string codepoint_to_utf8(codepoint cp) {
static std::string codepoint_to_utf8(codepoint cp) {
std::string result;
if (0x00 <= cp && cp <= 0x7f) {
result.push_back(cp);
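Aside (not part of the diff): a quick sanity sketch of what the helper is expected to produce, assuming the full multi-byte branches of codepoint_to_utf8 are visible in this translation unit; the byte values below are standard UTF-8:
#include <cassert>
#include <string>
static void check_codepoint_to_utf8() {
    assert(codepoint_to_utf8(0x41)   == "A");            // 1-byte ASCII
    assert(codepoint_to_utf8(0x00e9) == "\xc3\xa9");     // U+00E9, 2 bytes
    assert(codepoint_to_utf8(0x20ac) == "\xe2\x82\xac"); // U+20AC, 3 bytes
}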