Merge branch 'master' into concedo_experimental

# Conflicts:
#	CMakeLists.txt
#	README.md
Concedo 2023-04-18 17:38:10 +08:00
commit ac61e34d5f
13 changed files with 1032 additions and 125 deletions

convert-lora-to-ggml.py (new file, 124 lines)

@ -0,0 +1,124 @@
import json
import os
import re
import struct
import sys
from typing import Any, Dict, Sequence, TextIO

import torch

from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType

HF_SUBLAYER_TO_GGML = {
    "self_attn.q_proj": "attention.wq",
    "self_attn.k_proj": "attention.wk",
    "self_attn.v_proj": "attention.wv",
    "self_attn.o_proj": "attention.wo",
    "mlp.gate_proj": "feed_forward.w1",
    "mlp.down_proj": "feed_forward.w2",
    "mlp.up_proj": "feed_forward.w3",
    "input_layernorm": "attention_norm",
    "post_attention_layernorm": "ffn_norm",
    # "norm": "norm",
    # "embed_tokens": "tok_embeddings",
    # "lm_head": "output",
}


def translate_tensor_name(t: str) -> str:
    match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
    if match:
        nn = match.group(1)
        sub_layer = match.group(2)
        lora_type = match.group(3)
        sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
        if sub_layer_renamed is None:
            print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
            sys.exit(1)

        output_string = (
            f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
        )
        return output_string
    else:
        print(f"Error: unrecognized tensor {t}")
        sys.exit(1)


def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
    fout.write(b"ggla"[::-1])  # magic (ggml lora)
    fout.write(struct.pack("i", 1))  # file version
    fout.write(struct.pack("ii", params["r"], params["lora_alpha"]))


def write_tensor_header(
    self, name: str, shape: Sequence[int], data_type: DataType
) -> None:
    sname = name.encode("utf-8")
    fout.write(
        struct.pack(
            "iii",
            len(shape),
            len(sname),
            DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]],
        )
    )
    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
    fout.write(sname)
    fout.seek((fout.tell() + 31) & -32)


if len(sys.argv) != 2:
    print(f"Usage: python {sys.argv[0]} <path>")
    print(
        "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
    )
    sys.exit(1)

input_json = os.path.join(sys.argv[1], "adapter_config.json")
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

model = torch.load(input_model, map_location="cpu")

with open(input_json, "r") as f:
    params = json.load(f)

if params["peft_type"] != "LORA":
    print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
    sys.exit(1)

if params["fan_in_fan_out"] == True:
    print("Error: param fan_in_fan_out is not supported")
    sys.exit(1)

if params["bias"] is not None and params["bias"] != "none":
    print("Error: param bias is not supported")
    sys.exit(1)

# TODO: these seem to be layers that have been trained but without lora.
# doesn't seem widely used but eventually should be supported
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
    print("Error: param modules_to_save is not supported")
    sys.exit(1)

with open(output_path, "wb") as fout:
    fout.truncate()

    write_file_header(fout, params)
    for k, v in model.items():
        if k.endswith("lora_A.weight"):
            if v.dtype != torch.float16 and v.dtype != torch.float32:
                v = v.float()
            v = v.T
        else:
            v = v.float()

        t = v.numpy()
        tname = translate_tensor_name(k)
        print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
        write_tensor_header(fout, tname, t.shape, t.dtype)
        t.tofile(fout)

print(f"Converted {input_json} and {input_model} to {output_path}")


@ -1085,6 +1085,7 @@ def default_outfile(model_paths: List[Path], params: Params) -> Path:
namestr = {
GGMLFileType.AllF32: "f32",
GGMLFileType.MostlyF16: "f16",
GGMLFileType.MostlyQ4_0: "q4_0",
GGMLFileType.MostlyQ4_1: "q4_1",
GGMLFileType.PerLayerIsQ4_1: "q4_1",
}[params.file_type]
@ -1108,7 +1109,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outtype", choices=["f32", "f16", "q4_1"], help="output format (default: based on input)")
parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")


@ -139,6 +139,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.model = argv[i];
} else if (arg == "--lora") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_adapter = argv[i];
params.use_mmap = false;
} else if (arg == "--lora-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_base = argv[i];
} else if (arg == "-i" || arg == "--interactive") {
params.interactive = true;
} else if (arg == "--embedding") {
@ -242,6 +255,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
}
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");


@ -31,11 +31,12 @@ struct gpt_params {
std::string model = "models/lamma-7B/ggml-model.bin"; // model path
std::string prompt = "";
std::string input_prefix = ""; // string to prefix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs


@ -114,6 +114,17 @@ int main(int argc, char ** argv) {
}
}
if (!params.lora_adapter.empty()) {
int err = llama_apply_lora_from_file(ctx,
params.lora_adapter.c_str(),
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
params.n_threads);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
return 1;
}
}
// print system information
{
fprintf(stderr, "\n");


@ -134,6 +134,17 @@ int main(int argc, char ** argv) {
}
}
if (!params.lora_adapter.empty()) {
int err = llama_apply_lora_from_file(ctx,
params.lora_adapter.c_str(),
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
params.n_threads);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
return 1;
}
}
// print system information
{
fprintf(stderr, "\n");


@ -221,7 +221,7 @@ int main(int argc, char ** argv) {
break;
}
int j;
for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) i)) != 0; j++) {
for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) j)) != 0; j++) {
// find match
}
if (j < GGML_TYPE_COUNT) {

ggml.c (564 changed lines)

@ -1420,6 +1420,34 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
#endif
}
static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q4_0] = {
.dequantize_row_q = dequantize_row_q4_0,
.quantize_row_q = quantize_row_q4_0,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
.quantize_row_q_dot = quantize_row_q8_0,
.vec_dot_q = ggml_vec_dot_q4_0_q8_0,
},
[GGML_TYPE_Q4_1] = {
.dequantize_row_q = dequantize_row_q4_1,
.quantize_row_q = quantize_row_q4_1,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
.quantize_row_q_dot = quantize_row_q4_1,
.vec_dot_q = ggml_vec_dot_q4_1,
},
// TODO: GGML_TYPE_Q8_0
};
// For internal test use
quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
GGML_ASSERT(i < GGML_TYPE_COUNT);
return quantize_fns[i];
}
//
// simd mappings
//
@ -1977,33 +2005,187 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
}
#if __AVX512F__ && QK4_0 == 32
static inline __m512 dot_q4_0_oneblock_avx512(
static inline __m512i bytes_from_q4_0_twoblocks_avx512( const __m512i blocks ) {
// The 64 bytes of `blocks` contain two consecutive Q4_0 blocks loaded from memory:
// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
// |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32|
// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
// | :. =_ () [] <> () Zz Yy|
// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
// |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
// |Xx Ww Vv Uu Tt Ss Rr Qq Pp Oo Nn Mm Ll Kk Jj Ii Hh Gg Ff Ee Dd Cc Bb Aa |
// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
//
// Bytes 04..19 (block #0) and 24..39 (block #1) both contain 32 nibbles (4-bit unsigned integers).
// We have exactly 64 nibbles, so we want to place each nibble into a separate byte.
// Bytes 00..03 and 20..23 contain scales, which are irrelevant to this function.
// Bytes 40..63 are masked when loading the data, so they are zeroed out.
#ifdef __AVX512VBMI__
const __m512i byte_perm = _mm512_set_epi8(
39, 38, 39, 38, 37, 36, 37, 36, 35, 34, 35, 34, 33, 32, 33, 32,
31, 30, 31, 30, 29, 28, 29, 28, 27, 26, 27, 26, 25, 24, 25, 24,
19, 18, 19, 18, 17, 16, 17, 16, 15, 14, 15, 14, 13, 12, 13, 12,
11, 10, 11, 10, 9, 8, 9, 8, 7, 6, 7, 6, 5, 4, 5, 4
);
const __m512i permuted = _mm512_permutexvar_epi8( byte_perm, blocks );
// After applying VPERMB, `permuted` looks like this:
// +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
// |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32|
// +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
// |:. =_ :. =_ () [] () [] <> () <> () Zz Yy Zz Yy Xx Ww Xx Ww Vv Uu Vv Uu Tt Ss Tt Ss Rr Qq Rr Qq|
// +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
// |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
// +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
// |Pp Oo Pp Oo Nn Mm Nn Mm Ll Kk Ll Kk Jj Ii Jj Ii Hh Gg Hh Gg Ff Ee Ff Ee Dd Cc Dd Cc Bb Aa Bb Aa|
// +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
#else
const __m512i word_perm = _mm512_set_epi16(
19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12,
9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2
);
const __m512i permuted = _mm512_permutexvar_epi16( word_perm, blocks );
// This is the fallback path for CPUs that don't support VPERMB. Since we permute 16-bit groups only,
// VPERMB can be replaced with VPERMW. We could always use VPERMW, but at least on Tiger Lake and
// Ice Lake VPERMW followed by a right shift is quite noticeably slower than VPERMB.
#endif
// Shift every odd-numbered 16-bit group to the right by 4 bits.
const __mmask32 shift_mask = 0xaaaaaaaa;
const __m512i shifted = _mm512_mask_srai_epi16( permuted, shift_mask, permuted, 4 );
// After applying VPSRAW, `shifted` looks like this (the "empty" nibbles are filled with zeroes):
// +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
// |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
// +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
// | : .= :. =_ ( )[ () [] < >( <> () Z zY Zz Yy X xW Xx Ww V vU Vv Uu T tS Tt Ss R rQ Rr Qq
// +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
// |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
// +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
// | P pO Pp Oo N nM Nn Mm L lK Ll Kk J jI Jj Ii H hG Hh Gg F fE Ff Ee D dC Dd Cc B bA Bb Aa|
// +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
// Now we just need to zero out the higher nibble in each byte, and we're done.
const __m512i low_nibble_mask = _mm512_set1_epi8( 0xf );
return _mm512_and_si512( low_nibble_mask, shifted );
// The final result looks like this:
// +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
// |63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32|
// +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
// | : = . _ ( [ ) ] < ( > ) Z Y z y X W x w V U v u T S t s R Q r q|
// +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
// |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
// +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
// | P O p o N M n m L K l k J I j i H G h g F E f e D C d c B A b a|
// +-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
}
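// For reference (not part of this diff): a scalar sketch of the unpacking this function performs
// with VPERMB/VPERMW, VPSRAW and the low-nibble mask. Each packed Q4_0 byte holds two 4-bit
// quants, and they come out interleaved low-then-high, one nibble per output byte; the vector
// version additionally skips the scale bytes and processes two blocks at once.
static inline void unpack_q4_nibbles_scalar(const uint8_t * qs, uint8_t * out, int n_bytes) {
    for (int i = 0; i < n_bytes; ++i) {
        out[2*i + 0] =  qs[i]       & 0x0F; // low nibble of source byte i
        out[2*i + 1] = (qs[i] >> 4) & 0x0F; // high nibble of source byte i
    }
}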
static inline __m512 dot_q4_0_twoblocks_avx512(
__m512 acc,
const block_q4_0 * restrict x,
const block_q4_0 * restrict y,
int i
) {
// Compute combined scale for the block
__m512 d = _mm512_set1_ps( x[i].d * y[i].d );
// A pair of Q4_0 blocks spans 40 bytes, while an AVX-512 register has 64. The remaining 24 bytes
// can potentially be unaddressable, so we make sure to mask them out before the load, even though
// we don't use them at all. This might hurt the performance slightly, since the compiler is forced
// to use e.g. `VMOVDQU64 REG, MASK, [ADDR] + VPERMB ..., REG` instead of just `VPERMB ..., [ADDR]`.
const __mmask8 load_mask = 0x1f;
const __m512i blocks_0 = _mm512_maskz_loadu_epi64( load_mask, &x[i] );
const __m512i blocks_1 = _mm512_maskz_loadu_epi64( load_mask, &y[i] );
__m256i bx = bytesFromNibbles( x[i].qs );
__m256i by = bytesFromNibbles( y[i].qs );
// We want to multiply the scales, so we interpret both registers as 16 32-bit floats:
// +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
// | 15 | 14 | 13 | 12 | 11 | 10 | 09 | 08 | 07 | 06 | 05 | 04 | 03 | 02 | 01 | 00 |
// +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
// blocks_0_float
// +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
// | | | | | | | xx | xx | xx | xx | B | xx | xx | xx | xx | A |
// +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
// blocks_1_float
// +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
// | | | | | | | xx | xx | xx | xx | D | xx | xx | xx | xx | C |
// +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
const __m512 blocks_0_float = _mm512_castsi512_ps( blocks_0 );
const __m512 blocks_1_float = _mm512_castsi512_ps( blocks_1 );
// We absolutely shouldn't touch the floats marked with `xx`: they contain some
// random data, which might very well underflow. At least on Intel, this leads
// to a huge penalty that can't be ignored (easily 100x or more) unless you
// compile your code with something like `-ffast-math` to enable FTZ/DAZ flags.
// (and ggml can't assume that you do)...
const __mmask16 scale_mul_mask = 0x21;
#ifdef __clang__
// ...however, clang decides to optimize the multiplication mask away:
// https://godbolt.org/z/P8PqdsfvW
// gcc and MSVC do the sane thing. This horrible workaround forces clang to emit the mask.
__m512i scales;
__asm__(
"vmulps %1, %2, %0%{%3%}"
: "=v" ( scales )
: "vm" ( blocks_0_float ), "v" ( blocks_1_float ), "Yk" ( scale_mul_mask )
);
#else
const __m512 scales = _mm512_maskz_mul_ps( scale_mul_mask, blocks_0_float, blocks_1_float );
#endif
const __m512i scale_perm = _mm512_set_epi32(
5, 5, 5, 5, 5, 5, 5, 5,
0, 0, 0, 0, 0, 0, 0, 0
);
const __m512 permuted_scales = _mm512_permutexvar_ps( scale_perm, scales );
// After VMULPS and VPERMPS, `permuted_scales` looks like this:
// +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
// | 15 | 14 | 13 | 12 | 11 | 10 | 09 | 08 | 07 | 06 | 05 | 04 | 03 | 02 | 01 | 00 |
// +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
// | B*D| B*D| B*D| B*D| B*D| B*D| B*D| B*D| A*C| A*C| A*C| A*C| A*C| A*C| A*C| A*C|
// +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
const __m256i off = _mm256_set1_epi8( 8 );
bx = _mm256_sub_epi8( bx, off );
by = _mm256_sub_epi8( by, off );
const __m512i bytes_0 = bytes_from_q4_0_twoblocks_avx512( blocks_0 );
const __m512i bytes_1 = bytes_from_q4_0_twoblocks_avx512( blocks_1 );
// Sign-extend 16 signed bytes into int16_t
__m512i x32 = _mm512_cvtepi8_epi16( bx );
__m512i y32 = _mm512_cvtepi8_epi16( by );
// Compute products of int16_t integers, add pairwise
__m512i i64 = _mm512_madd_epi16( x32, y32 );
// Now we want to compute dot products of 4-element byte vectors and store them in
// 32-bit integers. That is (only one 4-element vector is shown for clarity):
// +----+----+----+----+
// ... | 03 | 02 | 01 | 00 |
// +----+----+----+----+
// bytes_0
// +----+----+----+----+
// ... | D | C | B | A |
// +----+----+----+----+
// bytes_1
// +----+----+----+----+
// ... | H | G | F | E |
// +----+----+----+----+
// final_res_int
// +----+----+----+----+
// ... | A*E+B*F+C*G+D*H |
// +----+----+----+----+
const __m512i plus_8 = _mm512_set1_epi8( 8 );
const __m512i bytes_1_minus_8 = _mm512_sub_epi8( bytes_1, plus_8 );
// Convert int32_t to float
__m512 p = _mm512_cvtepi32_ps( i64 );
// Apply the scale, and accumulate
return _mm512_fmadd_ps( d, p, acc );
#ifdef __AVX512VNNI__
// We have VPDPBUSDS in AVX512-VNNI, which does exactly what we want, but with a catch:
// the *left* operand is supposed to be unsigned, while Q4_0 quantization subtracts 8
// from each nibble, so they can be negative. So, instead of `(bytes_0 - 8) * (bytes_1 - 8)`,
// we compute `bytes_0 * (bytes_1 - 8) + bytes_1 * (-8) + 64`. VPDPBUSDS uses an accumulator,
// which means we only need 2 instructions.
const __m512i dot_init = _mm512_set1_epi32( 4 * 64 );
const __m512i minus_8 = _mm512_set1_epi8( -8 );
const __m512i prod_0 = _mm512_dpbusds_epi32( dot_init, bytes_1, minus_8 );
const __m512i final_res_int = _mm512_dpbusds_epi32( prod_0, bytes_0, bytes_1_minus_8 );
#else
// As a fallback, we have VPMADDUBSW in AVX512-BW, which uses 16-bit products instead of 32-bit ones.
// It has the same catch as VPDPBUSDS: the left operand should be unsigned.
// This is essentially the AVX-512 version of the AVX-2 trick used by GH user Const-me
// ref: https://gist.github.com/Const-me/4d30e1fc767ab314596e16e90f53b6f4#file-matmultest-cpp-L119
const __m512i one = _mm512_set1_epi16( 1 );
const __m512i prod_0 = _mm512_maddubs_epi16( bytes_0, bytes_1_minus_8 );
const __m512i prod_1 = _mm512_maddubs_epi16( plus_8, bytes_1_minus_8 );
const __m512i diff = _mm512_sub_epi16( prod_0, prod_1 );
const __m512i final_res_int = _mm512_madd_epi16( diff, one );
#endif
// Finally, we multiply the permuted scales and the 32-bit dot products, then accumulate.
const __m512 final_res_float = _mm512_cvtepi32_ps( final_res_int );
return _mm512_fmadd_ps( permuted_scales, final_res_float, acc );
}
#endif
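For reference, the rearrangement used in the VNNI path above follows from the identity (b0 - 8) * (b1 - 8) = b0 * (b1 - 8) - 8 * b1 + 64; since VPDPBUSDS folds four byte products into each 32-bit lane, the constant term contributes 4 * 64 per lane, which is exactly the dot_init value.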
@ -2135,25 +2317,26 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
__m512 acc0 = _mm512_setzero_ps();
__m512 acc1 = _mm512_setzero_ps();
const int superblock_size = 8;
const int superblock_size = 16;
const int superblock_count = nb / superblock_size;
for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) {
int i = superblock_ix * superblock_size;
acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+0 );
acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+1 );
acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+2 );
acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+3 );
acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+4 );
acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+5 );
acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+6 );
acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+7 );
acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+0 );
acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+2 );
acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+4 );
acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+6 );
acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+8 );
acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+10 );
acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+12 );
acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+14 );
}
// Remainders
for (int i = superblock_count * superblock_size; i < nb; ++i) {
acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i );
for (int i = superblock_count * superblock_size; i < nb; i += 2) {
acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i );
}
// Horizontal sum of all lanes of the accumulator
@ -5433,6 +5616,26 @@ static void ggml_compute_forward_dup_f16(
}
}
}
} else if (dst->type == GGML_TYPE_Q4_0 || dst->type == GGML_TYPE_Q4_1) {
quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
size_t id = 0;
uint8_t * dst_ptr = (uint8_t *) dst->data;
size_t dst_row_size = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
float * src0_f32 = (float *) params->wdata;
for (int i03 = 0; i03 < ne03; i03++) {
for (int i02 = 0; i02 < ne02; i02++) {
for (int i01 = 0; i01 < ne01; i01++) {
const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
// convert to f32 and quantize
for (int i00 = 0; i00 < ne00; i00++) {
src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]);
}
quantize_row_q(src0_f32, dst_ptr + id, ne00);
id += dst_row_size;
}
}
}
} else {
GGML_ASSERT(false); // TODO: implement
}
@ -5625,6 +5828,21 @@ static void ggml_compute_forward_dup_f32(
}
}
}
} else if (dst->type == GGML_TYPE_Q4_0 || dst->type == GGML_TYPE_Q4_1) {
quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
size_t id = 0;
uint8_t * dst_ptr = (uint8_t *) dst->data;
size_t dst_row_size = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
for (int i03 = 0; i03 < ne03; i03++) {
for (int i02 = 0; i02 < ne02; i02++) {
for (int i01 = 0; i01 < ne01; i01++) {
const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
quantize_row_q(src0_ptr, dst_ptr + id, ne00);
id += dst_row_size;
}
}
}
} else {
GGML_ASSERT(false); // TODO: implement
}
@ -5813,6 +6031,212 @@ static void ggml_compute_forward_add_f32(
}
}
static void ggml_compute_forward_add_f16_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int ith = params->ith;
const int nth = params->nth;
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const size_t nb00 = src0->nb[0];
const size_t nb01 = src0->nb[1];
const size_t nb10 = src1->nb[0];
const size_t nb11 = src1->nb[1];
const size_t nb0 = dst->nb[0];
const size_t nb1 = dst->nb[1];
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F16);
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
if (nb10 == sizeof(float)) {
for (int j = ith; j < n; j += nth) {
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1);
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
for (int i = 0; i < nc; i++) {
float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10);
dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr);
}
}
}
else {
// src1 is not contiguous
GGML_ASSERT(false);
}
}
static void ggml_compute_forward_add_f16_f16(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int ith = params->ith;
const int nth = params->nth;
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const size_t nb00 = src0->nb[0];
const size_t nb01 = src0->nb[1];
const size_t nb10 = src1->nb[0];
const size_t nb11 = src1->nb[1];
const size_t nb0 = dst->nb[0];
const size_t nb1 = dst->nb[1];
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F16);
GGML_ASSERT(dst->type == GGML_TYPE_F16);
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
if (nb10 == sizeof(ggml_fp16_t)) {
for (int j = ith; j < n; j += nth) {
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1);
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
for (int i = 0; i < nc; i++) {
ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + j*nb11 + i*nb10);
dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(*src1_ptr));
}
}
}
else {
// src1 is not contiguous
GGML_ASSERT(false);
}
}
static void ggml_compute_forward_add_q_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
//const int64_t ne10 = src1->ne[0];
//const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
//const int64_t ne0 = dst->ne[0];
//const int64_t ne1 = dst->ne[1];
const int64_t ne2 = dst->ne[2];
const int64_t ne3 = dst->ne[3];
const int nb00 = src0->nb[0];
const int nb01 = src0->nb[1];
const int nb02 = src0->nb[2];
const int nb03 = src0->nb[3];
const int nb10 = src1->nb[0];
const int nb11 = src1->nb[1];
const int nb12 = src1->nb[2];
const int nb13 = src1->nb[3];
const int nb0 = dst->nb[0];
const int nb1 = dst->nb[1];
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
const int ith = params->ith;
const int nth = params->nth;
GGML_ASSERT(ne02 == ne12);
GGML_ASSERT(ne03 == ne13);
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
const enum ggml_type type = src0->type;
dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]);
GGML_ASSERT(nb10 == sizeof(float));
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
GGML_ASSERT(src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1);
GGML_ASSERT(dst->type == src0->type);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
// total rows in src0
const int nr = ne01*ne02*ne03;
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
float * wdata = (float*) params->wdata + ne00 * ith;
for (int ir = ir0; ir < ir1; ++ir) {
// src0 indices
const int i03 = ir/(ne02*ne01);
const int i02 = (ir - i03*ne02*ne01)/ne01;
const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
// src1 and dst are same shape as src0 => same indices
const int i13 = i03;
const int i12 = i02;
const int i11 = i01;
const int i3 = i03;
const int i2 = i02;
const int i1 = i01;
void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb0));
assert(ne00 % 32 == 0);
// unquantize row from src0 to temp buffer
dequantize_row_q(src0_row, wdata, ne00);
// add src1
ggml_vec_acc_f32(ne00, wdata, src1_row);
// quantize row to dst
quantize_row_q(wdata, dst_row, ne00);
}
}
static void ggml_compute_forward_add(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
@ -5823,6 +6247,23 @@ static void ggml_compute_forward_add(
{
ggml_compute_forward_add_f32(params, src0, src1, dst);
} break;
case GGML_TYPE_F16:
{
if (src1->type == GGML_TYPE_F16) {
ggml_compute_forward_add_f16_f16(params, src0, src1, dst);
}
else if (src1->type == GGML_TYPE_F32) {
ggml_compute_forward_add_f16_f32(params, src0, src1, dst);
}
else {
GGML_ASSERT(false);
}
} break;
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
{
ggml_compute_forward_add_q_f32(params, src0, src1, dst);
} break;
default:
{
GGML_ASSERT(false);
@ -7104,30 +7545,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
//}
}
static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q4_0] = {
.dequantize_row_q = dequantize_row_q4_0,
.quantize_row_q = quantize_row_q4_0,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
.quantize_row_q_dot = quantize_row_q8_0,
.vec_dot_q = ggml_vec_dot_q4_0_q8_0,
},
[GGML_TYPE_Q4_1] = {
.dequantize_row_q = dequantize_row_q4_1,
.quantize_row_q = quantize_row_q4_1,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
.quantize_row_q_dot = quantize_row_q4_1,
.vec_dot_q = ggml_vec_dot_q4_1,
},
// TODO: GGML_TYPE_Q8_0
};
// For internal test use
quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
GGML_ASSERT(i < GGML_TYPE_COUNT);
return quantize_fns[i];
}
static void ggml_compute_forward_mul_mat_q_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
@ -7909,11 +8326,11 @@ static void ggml_compute_forward_rope_f16(
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = ggml_fp16_to_fp32(src[0]);
const float x1 = ggml_fp16_to_fp32(src[1]);
const float x0 = GGML_FP16_TO_FP32(src[0]);
const float x1 = GGML_FP16_TO_FP32(src[1]);
dst_data[0] = ggml_fp32_to_fp16(x0*cos_theta - x1*sin_theta);
dst_data[1] = ggml_fp32_to_fp16(x0*sin_theta + x1*cos_theta);
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
}
}
}
@ -9989,13 +10406,29 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
struct ggml_tensor * node = cgraph->nodes[i];
switch (node->op) {
case GGML_OP_CPY:
case GGML_OP_DUP:
{
node->n_tasks = 1;
size_t cur = 0;
if (node->type == GGML_TYPE_Q4_0 || node->type == GGML_TYPE_Q4_1) {
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0];
}
work_size = MAX(work_size, cur);
} break;
case GGML_OP_ADD:
{
node->n_tasks = n_threads;
size_t cur = 0;
if (node->src0->type == GGML_TYPE_Q4_0 || node->src0->type == GGML_TYPE_Q4_1) {
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads;
}
work_size = MAX(work_size, cur);
} break;
case GGML_OP_SUB:
case GGML_OP_MUL:
@ -10076,7 +10509,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
{
node->n_tasks = n_threads;
} break;
case GGML_OP_CPY:
case GGML_OP_CONT:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
@ -11310,6 +11742,22 @@ int ggml_cpu_has_avx512(void) {
#endif
}
int ggml_cpu_has_avx512_vbmi(void) {
#if defined(__AVX512VBMI__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_avx512_vnni(void) {
#if defined(__AVX512VNNI__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_fma(void) {
#if defined(__FMA__)
return 1;

ggml.h (8 changed lines)

@ -430,6 +430,12 @@ struct ggml_tensor * ggml_add(
struct ggml_tensor * a,
struct ggml_tensor * b);
struct ggml_tensor * ggml_add_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
struct ggml_tensor * ggml_sub(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -808,6 +814,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
int ggml_cpu_has_avx(void);
int ggml_cpu_has_avx2(void);
int ggml_cpu_has_avx512(void);
int ggml_cpu_has_avx512_vbmi(void);
int ggml_cpu_has_avx512_vnni(void);
int ggml_cpu_has_fma(void);
int ggml_cpu_has_neon(void);
int ggml_cpu_has_arm_fma(void);

llama.cpp (349 changed lines)

@ -1,6 +1,8 @@
// Defines fileno on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#include <cstdint>
#include <cstdio>
#endif
#include "llama_util.h"
@ -42,35 +44,51 @@ static const size_t MB = 1024*1024;
// TODO: dynamically determine these sizes
// needs modifications in ggml
static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
{ MODEL_7B, 512ull*MB },
{ MODEL_13B, 512ull*MB },
{ MODEL_30B, 512ull*MB },
{ MODEL_65B, 512ull*MB },
};
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
{
static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
{ MODEL_7B, 512ull * MB },
{ MODEL_13B, 512ull * MB },
{ MODEL_30B, 512ull * MB },
{ MODEL_65B, 512ull * MB },
};
return _MEM_REQ_SCRATCH0;
}
static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
{ MODEL_7B, 512ull*MB },
{ MODEL_13B, 512ull*MB },
{ MODEL_30B, 512ull*MB },
{ MODEL_65B, 512ull*MB },
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
{
static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
{ MODEL_7B, 512ull * MB },
{ MODEL_13B, 512ull * MB },
{ MODEL_30B, 512ull * MB },
{ MODEL_65B, 512ull * MB },
};
return _MEM_REQ_SCRATCH1;
};
// 2*n_embd*n_ctx*n_layer*sizeof(float16)
static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
{ MODEL_7B, 1026ull*MB },
{ MODEL_13B, 1608ull*MB },
{ MODEL_30B, 3124ull*MB },
{ MODEL_65B, 5120ull*MB },
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
{
static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
{ MODEL_7B, 1026ull * MB },
{ MODEL_13B, 1608ull * MB },
{ MODEL_30B, 3124ull * MB },
{ MODEL_65B, 5120ull * MB },
};
return _MEM_REQ_KV_SELF;
};
// this is mostly needed for temporary mul_mat buffers to dequantize the data
// not actually needed if BLAS is disabled
static const std::map<e_model, size_t> MEM_REQ_EVAL = {
{ MODEL_7B, 768ull*MB },
{ MODEL_13B, 1024ull*MB },
{ MODEL_30B, 1280ull*MB },
{ MODEL_65B, 1536ull*MB },
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
{
static std::map<e_model, size_t> _MEM_REQ_EVAL = {
{ MODEL_7B, 768ull * MB },
{ MODEL_13B, 1024ull * MB },
{ MODEL_30B, 1280ull * MB },
{ MODEL_65B, 1536ull * MB },
};
return _MEM_REQ_EVAL;
};
// default hparams (LLaMA 7B)
@ -624,6 +642,7 @@ struct llama_model_loader {
throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
}
return get_tensor_for(lt);
}
@ -906,13 +925,13 @@ static void llama_model_load_internal(
const size_t mem_required =
ctx_size +
mmapped_size +
MEM_REQ_SCRATCH0.at(model.type) +
MEM_REQ_SCRATCH1.at(model.type) +
MEM_REQ_EVAL.at (model.type);
MEM_REQ_SCRATCH0().at(model.type) +
MEM_REQ_SCRATCH1().at(model.type) +
MEM_REQ_EVAL().at(model.type);
// this is the memory required by one llama_state
const size_t mem_required_state =
scale*MEM_REQ_KV_SELF.at(model.type);
scale*MEM_REQ_KV_SELF().at(model.type);
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@ -1739,10 +1758,10 @@ struct llama_context * llama_init_from_file(
ctx->embedding.resize(hparams.n_embd);
}
ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
}
return ctx;
@ -1765,6 +1784,254 @@ int llama_model_quantize(
}
}
int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
auto & model = ctx->model;
const int64_t t_start_lora_us = ggml_time_us();
auto fin = std::ifstream(path_lora, std::ios::binary);
if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
return 1;
}
// verify magic and version
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
if (magic != 'ggla') {
fprintf(stderr, "%s: bad file magic\n", __func__);
return 1;
}
uint32_t format_version;
fin.read((char *) &format_version, sizeof(format_version));
if (format_version != 1) {
fprintf(stderr, "%s: unsupported file version\n", __func__ );
return 1;
}
}
int32_t lora_r;
int32_t lora_alpha;
fin.read((char *) &lora_r, sizeof(lora_r));
fin.read((char *) &lora_alpha, sizeof(lora_alpha));
float scaling = (float)lora_alpha / (float)lora_r;
fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
// create a temporary ggml context to store the lora tensors
// todo: calculate size from biggest possible tensor
std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
struct ggml_init_params params;
params.mem_size = lora_buf.size();
params.mem_buffer = lora_buf.data();
params.no_alloc = false;
ggml_context * lora_ctx = ggml_init(params);
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
// create a name -> tensor map of the model to accelerate lookups
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
for (auto & kv: model.tensors_by_name) {
model_tensors.insert(kv);
}
// load base model
std::unique_ptr<llama_model_loader> model_loader;
ggml_context * base_ctx = NULL;
llama_buffer base_buf;
if (path_base_model) {
fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
size_t ctx_size, mmapped_size;
model_loader->calc_sizes(&ctx_size, &mmapped_size);
base_buf.resize(ctx_size);
ggml_init_params base_params;
base_params.mem_size = base_buf.size;
base_params.mem_buffer = base_buf.addr;
base_params.no_alloc = model_loader->use_mmap;
base_ctx = ggml_init(base_params);
model_loader->ggml_ctx = base_ctx;
// maybe this should in llama_model_loader
if (model_loader->use_mmap) {
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
}
}
// read tensors and apply
bool warned = false;
int n_tensors = 0;
while (true) {
int32_t n_dims;
int32_t length;
int32_t ftype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
if (fin.eof()) {
break;
}
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
}
std::string name(length, 0);
fin.read(&name[0], length);
// check for lora suffix and get the type of tensor
const std::string lora_suffix = ".lora";
size_t pos = name.rfind(lora_suffix);
if (pos == std::string::npos) {
fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
return 1;
}
std::string lora_type = name.substr(pos + lora_suffix.length());
std::string base_name = name;
base_name.erase(pos);
// fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
if (model_tensors.find(base_name.data()) == model_tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
return 1;
}
// create ggml tensor
ggml_type wtype;
switch (ftype) {
case 0: wtype = GGML_TYPE_F32; break;
case 1: wtype = GGML_TYPE_F16; break;
default:
{
fprintf(stderr, "%s: invalid tensor data type '%d'\n",
__func__, ftype);
return false;
}
}
ggml_tensor* lora_tensor;
if (n_dims == 2) {
lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
}
else {
fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
return 1;
}
// load tensor data
size_t offset = fin.tellg();
size_t tensor_data_size = ggml_nbytes(lora_tensor);
offset = (offset + 31) & -32;
fin.seekg(offset);
fin.read((char*)lora_tensor->data, tensor_data_size);
lora_tensors[name] = lora_tensor;
// check if we have both A and B tensors and apply
if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
ggml_tensor * dest_t = model_tensors[base_name];
ggml_tensor * base_t;
if (model_loader) {
// load from base model
if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
return 1;
}
size_t idx = model_loader->tensors_map.name_to_idx[base_name];
llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
lt.data = (uint8_t *) lt.ggml_tensor->data;
model_loader->load_data_for(lt);
lt.ggml_tensor->data = lt.data;
}
else {
base_t = dest_t;
}
if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
if (!warned) {
fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
"use a f16 or f32 base model with --lora-base\n", __func__);
warned = true;
}
}
ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
" are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
return 1;
}
// w = w + BA*s
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
if (scaling != 1.0f) {
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
BA = ggml_scale(lora_ctx, BA, scale_tensor);
}
ggml_tensor * r;
if (base_t == dest_t) {
r = ggml_add_inplace(lora_ctx, dest_t, BA);
}
else {
r = ggml_add(lora_ctx, base_t, BA);
r = ggml_cpy(lora_ctx, r, dest_t);
}
struct ggml_cgraph gf = ggml_build_forward(r);
gf.n_threads = n_threads;
ggml_graph_compute(lora_ctx, &gf);
// we won't need these tensors again, reset the context to save memory
ggml_free(lora_ctx);
lora_ctx = ggml_init(params);
lora_tensors.clear();
n_tensors++;
if (n_tensors % 4 == 0)
fprintf(stderr, ".");
}
}
// TODO: this should be in a destructor, it will leak on failure
ggml_free(lora_ctx);
if (base_ctx) {
ggml_free(base_ctx);
}
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
return 0;
}
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
try {
return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
} catch (const std::string & err) {
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
return 1;
}
}
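// For reference (not part of this diff): the per-tensor update computed in the loop above is the
// standard LoRA merge,
//     W <- W + (lora_alpha / lora_r) * B A,
// with loraA and loraB read from the adapter file and the scaling taken from its header. When a
// separate base model is given via --lora-base, W comes from that base model and the result is
// copied into the loaded model's tensor.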
// Returns the KV cache that will contain the context for the
// ongoing prediction with the model.
const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
@ -1922,18 +2189,20 @@ const char * llama_print_system_info(void) {
static std::string s;
s = "";
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
return s.c_str();
}

llama.h (12 changed lines)

@ -96,6 +96,18 @@ extern "C" {
const char * fname_out,
enum llama_ftype ftype);
// Apply a LoRA adapter to a loaded model
// path_base_model is the path to a higher quality model to use as a base for
// the layers modified by the adapter. Can be NULL to use the current loaded model.
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
// will be applied on top of the previous one
// Returns 0 on success
LLAMA_API int llama_apply_lora_from_file(
struct llama_context * ctx,
const char * path_lora,
const char * path_base_model,
int n_threads);
// Returns the KV cache that will contain the context for the
// ongoing prediction with the model.
LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
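
A minimal usage sketch of the new entry point, mirroring the calls added to the two example main() functions earlier in this commit; the model path, adapter path and thread count below are placeholders.

    struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-f16.bin",
                                                       llama_context_default_params());
    if (llama_apply_lora_from_file(ctx, "ggml-adapter-model.bin",
                                   /*path_base_model*/ NULL, /*n_threads*/ 4) != 0) {
        fprintf(stderr, "failed to apply lora adapter\n");
    }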


@ -168,7 +168,7 @@ struct llama_mmap {
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file) {
llama_mmap(struct llama_file * file, bool prefetch = true) {
size = file->size;
int fd = fileno(file->fp);
int flags = MAP_SHARED;
@ -180,10 +180,12 @@ struct llama_mmap {
throw format("mmap failed: %s", strerror(errno));
}
// Advise the kernel to preload the mapped memory
if (madvise(addr, file->size, MADV_WILLNEED)) {
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
strerror(errno));
if (prefetch) {
// Advise the kernel to preload the mapped memory
if (madvise(addr, file->size, MADV_WILLNEED)) {
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
strerror(errno));
}
}
}
@ -193,14 +195,13 @@ struct llama_mmap {
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file) {
llama_mmap(struct llama_file * file, bool prefetch = true) {
size = file->size;
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
DWORD error = GetLastError();
CloseHandle(hFile);
if (hMapping == NULL) {
throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@ -215,13 +216,15 @@ struct llama_mmap {
}
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
// Advise the kernel to preload the mapped memory
WIN32_MEMORY_RANGE_ENTRY range;
range.VirtualAddress = addr;
range.NumberOfBytes = (SIZE_T)size;
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
if (prefetch) {
// Advise the kernel to preload the mapped memory
WIN32_MEMORY_RANGE_ENTRY range;
range.VirtualAddress = addr;
range.NumberOfBytes = (SIZE_T)size;
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
#else
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")


@ -5,13 +5,17 @@
#include <map>
#include <vector>
static const std::map<std::string, std::vector<llama_token>> k_tests = {
{ "Hello World", { 1, 10994, 2787, }, },
{ " Hello World", { 1, 15043, 2787, }, },
{ " Hello World!", { 1, 15043, 2787, 29991, }, },
{ " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
{ "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
{ "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
static const std::map<std::string, std::vector<llama_token>> & k_tests()
{
static std::map<std::string, std::vector<llama_token>> _k_tests = {
{ "Hello World", { 1, 10994, 2787, }, },
{ " Hello World", { 1, 15043, 2787, }, },
{ " Hello World!", { 1, 15043, 2787, 29991, }, },
{ " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
{ "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
{ "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
};
return _k_tests;
};
int main(int argc, char **argv) {
@ -47,7 +51,7 @@ int main(int argc, char **argv) {
return 2;
}
for (const auto & test_kv : k_tests) {
for (const auto & test_kv : k_tests()) {
std::vector<llama_token> res(test_kv.first.size());
const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
res.resize(n);