Merge branch 'master' into concedo_experimental
# Conflicts: # .github/workflows/build.yml # Makefile # examples/quantize/quantize.cpp # ggml.c # pocs/vdot/vdot.cpp # scripts/build-info.cmake # scripts/build-info.h.in # scripts/build-info.sh # tests/test-opt.cpp # tests/test-quantize-fns.cpp # tests/test-quantize-perf.cpp # tests/test-sampling.cpp
This commit is contained in:
commit
c96fb3984d
39 changed files with 1098 additions and 340 deletions
|
@ -212,6 +212,7 @@ if (LLAMA_ALL_WARNINGS)
|
|||
-Wextra
|
||||
-Wpedantic
|
||||
-Wcast-qual
|
||||
-Wmissing-declarations
|
||||
-Wno-unused-function
|
||||
-Wno-multichar
|
||||
)
|
||||
|
@ -230,7 +231,7 @@ if (LLAMA_ALL_WARNINGS)
|
|||
|
||||
endif()
|
||||
|
||||
if (MSVC)
|
||||
if (WIN32)
|
||||
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
|
|
|
@ -78,7 +78,7 @@ int32_t get_num_physical_cores() {
|
|||
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
|
||||
}
|
||||
|
||||
void process_escapes(std::string& input) {
|
||||
static void process_escapes(std::string& input) {
|
||||
std::size_t input_len = input.length();
|
||||
std::size_t output_idx = 0;
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#define LOG_NO_FILE_LINE_FUNCTION
|
||||
#include "log.h"
|
||||
|
@ -20,8 +21,13 @@
|
|||
#define DIRECTORY_SEPARATOR '/'
|
||||
#endif // _WIN32
|
||||
|
||||
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
|
||||
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", ##__VA_ARGS__); exit(1); } while (0)
|
||||
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
|
||||
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
||||
|
||||
#define print_build_info() do { \
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
|
||||
} while(0)
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
|
|
|
@ -158,7 +158,7 @@ namespace console {
|
|||
}
|
||||
}
|
||||
|
||||
char32_t getchar32() {
|
||||
static char32_t getchar32() {
|
||||
#if defined(_WIN32)
|
||||
HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
|
||||
wchar_t high_surrogate = 0;
|
||||
|
@ -212,7 +212,7 @@ namespace console {
|
|||
#endif
|
||||
}
|
||||
|
||||
void pop_cursor() {
|
||||
static void pop_cursor() {
|
||||
#if defined(_WIN32)
|
||||
if (hConsole != NULL) {
|
||||
CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
|
||||
|
@ -233,7 +233,7 @@ namespace console {
|
|||
putc('\b', out);
|
||||
}
|
||||
|
||||
int estimateWidth(char32_t codepoint) {
|
||||
static int estimateWidth(char32_t codepoint) {
|
||||
#if defined(_WIN32)
|
||||
(void)codepoint;
|
||||
return 1;
|
||||
|
@ -242,7 +242,7 @@ namespace console {
|
|||
#endif
|
||||
}
|
||||
|
||||
int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
|
||||
static int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
|
||||
#if defined(_WIN32)
|
||||
CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
|
||||
if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
|
||||
|
@ -303,7 +303,7 @@ namespace console {
|
|||
#endif
|
||||
}
|
||||
|
||||
void replace_last(char ch) {
|
||||
static void replace_last(char ch) {
|
||||
#if defined(_WIN32)
|
||||
pop_cursor();
|
||||
put_codepoint(&ch, 1, 1);
|
||||
|
@ -312,7 +312,7 @@ namespace console {
|
|||
#endif
|
||||
}
|
||||
|
||||
void append_utf8(char32_t ch, std::string & out) {
|
||||
static void append_utf8(char32_t ch, std::string & out) {
|
||||
if (ch <= 0x7F) {
|
||||
out.push_back(static_cast<unsigned char>(ch));
|
||||
} else if (ch <= 0x7FF) {
|
||||
|
@ -333,7 +333,7 @@ namespace console {
|
|||
}
|
||||
|
||||
// Helper function to remove the last UTF-8 character from a string
|
||||
void pop_back_utf8_char(std::string & line) {
|
||||
static void pop_back_utf8_char(std::string & line) {
|
||||
if (line.empty()) {
|
||||
return;
|
||||
}
|
||||
|
@ -349,7 +349,7 @@ namespace console {
|
|||
line.erase(pos);
|
||||
}
|
||||
|
||||
bool readline_advanced(std::string & line, bool multiline_input) {
|
||||
static bool readline_advanced(std::string & line, bool multiline_input) {
|
||||
if (out != stdout) {
|
||||
fflush(stdout);
|
||||
}
|
||||
|
@ -452,7 +452,7 @@ namespace console {
|
|||
return has_more;
|
||||
}
|
||||
|
||||
bool readline_simple(std::string & line, bool multiline_input) {
|
||||
static bool readline_simple(std::string & line, bool multiline_input) {
|
||||
#if defined(_WIN32)
|
||||
std::wstring wline;
|
||||
if (!std::getline(std::wcin, wline)) {
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
namespace grammar_parser {
|
||||
// NOTE: assumes valid utf8 (but checks for overrun)
|
||||
// copied from llama.cpp
|
||||
std::pair<uint32_t, const char *> decode_utf8(const char * src) {
|
||||
static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
|
||||
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
||||
uint8_t first_byte = static_cast<uint8_t>(*src);
|
||||
uint8_t highbits = first_byte >> 4;
|
||||
|
@ -24,19 +24,19 @@ namespace grammar_parser {
|
|||
return std::make_pair(value, pos);
|
||||
}
|
||||
|
||||
uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
|
||||
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
|
||||
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
||||
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
|
||||
return result.first->second;
|
||||
}
|
||||
|
||||
uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
|
||||
static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
|
||||
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
||||
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
|
||||
return next_id;
|
||||
}
|
||||
|
||||
void add_rule(
|
||||
static void add_rule(
|
||||
parse_state & state,
|
||||
uint32_t rule_id,
|
||||
const std::vector<llama_grammar_element> & rule) {
|
||||
|
@ -46,11 +46,11 @@ namespace grammar_parser {
|
|||
state.rules[rule_id] = rule;
|
||||
}
|
||||
|
||||
bool is_word_char(char c) {
|
||||
static bool is_word_char(char c) {
|
||||
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
|
||||
}
|
||||
|
||||
std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
|
||||
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
|
||||
const char * pos = src;
|
||||
const char * end = src + size;
|
||||
uint32_t value = 0;
|
||||
|
@ -73,7 +73,7 @@ namespace grammar_parser {
|
|||
return std::make_pair(value, pos);
|
||||
}
|
||||
|
||||
const char * parse_space(const char * src, bool newline_ok) {
|
||||
static const char * parse_space(const char * src, bool newline_ok) {
|
||||
const char * pos = src;
|
||||
while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
|
||||
(newline_ok && (*pos == '\r' || *pos == '\n'))) {
|
||||
|
@ -88,7 +88,7 @@ namespace grammar_parser {
|
|||
return pos;
|
||||
}
|
||||
|
||||
const char * parse_name(const char * src) {
|
||||
static const char * parse_name(const char * src) {
|
||||
const char * pos = src;
|
||||
while (is_word_char(*pos)) {
|
||||
pos++;
|
||||
|
@ -99,7 +99,7 @@ namespace grammar_parser {
|
|||
return pos;
|
||||
}
|
||||
|
||||
std::pair<uint32_t, const char *> parse_char(const char * src) {
|
||||
static std::pair<uint32_t, const char *> parse_char(const char * src) {
|
||||
if (*src == '\\') {
|
||||
switch (src[1]) {
|
||||
case 'x': return parse_hex(src + 2, 2);
|
||||
|
@ -129,7 +129,7 @@ namespace grammar_parser {
|
|||
uint32_t rule_id,
|
||||
bool is_nested);
|
||||
|
||||
const char * parse_sequence(
|
||||
static const char * parse_sequence(
|
||||
parse_state & state,
|
||||
const char * src,
|
||||
const std::string & rule_name,
|
||||
|
@ -247,7 +247,7 @@ namespace grammar_parser {
|
|||
return pos;
|
||||
}
|
||||
|
||||
const char * parse_rule(parse_state & state, const char * src) {
|
||||
static const char * parse_rule(parse_state & state, const char * src) {
|
||||
const char * name_end = parse_name(src);
|
||||
const char * pos = parse_space(name_end, false);
|
||||
size_t name_len = name_end - src;
|
||||
|
@ -285,7 +285,7 @@ namespace grammar_parser {
|
|||
}
|
||||
}
|
||||
|
||||
void print_grammar_char(FILE * file, uint32_t c) {
|
||||
static void print_grammar_char(FILE * file, uint32_t c) {
|
||||
if (0x20 <= c && c <= 0x7f) {
|
||||
fprintf(file, "%c", static_cast<char>(c));
|
||||
} else {
|
||||
|
@ -294,7 +294,7 @@ namespace grammar_parser {
|
|||
}
|
||||
}
|
||||
|
||||
bool is_char_element(llama_grammar_element elem) {
|
||||
static bool is_char_element(llama_grammar_element elem) {
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_CHAR: return true;
|
||||
case LLAMA_GRETYPE_CHAR_NOT: return true;
|
||||
|
@ -304,7 +304,7 @@ namespace grammar_parser {
|
|||
}
|
||||
}
|
||||
|
||||
void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
|
||||
static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
|
||||
for (auto elem : rule) {
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
|
||||
|
@ -334,7 +334,7 @@ namespace grammar_parser {
|
|||
fprintf(file, "\n");
|
||||
}
|
||||
|
||||
void print_rule(
|
||||
static void print_rule(
|
||||
FILE * file,
|
||||
uint32_t rule_id,
|
||||
const std::vector<llama_grammar_element> & rule,
|
||||
|
|
|
@ -57,10 +57,22 @@ def count_model_parts(dir_model: str) -> int:
|
|||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
|
||||
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
|
||||
parser.add_argument(
|
||||
"--vocab-only", action="store_true",
|
||||
help="extract only the vocab",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outfile", type=Path,
|
||||
help="path to write to; default: based on input",
|
||||
)
|
||||
parser.add_argument(
|
||||
"model", type=Path,
|
||||
help="directory containing model file, or model file itself (*.bin)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
||||
help="output format - use 0 for float32, 1 for float16",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
|
|
@ -55,10 +55,22 @@ def count_model_parts(dir_model: Path) -> int:
|
|||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
|
||||
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
|
||||
parser.add_argument(
|
||||
"--vocab-only", action="store_true",
|
||||
help="extract only the vocab",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outfile", type=Path,
|
||||
help="path to write to; default: based on input",
|
||||
)
|
||||
parser.add_argument(
|
||||
"model", type=Path,
|
||||
help="directory containing model file, or model file itself (*.bin)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
||||
help="output format - use 0 for float32, 1 for float16",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
|
|
@ -56,10 +56,22 @@ def count_model_parts(dir_model: Path) -> int:
|
|||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
|
||||
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
|
||||
parser.add_argument(
|
||||
"--vocab-only", action="store_true",
|
||||
help="extract only the vocab",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outfile", type=Path,
|
||||
help="path to write to; default: based on input",
|
||||
)
|
||||
parser.add_argument(
|
||||
"model", type=Path,
|
||||
help="directory containing model file, or model file itself (*.bin)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
||||
help="output format - use 0 for float32, 1 for float16",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
|
248
convert-starcoder-hf-to-gguf.py
Executable file
248
convert-starcoder-hf-to-gguf.py
Executable file
|
@ -0,0 +1,248 @@
|
|||
#!/usr/bin/env python3
|
||||
# HF starcoder --> gguf conversion
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer # type: ignore[import]
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
|
||||
def bytes_to_unicode():
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8+n)
|
||||
n += 1
|
||||
return dict(zip(bs, (chr(n) for n in cs)))
|
||||
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
return num_parts
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
|
||||
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
ftype = args.ftype
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
||||
|
||||
print("gguf: loading model "+dir_model.name)
|
||||
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
|
||||
ARCH=gguf.MODEL_ARCH.STARCODER
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["n_layer"]
|
||||
|
||||
gguf_writer.add_name("StarCoder")
|
||||
gguf_writer.add_context_length(hparams["n_positions"])
|
||||
gguf_writer.add_embedding_length(hparams["n_embd"])
|
||||
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_head_count(hparams["n_head"])
|
||||
gguf_writer.add_head_count_kv(1)
|
||||
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
|
||||
gguf_writer.add_file_type(ftype)
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytearray] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
tokenizer_json_file = dir_model / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(0.0) # dymmy
|
||||
toktypes.append(gguf.TokenType.NORMAL) # dummy
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# params for qkv transform
|
||||
n_head = hparams["n_head"]
|
||||
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
|
||||
|
||||
head_dim = hparams["n_embd"] // n_head
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
|
||||
for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(dir_model / part_name, map_location="cpu")
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
if not args.vocab_only:
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||
print("")
|
|
@ -9,12 +9,12 @@
|
|||
#endif
|
||||
|
||||
#ifdef LLAMA_DEFAULT_RMS_EPS
|
||||
static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
|
||||
constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
|
||||
#else
|
||||
static const float rms_norm_eps = 5e-6f;
|
||||
constexpr float rms_norm_eps = 5e-6f;
|
||||
#endif
|
||||
|
||||
float frand() {
|
||||
static float frand() {
|
||||
return (float)rand()/(float)RAND_MAX;
|
||||
}
|
||||
|
||||
|
@ -25,19 +25,21 @@ struct random_normal_distribution {
|
|||
float max;
|
||||
};
|
||||
|
||||
void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
|
||||
static void init_random_normal_distribution(
|
||||
struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
|
||||
) {
|
||||
rnd->gen = std::mt19937(seed);
|
||||
rnd->nd = std::normal_distribution<float>{mean, std};
|
||||
rnd->min = min;
|
||||
rnd->max = max;
|
||||
}
|
||||
|
||||
float frand_normal(struct random_normal_distribution * rnd) {
|
||||
static float frand_normal(struct random_normal_distribution * rnd) {
|
||||
const float r = rnd->nd(rnd->gen);
|
||||
return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
|
||||
}
|
||||
|
||||
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||
|
||||
if (plan.work_size > 0) {
|
||||
|
@ -48,13 +50,9 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
|
|||
ggml_graph_compute(graph, &plan);
|
||||
}
|
||||
|
||||
struct ggml_tensor * randomize_tensor(
|
||||
struct ggml_tensor * tensor,
|
||||
int ndims,
|
||||
const int64_t ne[],
|
||||
float fmin,
|
||||
float fmax) {
|
||||
|
||||
static struct ggml_tensor * randomize_tensor(
|
||||
struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
|
||||
) {
|
||||
switch (ndims) {
|
||||
case 1:
|
||||
for (int i0 = 0; i0 < ne[0]; i0++) {
|
||||
|
@ -95,11 +93,9 @@ struct ggml_tensor * randomize_tensor(
|
|||
return tensor;
|
||||
}
|
||||
|
||||
struct ggml_tensor * randomize_tensor_normal(
|
||||
struct ggml_tensor * tensor,
|
||||
int ndims,
|
||||
const int64_t ne[],
|
||||
struct random_normal_distribution * rnd) {
|
||||
static struct ggml_tensor * randomize_tensor_normal(
|
||||
struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
|
||||
) {
|
||||
float scale = 1.0; // xavier
|
||||
switch (ndims) {
|
||||
case 1:
|
||||
|
@ -159,7 +155,7 @@ struct llama_hparams {
|
|||
}
|
||||
};
|
||||
|
||||
uint32_t get_n_ff(const struct llama_hparams* hparams) {
|
||||
static uint32_t get_n_ff(const struct llama_hparams* hparams) {
|
||||
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
|
||||
return n_ff;
|
||||
}
|
||||
|
@ -260,7 +256,7 @@ struct llama_model_lora {
|
|||
std::vector<llama_layer_lora> layers;
|
||||
};
|
||||
|
||||
void init_model(struct llama_model * model) {
|
||||
static void init_model(struct llama_model * model) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_embd = hparams.n_embd;
|
||||
|
@ -297,7 +293,7 @@ void init_model(struct llama_model * model) {
|
|||
}
|
||||
|
||||
|
||||
void init_model_lora(struct llama_model_lora * model) {
|
||||
static void init_model_lora(struct llama_model_lora * model) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_embd = hparams.n_embd;
|
||||
|
@ -340,7 +336,7 @@ void init_model_lora(struct llama_model_lora * model) {
|
|||
}
|
||||
}
|
||||
|
||||
void set_param_model(struct llama_model * model) {
|
||||
static void set_param_model(struct llama_model * model) {
|
||||
const auto& hparams = model->hparams;
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
|
@ -366,7 +362,7 @@ void set_param_model(struct llama_model * model) {
|
|||
}
|
||||
}
|
||||
|
||||
void set_param_model_lora(struct llama_model_lora * model) {
|
||||
static void set_param_model_lora(struct llama_model_lora * model) {
|
||||
const auto& hparams = model->hparams;
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
|
@ -397,7 +393,7 @@ void set_param_model_lora(struct llama_model_lora * model) {
|
|||
}
|
||||
}
|
||||
|
||||
void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
|
||||
static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
|
@ -426,7 +422,9 @@ void randomize_model(struct llama_model * model, int seed, float mean, float std
|
|||
}
|
||||
|
||||
|
||||
void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) {
|
||||
static void randomize_model_lora(
|
||||
struct llama_model_lora * model, int seed, float mean, float std, float min, float max
|
||||
) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
|
@ -459,7 +457,7 @@ void randomize_model_lora(struct llama_model_lora * model, int seed, float mean,
|
|||
}
|
||||
}
|
||||
|
||||
bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
|
||||
static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_ctx = hparams.n_ctx;
|
||||
|
@ -495,7 +493,7 @@ bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int
|
|||
return true;
|
||||
}
|
||||
|
||||
bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
|
||||
static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_ctx = hparams.n_ctx;
|
||||
|
@ -531,15 +529,15 @@ bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora *
|
|||
return true;
|
||||
}
|
||||
|
||||
struct ggml_tensor * forward(
|
||||
struct llama_model * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past) {
|
||||
|
||||
static struct ggml_tensor * forward(
|
||||
struct llama_model * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past
|
||||
) {
|
||||
const int N = n_tokens;
|
||||
|
||||
struct llama_kv_cache& kv_self = *cache;
|
||||
|
@ -756,25 +754,25 @@ struct ggml_tensor * forward(
|
|||
return inpL;
|
||||
}
|
||||
|
||||
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
|
||||
static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
|
||||
GGML_ASSERT(tensor->n_dims == 1);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
}
|
||||
|
||||
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
|
||||
static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
|
||||
GGML_ASSERT(tensor->n_dims == 2);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
}
|
||||
|
||||
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
|
||||
static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
|
||||
GGML_ASSERT(tensor->n_dims == 3);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
GGML_ASSERT(tensor->ne[2] == ne2);
|
||||
}
|
||||
|
||||
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
||||
static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
||||
GGML_ASSERT(tensor->n_dims == 4);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
|
@ -782,16 +780,16 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
|
|||
GGML_ASSERT(tensor->ne[3] == ne3);
|
||||
}
|
||||
|
||||
struct ggml_tensor * forward_batch(
|
||||
struct llama_model * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past,
|
||||
const int n_batch) {
|
||||
|
||||
static struct ggml_tensor * forward_batch(
|
||||
struct llama_model * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past,
|
||||
const int n_batch
|
||||
) {
|
||||
const int N = n_tokens;
|
||||
|
||||
struct llama_kv_cache& kv_self = *cache;
|
||||
|
@ -1073,16 +1071,15 @@ struct ggml_tensor * forward_batch(
|
|||
return inpL;
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * forward_lora(
|
||||
struct llama_model_lora * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past) {
|
||||
|
||||
static struct ggml_tensor * forward_lora(
|
||||
struct llama_model_lora * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past
|
||||
) {
|
||||
const int N = n_tokens;
|
||||
|
||||
struct llama_kv_cache& kv_self = *cache;
|
||||
|
@ -1328,7 +1325,7 @@ struct ggml_tensor * forward_lora(
|
|||
return inpL;
|
||||
}
|
||||
|
||||
void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
|
||||
static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
|
||||
assert(logits->n_dims == 2);
|
||||
assert(probs->n_dims == 2);
|
||||
assert(best_samples->n_dims == 1);
|
||||
|
@ -1359,7 +1356,10 @@ void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, str
|
|||
}
|
||||
}
|
||||
|
||||
void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
|
||||
static void sample_softmax_batch(
|
||||
struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
|
||||
struct ggml_tensor * best_samples
|
||||
) {
|
||||
GGML_ASSERT(best_samples->n_dims == 2);
|
||||
GGML_ASSERT(logits->n_dims == 3);
|
||||
GGML_ASSERT(probs->n_dims == 3);
|
||||
|
@ -1393,7 +1393,7 @@ void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits
|
|||
}
|
||||
}
|
||||
|
||||
void print_row(struct ggml_tensor * probs, int i) {
|
||||
static void print_row(struct ggml_tensor * probs, int i) {
|
||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||
float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
|
||||
printf(" %.2f", p);
|
||||
|
@ -1401,7 +1401,7 @@ void print_row(struct ggml_tensor * probs, int i) {
|
|||
printf("\n");
|
||||
}
|
||||
|
||||
void print_matrix(struct ggml_tensor * probs) {
|
||||
static void print_matrix(struct ggml_tensor * probs) {
|
||||
assert(probs->n_dims == 2);
|
||||
for (int i = 0; i < probs->ne[1]; ++i) {
|
||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||
|
@ -1412,7 +1412,7 @@ void print_matrix(struct ggml_tensor * probs) {
|
|||
}
|
||||
}
|
||||
|
||||
void print_token(int token, int n_vocab) {
|
||||
static void print_token(int token, int n_vocab) {
|
||||
for (int k = 0; k < token; ++k) {
|
||||
printf(" ");
|
||||
}
|
||||
|
@ -1423,14 +1423,14 @@ void print_token(int token, int n_vocab) {
|
|||
printf("\n");
|
||||
}
|
||||
|
||||
void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
|
||||
static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
|
||||
for (int i=0; i<tokens->ne[0]; ++i) {
|
||||
int token = ggml_get_i32_1d(tokens, i);
|
||||
print_token(token, n_vocab);
|
||||
}
|
||||
}
|
||||
|
||||
void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
|
||||
static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
|
||||
int n_tokens = tokens_input->ne[0];
|
||||
int n_vocab = targets->ne[0];
|
||||
float randomness = 0.0f;
|
||||
|
@ -1451,7 +1451,9 @@ void get_example_targets(int example_id, struct ggml_tensor * tokens_input, stru
|
|||
}
|
||||
}
|
||||
|
||||
void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
|
||||
static void get_example_targets_batch(
|
||||
struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
|
||||
) {
|
||||
GGML_ASSERT(tokens_input->n_dims == 2);
|
||||
GGML_ASSERT( targets->n_dims == 3);
|
||||
int n_tokens = tokens_input->ne[0];
|
||||
|
@ -1474,7 +1476,7 @@ void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct
|
|||
}
|
||||
}
|
||||
|
||||
void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
|
||||
static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
|
||||
int n_tokens = tokens_input->ne[0];
|
||||
int n_vocab = targets->ne[0];
|
||||
for (int i=0; i<n_tokens-n_shift; ++i) {
|
||||
|
@ -1485,12 +1487,16 @@ void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * tar
|
|||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
|
||||
static struct ggml_tensor * square_error_loss(
|
||||
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
|
||||
) {
|
||||
// todo: instead of a-b: a[1:]-b[:-1]
|
||||
return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
|
||||
}
|
||||
|
||||
struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
|
||||
static struct ggml_tensor * cross_entropy_loss(
|
||||
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
|
||||
) {
|
||||
const float eps = 1e-3f;
|
||||
return
|
||||
ggml_sum(ctx,
|
||||
|
|
|
@ -3,6 +3,3 @@ add_executable(${TARGET} beam-search.cpp)
|
|||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
|
@ -30,7 +29,8 @@ struct ostream_beam_view {
|
|||
llama_context * ctx;
|
||||
llama_beam_view beam_view;
|
||||
};
|
||||
std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
|
||||
|
||||
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
|
||||
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
|
||||
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
|
||||
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
|
||||
|
@ -46,7 +46,7 @@ struct beam_search_callback_data {
|
|||
|
||||
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
|
||||
// For example, eob can be flagged due to maximum token length, stop words, etc.
|
||||
bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) {
|
||||
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
|
||||
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
|
||||
}
|
||||
|
||||
|
@ -56,7 +56,7 @@ bool is_at_eob(const beam_search_callback_data & callback_data, const llama_toke
|
|||
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
||||
// This is also called when the stop condition is met.
|
||||
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
||||
void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
|
||||
static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
|
||||
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
|
||||
// Mark beams as EOS as needed.
|
||||
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
set(TARGET benchmark)
|
||||
add_executable(${TARGET} benchmark-matmult.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <locale.h>
|
||||
#include <assert.h>
|
||||
|
@ -99,7 +99,7 @@ int main(int argc, char ** argv) {
|
|||
exit(1);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
print_build_info();
|
||||
printf("Starting Test\n");
|
||||
|
||||
// create the ggml context
|
||||
|
|
|
@ -115,7 +115,7 @@ struct TransformerWeights {
|
|||
}
|
||||
};
|
||||
|
||||
void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
|
||||
static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
|
||||
// we calloc instead of malloc to keep valgrind happy
|
||||
w->token_embedding_table = new float[p->vocab_size * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
|
@ -158,7 +158,7 @@ void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
|
|||
}
|
||||
}
|
||||
|
||||
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
|
||||
static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
|
||||
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
|
||||
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
|
||||
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
||||
|
@ -189,7 +189,7 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar
|
|||
return 0;
|
||||
}
|
||||
|
||||
void print_sample_weights(TransformerWeights *w){
|
||||
static void print_sample_weights(TransformerWeights *w){
|
||||
printf("----- Quick print of first of the weight vales of all the variables\n");
|
||||
printf("%f\n", w->token_embedding_table[0]);
|
||||
printf("%f\n", w->rms_att_weight[0]);
|
||||
|
@ -324,7 +324,7 @@ struct train_params {
|
|||
int mem_compute1_gb;
|
||||
};
|
||||
|
||||
void print_params(struct my_llama_hparams * params) {
|
||||
static void print_params(struct my_llama_hparams * params) {
|
||||
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
|
||||
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
|
||||
printf("%s: n_embd: %d\n", __func__, params->n_embd);
|
||||
|
@ -335,7 +335,7 @@ void print_params(struct my_llama_hparams * params) {
|
|||
printf("%s: n_rot: %d\n", __func__, params->n_rot);
|
||||
}
|
||||
|
||||
void init_model(struct my_llama_model * model) {
|
||||
static void init_model(struct my_llama_model * model) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_embd = hparams.n_embd;
|
||||
|
@ -408,17 +408,17 @@ void init_model(struct my_llama_model * model) {
|
|||
}
|
||||
}
|
||||
|
||||
float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||
static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
|
||||
return *ptr;
|
||||
}
|
||||
|
||||
int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||
static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
|
||||
return *ptr;
|
||||
}
|
||||
|
||||
void print_row(struct ggml_tensor * probs, int i) {
|
||||
static void print_row(struct ggml_tensor * probs, int i) {
|
||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||
float p = get_f32_2d(probs, k, i);
|
||||
printf(" %f", p);
|
||||
|
@ -426,7 +426,7 @@ void print_row(struct ggml_tensor * probs, int i) {
|
|||
printf("\n");
|
||||
}
|
||||
|
||||
void print_matrix(struct ggml_tensor * probs) {
|
||||
static void print_matrix(struct ggml_tensor * probs) {
|
||||
assert(probs->n_dims == 2);
|
||||
for (int i = 0; i < probs->ne[1]; ++i) {
|
||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||
|
@ -531,7 +531,7 @@ struct llama_file {
|
|||
}
|
||||
};
|
||||
|
||||
bool is_ggml_file(const char *filename) {
|
||||
static bool is_ggml_file(const char * filename) {
|
||||
llama_file file(filename, "rb");
|
||||
if (file.size < 4) {
|
||||
return false;
|
||||
|
@ -540,7 +540,7 @@ bool is_ggml_file(const char *filename) {
|
|||
return magic == GGUF_MAGIC;
|
||||
}
|
||||
|
||||
static std::string llama_escape_whitespaces(const std::string& text) {
|
||||
static std::string llama_escape_whitespaces(const std::string & text) {
|
||||
std::ostringstream out;
|
||||
for (char c : text) {
|
||||
if (c == ' ') out << "\xe2\x96\x81";
|
||||
|
@ -549,7 +549,7 @@ static std::string llama_escape_whitespaces(const std::string& text) {
|
|||
return out.str();
|
||||
}
|
||||
|
||||
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
|
||||
static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
|
||||
if (is_ggml_file(filename)) {
|
||||
struct ggml_context * ctx_data = NULL;
|
||||
|
||||
|
@ -637,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
|
|||
}
|
||||
}
|
||||
|
||||
void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
|
||||
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
|
||||
int ct;
|
||||
switch (gg_weights->n_dims){
|
||||
case 1:
|
||||
|
@ -673,7 +673,9 @@ void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * kar
|
|||
}
|
||||
}
|
||||
|
||||
void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
|
||||
static void save_as_llama_model(
|
||||
struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
|
||||
) {
|
||||
// convert AK weights into GG weights one by one.
|
||||
// w->token_embedding_table -> model->tok_embeddings
|
||||
// float* -> struct ggml_tensor
|
||||
|
@ -785,7 +787,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
|
|||
gguf_free(ctx);
|
||||
}
|
||||
|
||||
struct train_params get_default_train_params() {
|
||||
static struct train_params get_default_train_params() {
|
||||
struct train_params params;
|
||||
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
|
||||
params.fn_llama2c_output_model = "ak_llama_model.bin";
|
||||
|
@ -835,7 +837,7 @@ struct train_params get_default_train_params() {
|
|||
return params;
|
||||
}
|
||||
|
||||
void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
|
||||
static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
|
@ -846,7 +848,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
|
|||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
bool params_parse(int argc, char ** argv, struct train_params * params) {
|
||||
static bool params_parse(int argc, char ** argv, struct train_params * params) {
|
||||
bool invalid_param = false;
|
||||
bool reqd_param_found = false;
|
||||
std::string arg;
|
||||
|
@ -901,7 +903,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
|
|||
return true;
|
||||
}
|
||||
|
||||
std::string basename(const std::string &path) {
|
||||
static std::string basename(const std::string &path) {
|
||||
size_t pos = path.find_last_of("/\\");
|
||||
if (pos == std::string::npos) {
|
||||
return path;
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
#include "common.h"
|
||||
#include "embd-input.h"
|
||||
|
||||
#include <cassert>
|
||||
|
@ -22,7 +23,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
print_build_info();
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = uint32_t(time(NULL));
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
extern "C" {
|
||||
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <ctime>
|
||||
|
||||
|
@ -17,7 +16,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
params.embedding = true;
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
print_build_info();
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
|
|
|
@ -13,14 +13,14 @@
|
|||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
template<typename T>
|
||||
template <typename T>
|
||||
static std::string to_string(const T & val) {
|
||||
std::stringstream ss;
|
||||
ss << val;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
bool gguf_ex_write(const std::string & fname) {
|
||||
static bool gguf_ex_write(const std::string & fname) {
|
||||
struct gguf_context * ctx = gguf_init_empty();
|
||||
|
||||
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
|
||||
|
@ -85,7 +85,7 @@ bool gguf_ex_write(const std::string & fname) {
|
|||
}
|
||||
|
||||
// just read tensor info
|
||||
bool gguf_ex_read_0(const std::string & fname) {
|
||||
static bool gguf_ex_read_0(const std::string & fname) {
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ false,
|
||||
/*.ctx = */ NULL,
|
||||
|
@ -143,7 +143,7 @@ bool gguf_ex_read_0(const std::string & fname) {
|
|||
}
|
||||
|
||||
// read and create ggml_context containing the tensors and their data
|
||||
bool gguf_ex_read_1(const std::string & fname) {
|
||||
static bool gguf_ex_read_1(const std::string & fname) {
|
||||
struct ggml_context * ctx_data = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
|
|
|
@ -74,14 +74,6 @@ static T stdev(const std::vector<T> & v) {
|
|||
return stdev;
|
||||
}
|
||||
|
||||
static bool ggml_cpu_has_metal() {
|
||||
#if defined(GGML_USE_METAL)
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::string get_cpu_info() {
|
||||
std::string id;
|
||||
#ifdef __linux__
|
||||
|
|
|
@ -41,7 +41,8 @@ static std::ostringstream * g_output_ss;
|
|||
static std::vector<llama_token> * g_output_tokens;
|
||||
static bool is_interacting = false;
|
||||
|
||||
void write_logfile(
|
||||
|
||||
static void write_logfile(
|
||||
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
||||
const std::vector<llama_token> & input_tokens, const std::string & output,
|
||||
const std::vector<llama_token> & output_tokens
|
||||
|
@ -86,7 +87,7 @@ void write_logfile(
|
|||
}
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
void sigint_handler(int signo) {
|
||||
static void sigint_handler(int signo) {
|
||||
if (signo == SIGINT) {
|
||||
if (!is_interacting) {
|
||||
is_interacting = true;
|
||||
|
@ -148,6 +149,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
|
@ -28,9 +27,10 @@ struct results_log_softmax {
|
|||
float prob;
|
||||
};
|
||||
|
||||
void write_logfile(const llama_context * ctx, const gpt_params & params,
|
||||
const llama_model * model, const struct results_perplexity & results) {
|
||||
|
||||
static void write_logfile(
|
||||
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
||||
const struct results_perplexity & results
|
||||
) {
|
||||
if (params.logdir.empty()) {
|
||||
return;
|
||||
}
|
||||
|
@ -76,7 +76,7 @@ void write_logfile(const llama_context * ctx, const gpt_params & params,
|
|||
fclose(logfile);
|
||||
}
|
||||
|
||||
std::vector<float> softmax(const std::vector<float>& logits) {
|
||||
static std::vector<float> softmax(const std::vector<float>& logits) {
|
||||
std::vector<float> probs(logits.size());
|
||||
float max_logit = logits[0];
|
||||
for (float v : logits) max_logit = std::max(max_logit, v);
|
||||
|
@ -92,7 +92,7 @@ std::vector<float> softmax(const std::vector<float>& logits) {
|
|||
return probs;
|
||||
}
|
||||
|
||||
results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
|
||||
static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
|
||||
float max_logit = logits[0];
|
||||
for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
|
||||
double sum_exp = 0.0;
|
||||
|
@ -100,9 +100,10 @@ results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
|
|||
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
|
||||
}
|
||||
|
||||
void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
|
||||
double & nll, double & nll2, float * logit_history, float * prob_history) {
|
||||
|
||||
static void process_logits(
|
||||
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
|
||||
double & nll, double & nll2, float * logit_history, float * prob_history
|
||||
) {
|
||||
std::mutex mutex;
|
||||
int counter = 0;
|
||||
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
|
||||
|
@ -130,7 +131,7 @@ void process_logits(int n_vocab, const float * logits, const int * tokens, int n
|
|||
|
||||
}
|
||||
|
||||
results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
// Output: `perplexity: 13.5106 [114/114]`
|
||||
|
@ -260,8 +261,7 @@ results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params)
|
|||
return {tokens, std::exp(nll / count), logit_history, prob_history};
|
||||
}
|
||||
|
||||
results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
static results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
if (params.ppl_stride > 0) {
|
||||
return perplexity_v2(ctx, params);
|
||||
}
|
||||
|
@ -400,8 +400,9 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
|
|||
return {tokens, ppl, logit_history, prob_history};
|
||||
}
|
||||
|
||||
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
|
||||
int n_vocab, int n_thread) {
|
||||
static std::vector<float> hellaswag_evaluate_tokens(
|
||||
llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch, int n_vocab, int n_thread
|
||||
) {
|
||||
std::vector<float> result;
|
||||
result.reserve(tokens.size() * n_vocab);
|
||||
size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
|
||||
|
@ -421,7 +422,7 @@ std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vec
|
|||
return result;
|
||||
}
|
||||
|
||||
void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
// Calculates hellaswag score (acc_norm) from prompt
|
||||
//
|
||||
// Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
|
||||
|
@ -668,7 +669,7 @@ int main(int argc, char ** argv) {
|
|||
params.n_ctx += params.ppl_stride/2;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
print_build_info();
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
|
|
|
@ -2,4 +2,5 @@ set(TARGET quantize-stats)
|
|||
add_executable(${TARGET} quantize-stats.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
#include "ggml.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#define LLAMA_API_INTERNAL
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
@ -34,8 +33,8 @@ struct quantize_stats_params {
|
|||
std::vector<enum ggml_type> include_types;
|
||||
};
|
||||
|
||||
const size_t HISTOGRAM_BUCKETS = 150;
|
||||
const double HISTOGRAM_RANGE = 0.03;
|
||||
constexpr size_t HISTOGRAM_BUCKETS = 150;
|
||||
constexpr double HISTOGRAM_RANGE = 0.03;
|
||||
|
||||
struct error_stats {
|
||||
size_t num_samples;
|
||||
|
@ -44,8 +43,7 @@ struct error_stats {
|
|||
uint64_t error_histogram[HISTOGRAM_BUCKETS];
|
||||
};
|
||||
|
||||
|
||||
void quantize_stats_print_usage(int /*argc*/, char ** argv) {
|
||||
static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
|
||||
quantize_stats_params params;
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
|
@ -71,7 +69,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
|
|||
}
|
||||
|
||||
// Check if a layer is included/excluded by command line
|
||||
bool layer_included(const quantize_stats_params & params, const std::string & layer) {
|
||||
static bool layer_included(const quantize_stats_params & params, const std::string & layer) {
|
||||
for (const auto& excluded : params.exclude_layers) {
|
||||
if (std::regex_search(layer, std::regex(excluded))) {
|
||||
return false;
|
||||
|
@ -86,7 +84,7 @@ bool layer_included(const quantize_stats_params & params, const std::string & la
|
|||
}
|
||||
|
||||
// Update error statistics given vectors with the before/after result of quantization
|
||||
void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
|
||||
static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
|
||||
for (int64_t i = 0; i < nelements; i++) {
|
||||
double diff = input[i] - output[i];
|
||||
stats.total_error += diff * diff;
|
||||
|
@ -96,14 +94,14 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou
|
|||
stats.num_samples += nelements;
|
||||
}
|
||||
|
||||
void combine_error_stats(error_stats & into, const error_stats & from) {
|
||||
static void combine_error_stats(error_stats & into, const error_stats & from) {
|
||||
into.num_samples += from.num_samples;
|
||||
into.total_error += from.total_error;
|
||||
if (from.max_error > into.max_error) into.max_error = from.max_error;
|
||||
for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
|
||||
}
|
||||
|
||||
double find_quantile(const error_stats & stats, double quantile) {
|
||||
static double find_quantile(const error_stats & stats, double quantile) {
|
||||
double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
|
||||
|
||||
double accum = 0;
|
||||
|
@ -116,7 +114,7 @@ double find_quantile(const error_stats & stats, double quantile) {
|
|||
return INFINITY;
|
||||
}
|
||||
|
||||
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
|
||||
static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
|
||||
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
|
||||
double median = find_quantile(stats, .5);
|
||||
double pct95 = find_quantile(stats, .95);
|
||||
|
@ -143,17 +141,10 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
|
|||
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
||||
}
|
||||
|
||||
void test_roundtrip_on_chunk(
|
||||
const ggml_tensor * layer,
|
||||
int64_t offset,
|
||||
int64_t chunk_size,
|
||||
const ggml_type_traits_t & qfns,
|
||||
bool use_reference,
|
||||
float * input_scratch,
|
||||
char * quantized_scratch,
|
||||
float * output_scratch,
|
||||
error_stats & stats) {
|
||||
|
||||
static void test_roundtrip_on_chunk(
|
||||
const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
|
||||
float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
|
||||
) {
|
||||
if (layer->type == GGML_TYPE_F16) {
|
||||
for (int i = 0; i < chunk_size; i++) {
|
||||
input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
|
||||
|
@ -174,18 +165,11 @@ void test_roundtrip_on_chunk(
|
|||
|
||||
|
||||
// Run quantization function for a single layer and update error stats
|
||||
void test_roundtrip_on_layer(
|
||||
std::string & name,
|
||||
bool print_layer_stats,
|
||||
const ggml_type_traits_t & qfns,
|
||||
bool use_reference,
|
||||
const ggml_tensor * layer,
|
||||
std::vector<float> & input_scratch,
|
||||
std::vector<char> & quantized_scratch,
|
||||
std::vector<float> & output_scratch,
|
||||
error_stats & total_error,
|
||||
int max_thread = 0) {
|
||||
|
||||
static void test_roundtrip_on_layer(
|
||||
std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
|
||||
const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
|
||||
std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
|
||||
) {
|
||||
assert(tensor_is_contiguous(layer));
|
||||
error_stats layer_error {};
|
||||
uint64_t nelements = ggml_nelements(layer);
|
||||
|
@ -314,7 +298,7 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
print_build_info();
|
||||
|
||||
// load the model
|
||||
fprintf(stderr, "Loading model\n");
|
||||
|
|
|
@ -2,6 +2,7 @@ set(TARGET quantize)
|
|||
add_executable(${TARGET} quantize.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdio>
|
||||
|
@ -38,7 +39,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
|||
};
|
||||
|
||||
|
||||
bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
|
||||
static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
|
||||
std::string ftype_str;
|
||||
|
||||
for (auto ch : ftype_str_in) {
|
||||
|
@ -70,7 +71,7 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
|
|||
// usage:
|
||||
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
|
||||
//
|
||||
void usage(const char * executable) {
|
||||
static void usage(const char * executable) {
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||
|
@ -159,6 +160,8 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
|
||||
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
|
||||
if (params.nthread > 0) {
|
||||
fprintf(stderr, " using %d threads", params.nthread);
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <vector>
|
||||
#include <cstdio>
|
||||
|
@ -17,7 +16,7 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
print_build_info();
|
||||
|
||||
if (params.n_predict < 0) {
|
||||
params.n_predict = 16;
|
||||
|
|
|
@ -1083,8 +1083,9 @@ static json format_final_response(llama_server_context &llama, const std::string
|
|||
return res;
|
||||
}
|
||||
|
||||
static json format_partial_response(llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs)
|
||||
{
|
||||
static json format_partial_response(
|
||||
llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs
|
||||
) {
|
||||
json res = json{
|
||||
{"content", content},
|
||||
{"stop", false},
|
||||
|
@ -1215,7 +1216,7 @@ static void log_server_request(const Request &req, const Response &res)
|
|||
});
|
||||
}
|
||||
|
||||
bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
|
||||
static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) {
|
||||
return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
|
||||
}
|
||||
|
||||
|
@ -1225,7 +1226,7 @@ bool is_at_eob(llama_server_context & server_context, const llama_token * tokens
|
|||
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
||||
// This is also called when the stop condition is met.
|
||||
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
||||
void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
|
||||
static void beam_search_callback(void *callback_data, llama_beams_state beams_state) {
|
||||
auto & llama = *static_cast<llama_server_context*>(callback_data);
|
||||
// Mark beams as EOS as needed.
|
||||
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
||||
|
@ -1258,7 +1259,8 @@ struct token_translator {
|
|||
std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); }
|
||||
};
|
||||
|
||||
void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
|
||||
static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama)
|
||||
{
|
||||
auto & gtps = llama.generated_token_probs;
|
||||
auto translator = token_translator{llama.ctx};
|
||||
auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
|
||||
|
|
|
@ -3,6 +3,3 @@ add_executable(${TARGET} simple.cpp)
|
|||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
#include "build-info.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
|
|
12
ggml-alloc.c
12
ggml-alloc.c
|
@ -131,6 +131,10 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten
|
|||
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
|
||||
}
|
||||
|
||||
static bool ggml_is_view(struct ggml_tensor * t) {
|
||||
return t->view_src != NULL;
|
||||
}
|
||||
|
||||
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
|
||||
|
@ -338,8 +342,8 @@ static void free_vmem(void * base_addr, size_t size) {
|
|||
|
||||
// allocate uncommitted virtual memory to measure the size of the graph
|
||||
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
|
||||
// 1TB for 64-bit, 1GB for 32-bit
|
||||
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
|
||||
// 128GB for 64-bit, 1GB for 32-bit
|
||||
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
|
||||
do {
|
||||
*base_addr = alloc_vmem(*size);
|
||||
if (*base_addr != NULL) {
|
||||
|
@ -399,10 +403,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
|
|||
|
||||
//////////// compute graph allocator
|
||||
|
||||
static bool ggml_is_view(struct ggml_tensor * t) {
|
||||
return t->view_src != NULL;
|
||||
}
|
||||
|
||||
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
||||
if (a->type != b->type) {
|
||||
return false;
|
||||
|
|
12
ggml-metal.m
12
ggml-metal.m
|
@ -78,6 +78,7 @@ struct ggml_metal_context {
|
|||
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DECL_KERNEL(rms_norm);
|
||||
GGML_METAL_DECL_KERNEL(norm);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
|
||||
|
@ -89,6 +90,7 @@ struct ggml_metal_context {
|
|||
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
|
||||
|
@ -237,6 +239,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_ADD_KERNEL(rms_norm);
|
||||
GGML_METAL_ADD_KERNEL(norm);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
|
||||
|
@ -248,6 +251,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
|
||||
|
@ -309,6 +313,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|||
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DEL_KERNEL(rms_norm);
|
||||
GGML_METAL_DEL_KERNEL(norm);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
|
||||
|
@ -320,6 +325,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|||
GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
|
||||
|
@ -885,6 +891,7 @@ void ggml_metal_graph_compute(
|
|||
ne00%32 == 0 &&
|
||||
ne11 > 1) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break;
|
||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
|
||||
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
|
||||
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
|
||||
|
@ -919,6 +926,11 @@ void ggml_metal_graph_compute(
|
|||
|
||||
// use custom matrix x vector kernel
|
||||
switch (src0t) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32];
|
||||
nrows = 4;
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
nth0 = 32;
|
||||
|
|
|
@ -118,7 +118,7 @@ kernel void kernel_soft_max(
|
|||
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
|
||||
// parallel max
|
||||
float lmax = psrc0[tpitg[0]];
|
||||
float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY;
|
||||
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
|
||||
lmax = MAX(lmax, psrc0[i00]);
|
||||
}
|
||||
|
@ -158,7 +158,7 @@ kernel void kernel_soft_max_4(
|
|||
device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
|
||||
// parallel max
|
||||
float4 lmax4 = psrc4[tpitg[0]];
|
||||
float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY;
|
||||
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
|
||||
lmax4 = fmax(lmax4, psrc4[i00]);
|
||||
}
|
||||
|
@ -523,6 +523,79 @@ kernel void kernel_mul_mat_q8_0_f32(
|
|||
}
|
||||
}
|
||||
|
||||
#define N_F32_F32 4
|
||||
|
||||
kernel void kernel_mul_mat_f32_f32(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t rb = tgpig.y*N_F32_F32;
|
||||
const int64_t im = tgpig.z;
|
||||
|
||||
device const float * x = (device const float *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
|
||||
|
||||
if (ne00 < 128) {
|
||||
for (int row = 0; row < N_F32_F32; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = tiisg; i < ne00; i += 32) {
|
||||
sumf += (float) x[i] * (float) y[i];
|
||||
}
|
||||
|
||||
float all_sum = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
device const float4 * x4 = (device const float4 *)x;
|
||||
for (int row = 0; row < N_F32_F32; ++row) {
|
||||
int r1 = rb + row;
|
||||
if (r1 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
||||
device const float4 * y4 = (device const float4 *) y;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = tiisg; i < ne00/4; i += 32) {
|
||||
for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
|
||||
}
|
||||
|
||||
float all_sum = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
|
||||
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_f16_f32_1row(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
|
@ -1399,13 +1472,13 @@ kernel void kernel_mul_mat_q4_K_f32(
|
|||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01[[buffer(4)]],
|
||||
constant int64_t & ne02[[buffer(5)]],
|
||||
constant int64_t & ne10[[buffer(9)]],
|
||||
constant int64_t & ne12[[buffer(11)]],
|
||||
constant int64_t & ne0[[buffer(15)]],
|
||||
constant int64_t & ne1[[buffer(16)]],
|
||||
constant uint & gqa[[buffer(17)]],
|
||||
constant int64_t & ne01 [[buffer(4)]],
|
||||
constant int64_t & ne02 [[buffer(5)]],
|
||||
constant int64_t & ne10 [[buffer(9)]],
|
||||
constant int64_t & ne12 [[buffer(11)]],
|
||||
constant int64_t & ne0 [[buffer(15)]],
|
||||
constant int64_t & ne1 [[buffer(16)]],
|
||||
constant uint & gqa [[buffer(17)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
@ -2012,7 +2085,6 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
|
|||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * (q[i] & mask) - ml;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
|
@ -2269,6 +2341,7 @@ typedef void (mat_mm_t)(
|
|||
constant uint & gqa,
|
||||
threadgroup uchar *, uint3, uint, uint);
|
||||
|
||||
template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm<float4x4, 1, dequantize_f32>;
|
||||
template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>;
|
||||
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>;
|
||||
|
|
90
ggml.c
90
ggml.c
|
@ -17290,14 +17290,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
} else {
|
||||
// wait for other threads to finish
|
||||
const int last = node_n;
|
||||
do {
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_METAL)
|
||||
//apple does nothing
|
||||
#else
|
||||
while (true) {
|
||||
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
|
||||
// depending on the workload and the operating system.
|
||||
// since it is not clear what is the best approach, it should potentially become user-configurable
|
||||
// ref: https://github.com/ggerganov/ggml/issues/291
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
sched_yield();
|
||||
#endif
|
||||
#endif
|
||||
|
||||
node_n = atomic_load(&state->shared->node_n);
|
||||
} while (node_n == last);
|
||||
if (node_n != last) break;
|
||||
};
|
||||
}
|
||||
|
||||
// check if we should stop
|
||||
|
@ -18348,7 +18352,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
|||
for (int i = 0; i < cgraph->n_leafs; i++) {
|
||||
struct ggml_tensor * node = cgraph->leafs[i];
|
||||
|
||||
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
|
||||
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
|
||||
i,
|
||||
node->ne[0], node->ne[1],
|
||||
ggml_op_name(node->op),
|
||||
|
@ -20111,27 +20115,27 @@ const char * gguf_type_name(enum gguf_type type) {
|
|||
return GGUF_TYPE_NAME[type];
|
||||
}
|
||||
|
||||
int gguf_get_version(struct gguf_context * ctx) {
|
||||
int gguf_get_version(const struct gguf_context * ctx) {
|
||||
return ctx->header.version;
|
||||
}
|
||||
|
||||
size_t gguf_get_alignment(struct gguf_context * ctx) {
|
||||
size_t gguf_get_alignment(const struct gguf_context * ctx) {
|
||||
return ctx->alignment;
|
||||
}
|
||||
|
||||
size_t gguf_get_data_offset(struct gguf_context * ctx) {
|
||||
size_t gguf_get_data_offset(const struct gguf_context * ctx) {
|
||||
return ctx->offset;
|
||||
}
|
||||
|
||||
void * gguf_get_data(struct gguf_context * ctx) {
|
||||
void * gguf_get_data(const struct gguf_context * ctx) {
|
||||
return ctx->data;
|
||||
}
|
||||
|
||||
int gguf_get_n_kv(struct gguf_context * ctx) {
|
||||
int gguf_get_n_kv(const struct gguf_context * ctx) {
|
||||
return ctx->header.n_kv;
|
||||
}
|
||||
|
||||
int gguf_find_key(struct gguf_context * ctx, const char * key) {
|
||||
int gguf_find_key(const struct gguf_context * ctx, const char * key) {
|
||||
// return -1 if key not found
|
||||
int keyfound = -1;
|
||||
|
||||
|
@ -20147,85 +20151,85 @@ int gguf_find_key(struct gguf_context * ctx, const char * key) {
|
|||
return keyfound;
|
||||
}
|
||||
|
||||
const char * gguf_get_key(struct gguf_context * ctx, int i) {
|
||||
const char * gguf_get_key(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].key.data;
|
||||
}
|
||||
|
||||
enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
|
||||
enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].type;
|
||||
}
|
||||
|
||||
enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
|
||||
enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.arr.type;
|
||||
}
|
||||
|
||||
const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
|
||||
const void * gguf_get_arr_data(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.arr.data;
|
||||
}
|
||||
|
||||
const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
|
||||
const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
|
||||
struct gguf_kv * kv = &ctx->kv[key_id];
|
||||
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
|
||||
return str->data;
|
||||
}
|
||||
|
||||
int gguf_get_arr_n(struct gguf_context * ctx, int i) {
|
||||
int gguf_get_arr_n(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.arr.n;
|
||||
}
|
||||
|
||||
uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
|
||||
uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.uint8;
|
||||
}
|
||||
|
||||
int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
|
||||
int8_t gguf_get_val_i8(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.int8;
|
||||
}
|
||||
|
||||
uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
|
||||
uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.uint16;
|
||||
}
|
||||
|
||||
int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
|
||||
int16_t gguf_get_val_i16(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.int16;
|
||||
}
|
||||
|
||||
uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
|
||||
uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.uint32;
|
||||
}
|
||||
|
||||
int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
|
||||
int32_t gguf_get_val_i32(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.int32;
|
||||
}
|
||||
|
||||
float gguf_get_val_f32(struct gguf_context * ctx, int i) {
|
||||
float gguf_get_val_f32(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.float32;
|
||||
}
|
||||
|
||||
uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
|
||||
uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.uint64;
|
||||
}
|
||||
|
||||
int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
|
||||
int64_t gguf_get_val_i64(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.int64;
|
||||
}
|
||||
|
||||
double gguf_get_val_f64(struct gguf_context * ctx, int i) {
|
||||
double gguf_get_val_f64(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.float64;
|
||||
}
|
||||
|
||||
bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
|
||||
bool gguf_get_val_bool(const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.bool_;
|
||||
}
|
||||
|
||||
const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
|
||||
const char * gguf_get_val_str (const struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.str.data;
|
||||
}
|
||||
|
||||
int gguf_get_n_tensors(struct gguf_context * ctx) {
|
||||
int gguf_get_n_tensors(const struct gguf_context * ctx) {
|
||||
return ctx->header.n_tensors;
|
||||
}
|
||||
|
||||
int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
|
||||
int gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
|
||||
// return -1 if tensor not found
|
||||
int tensorfound = -1;
|
||||
|
||||
|
@ -20241,11 +20245,11 @@ int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
|
|||
return tensorfound;
|
||||
}
|
||||
|
||||
size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
|
||||
size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
|
||||
return ctx->infos[i].offset;
|
||||
}
|
||||
|
||||
char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
|
||||
char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
|
||||
return ctx->infos[i].name.data;
|
||||
}
|
||||
|
||||
|
@ -20528,7 +20532,7 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si
|
|||
buf->offset += el_size;
|
||||
}
|
||||
|
||||
static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
|
||||
static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
|
||||
// write header
|
||||
gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
|
||||
gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
|
||||
|
@ -20643,7 +20647,7 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
|
|||
}
|
||||
}
|
||||
|
||||
void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
|
||||
void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
|
||||
FILE * file = fopen(fname, "wb");
|
||||
if (!file) {
|
||||
GGML_ASSERT(false && "failed to open file for writing");
|
||||
|
@ -20660,7 +20664,7 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only
|
|||
fclose(file);
|
||||
}
|
||||
|
||||
size_t gguf_get_meta_size(struct gguf_context * ctx) {
|
||||
size_t gguf_get_meta_size(const struct gguf_context * ctx) {
|
||||
// no allocs - only compute size
|
||||
struct gguf_buf buf = gguf_buf_init(0);
|
||||
|
||||
|
@ -20669,7 +20673,7 @@ size_t gguf_get_meta_size(struct gguf_context * ctx) {
|
|||
return buf.offset;
|
||||
}
|
||||
|
||||
void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
|
||||
void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
|
||||
struct gguf_buf buf = gguf_buf_init(16*1024);
|
||||
|
||||
gguf_write_to_buf(ctx, &buf, true);
|
||||
|
@ -20745,6 +20749,14 @@ int ggml_cpu_has_arm_fma(void) {
|
|||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_metal(void) {
|
||||
#if defined(GGML_USE_METAL)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_f16c(void) {
|
||||
#if defined(__F16C__)
|
||||
return 1;
|
||||
|
|
74
ggml.h
74
ggml.h
|
@ -195,6 +195,14 @@
|
|||
# define GGML_DEPRECATED(func, hint) func
|
||||
#endif
|
||||
|
||||
#ifndef __GNUC__
|
||||
# define GGML_ATTRIBUTE_FORMAT(...)
|
||||
#elif defined(__MINGW32__)
|
||||
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
#else
|
||||
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
|
@ -270,7 +278,7 @@ extern "C" {
|
|||
|
||||
#if defined(__ARM_NEON) && defined(__CUDACC__)
|
||||
typedef half ggml_fp16_t;
|
||||
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||
#elif defined(__ARM_NEON)
|
||||
typedef __fp16 ggml_fp16_t;
|
||||
#else
|
||||
typedef uint16_t ggml_fp16_t;
|
||||
|
@ -685,6 +693,7 @@ extern "C" {
|
|||
|
||||
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
|
||||
GGML_ATTRIBUTE_FORMAT(2, 3)
|
||||
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
|
||||
|
||||
//
|
||||
|
@ -1866,39 +1875,39 @@ extern "C" {
|
|||
|
||||
GGML_API const char * gguf_type_name(enum gguf_type type);
|
||||
|
||||
GGML_API int gguf_get_version (struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_alignment (struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
|
||||
GGML_API void * gguf_get_data (struct gguf_context * ctx);
|
||||
GGML_API int gguf_get_version (const struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
|
||||
GGML_API void * gguf_get_data (const struct gguf_context * ctx);
|
||||
|
||||
GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
|
||||
GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
|
||||
GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
|
||||
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
|
||||
|
||||
GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
|
||||
GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
|
||||
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
|
||||
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
|
||||
|
||||
// results are undefined if the wrong type is used for the key
|
||||
GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i);
|
||||
GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i);
|
||||
GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i);
|
||||
GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i);
|
||||
GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
|
||||
GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
|
||||
GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
|
||||
GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i);
|
||||
GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i);
|
||||
GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i);
|
||||
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
|
||||
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
|
||||
GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
|
||||
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i);
|
||||
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i);
|
||||
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i);
|
||||
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i);
|
||||
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i);
|
||||
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i);
|
||||
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i);
|
||||
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i);
|
||||
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i);
|
||||
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i);
|
||||
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
|
||||
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i);
|
||||
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
||||
|
||||
GGML_API int gguf_get_n_tensors (struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name);
|
||||
GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
|
||||
GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i);
|
||||
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
|
||||
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
|
||||
GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
|
||||
|
||||
// overrides existing values or adds a new one
|
||||
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
|
||||
|
@ -1943,11 +1952,11 @@ extern "C" {
|
|||
//
|
||||
|
||||
// write the entire context to a binary file
|
||||
GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
|
||||
GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
|
||||
|
||||
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
|
||||
GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
|
||||
GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);
|
||||
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
|
||||
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
|
||||
|
||||
//
|
||||
// system info
|
||||
|
@ -1961,6 +1970,7 @@ extern "C" {
|
|||
GGML_API int ggml_cpu_has_fma (void);
|
||||
GGML_API int ggml_cpu_has_neon (void);
|
||||
GGML_API int ggml_cpu_has_arm_fma (void);
|
||||
GGML_API int ggml_cpu_has_metal (void);
|
||||
GGML_API int ggml_cpu_has_f16c (void);
|
||||
GGML_API int ggml_cpu_has_fp16_va (void);
|
||||
GGML_API int ggml_cpu_has_wasm_simd (void);
|
||||
|
|
|
@ -77,13 +77,14 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
|
|||
|
||||
|
||||
class MODEL_ARCH(IntEnum):
|
||||
LLAMA : int = auto()
|
||||
FALCON : int = auto()
|
||||
BAICHUAN:int = auto()
|
||||
GPT2 : int = auto()
|
||||
GPTJ : int = auto()
|
||||
GPTNEOX: int = auto()
|
||||
MPT : int = auto()
|
||||
LLAMA : int = auto()
|
||||
FALCON : int = auto()
|
||||
BAICHUAN : int = auto()
|
||||
GPT2 : int = auto()
|
||||
GPTJ : int = auto()
|
||||
GPTNEOX : int = auto()
|
||||
MPT : int = auto()
|
||||
STARCODER : int = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
|
@ -107,13 +108,14 @@ class MODEL_TENSOR(IntEnum):
|
|||
|
||||
|
||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.LLAMA: "llama",
|
||||
MODEL_ARCH.FALCON: "falcon",
|
||||
MODEL_ARCH.BAICHUAN:"baichuan",
|
||||
MODEL_ARCH.GPT2: "gpt2",
|
||||
MODEL_ARCH.GPTJ: "gptj",
|
||||
MODEL_ARCH.GPTNEOX: "gptneox",
|
||||
MODEL_ARCH.MPT: "mpt",
|
||||
MODEL_ARCH.LLAMA: "llama",
|
||||
MODEL_ARCH.FALCON: "falcon",
|
||||
MODEL_ARCH.BAICHUAN: "baichuan",
|
||||
MODEL_ARCH.GPT2: "gpt2",
|
||||
MODEL_ARCH.GPTJ: "gptj",
|
||||
MODEL_ARCH.GPTNEOX: "gptneox",
|
||||
MODEL_ARCH.MPT: "mpt",
|
||||
MODEL_ARCH.STARCODER: "starcoder",
|
||||
}
|
||||
|
||||
MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
|
||||
|
@ -171,6 +173,18 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
|
|||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.STARCODER: {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.POS_EMBD: "position_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.GPT2: {
|
||||
# TODO
|
||||
},
|
||||
|
|
387
llama.cpp
387
llama.cpp
|
@ -1,3 +1,4 @@
|
|||
#define LLAMA_API_INTERNAL
|
||||
#include "llama.h"
|
||||
|
||||
#include "ggml.h"
|
||||
|
@ -109,7 +110,7 @@ static size_t utf8_len(char src) {
|
|||
return lookup[highbits];
|
||||
}
|
||||
|
||||
void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
std::string result;
|
||||
for (size_t pos = 0; ; pos += search.length()) {
|
||||
auto new_pos = s.find(search, pos);
|
||||
|
@ -161,17 +162,19 @@ enum llm_arch {
|
|||
LLM_ARCH_GPTJ,
|
||||
LLM_ARCH_GPTNEOX,
|
||||
LLM_ARCH_MPT,
|
||||
LLM_ARCH_STARCODER,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_LLAMA, "llama" },
|
||||
{ LLM_ARCH_FALCON, "falcon" },
|
||||
{ LLM_ARCH_GPT2, "gpt2" },
|
||||
{ LLM_ARCH_GPTJ, "gptj" },
|
||||
{ LLM_ARCH_GPTNEOX, "gptneox" },
|
||||
{ LLM_ARCH_MPT, "mpt" },
|
||||
{ LLM_ARCH_BAICHUAN,"baichuan" },
|
||||
{ LLM_ARCH_LLAMA, "llama" },
|
||||
{ LLM_ARCH_FALCON, "falcon" },
|
||||
{ LLM_ARCH_GPT2, "gpt2" },
|
||||
{ LLM_ARCH_GPTJ, "gptj" },
|
||||
{ LLM_ARCH_GPTNEOX, "gptneox" },
|
||||
{ LLM_ARCH_MPT, "mpt" },
|
||||
{ LLM_ARCH_BAICHUAN, "baichuan" },
|
||||
{ LLM_ARCH_STARCODER, "starcoder" },
|
||||
};
|
||||
|
||||
enum llm_kv {
|
||||
|
@ -377,6 +380,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_STARCODER,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_POS_EMBD, "position_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
|
@ -900,9 +918,11 @@ static llama_state g_state;
|
|||
// available llama models
|
||||
enum e_model {
|
||||
MODEL_UNKNOWN,
|
||||
MODEL_1B,
|
||||
MODEL_3B,
|
||||
MODEL_7B,
|
||||
MODEL_13B,
|
||||
MODEL_15B,
|
||||
MODEL_30B,
|
||||
MODEL_34B,
|
||||
MODEL_40B,
|
||||
|
@ -971,13 +991,22 @@ struct llama_layer {
|
|||
struct ggml_tensor * wo;
|
||||
struct ggml_tensor * wqkv;
|
||||
|
||||
// attention bias
|
||||
struct ggml_tensor * bo;
|
||||
struct ggml_tensor * bqkv;
|
||||
|
||||
// normalization
|
||||
struct ggml_tensor * ffn_norm;
|
||||
struct ggml_tensor * ffn_norm_b;
|
||||
|
||||
// ff
|
||||
struct ggml_tensor * w1; // ffn_gate
|
||||
struct ggml_tensor * w2; // ffn_down
|
||||
struct ggml_tensor * w3; // ffn_up
|
||||
|
||||
// ff bias
|
||||
struct ggml_tensor * b2; // ffn_down
|
||||
struct ggml_tensor * b3; // ffn_up
|
||||
};
|
||||
|
||||
struct llama_kv_cache {
|
||||
|
@ -1055,6 +1084,7 @@ struct llama_model {
|
|||
llama_vocab vocab;
|
||||
|
||||
struct ggml_tensor * tok_embeddings;
|
||||
struct ggml_tensor * pos_embeddings;
|
||||
|
||||
struct ggml_tensor * output_norm;
|
||||
struct ggml_tensor * output_norm_b;
|
||||
|
@ -1566,7 +1596,7 @@ struct llama_model_loader {
|
|||
// load LLaMA models
|
||||
//
|
||||
|
||||
std::string llama_model_ftype_name(enum llama_ftype ftype) {
|
||||
static std::string llama_model_ftype_name(enum llama_ftype ftype) {
|
||||
if (ftype & LLAMA_FTYPE_GUESSED) {
|
||||
return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
|
||||
}
|
||||
|
@ -1599,9 +1629,11 @@ std::string llama_model_ftype_name(enum llama_ftype ftype) {
|
|||
|
||||
static const char * llama_model_type_name(e_model type) {
|
||||
switch (type) {
|
||||
case MODEL_1B: return "1B";
|
||||
case MODEL_3B: return "3B";
|
||||
case MODEL_7B: return "7B";
|
||||
case MODEL_13B: return "13B";
|
||||
case MODEL_15B: return "15B";
|
||||
case MODEL_30B: return "30B";
|
||||
case MODEL_34B: return "34B";
|
||||
case MODEL_40B: return "40B";
|
||||
|
@ -1719,6 +1751,17 @@ static void llm_load_hparams(
|
|||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_STARCODER:
|
||||
{
|
||||
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
|
||||
switch (hparams.n_layer) {
|
||||
case 24: model.type = e_model::MODEL_1B; break;
|
||||
case 36: model.type = e_model::MODEL_3B; break;
|
||||
case 42: model.type = e_model::MODEL_7B; break;
|
||||
case 40: model.type = e_model::MODEL_15B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
default: (void)0;
|
||||
};
|
||||
|
||||
|
@ -2172,6 +2215,85 @@ static void llm_load_tensors(
|
|||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_STARCODER:
|
||||
{
|
||||
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||
model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
|
||||
|
||||
// output
|
||||
{
|
||||
ggml_backend backend_norm;
|
||||
ggml_backend backend_output;
|
||||
|
||||
if (n_gpu_layers > int(n_layer)) {
|
||||
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
||||
// on Windows however this is detrimental unless everything is on the GPU
|
||||
#ifndef _WIN32
|
||||
backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
||||
#else
|
||||
backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
||||
#endif // _WIN32
|
||||
|
||||
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
|
||||
} else {
|
||||
backend_norm = GGML_BACKEND_CPU;
|
||||
backend_output = GGML_BACKEND_CPU;
|
||||
}
|
||||
|
||||
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
||||
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
||||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
|
||||
if (backend_norm == GGML_BACKEND_GPU) {
|
||||
vram_weights += ggml_nbytes(model.output_norm);
|
||||
vram_weights += ggml_nbytes(model.output_norm_b);
|
||||
}
|
||||
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
||||
vram_weights += ggml_nbytes(model.output);
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
||||
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
||||
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
|
||||
|
||||
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
||||
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
||||
layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
|
||||
|
||||
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
|
||||
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
vram_weights +=
|
||||
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
|
||||
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
|
||||
ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
|
||||
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
|
||||
ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) +
|
||||
ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3);
|
||||
}
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
};
|
||||
|
@ -3315,6 +3437,235 @@ static struct ggml_cgraph * llm_build_falcon(
|
|||
return gf;
|
||||
}
|
||||
|
||||
static struct ggml_cgraph * llm_build_starcoder(
|
||||
llama_context & lctx,
|
||||
const llama_token * tokens,
|
||||
const float * embd,
|
||||
int n_tokens,
|
||||
int n_past) {
|
||||
|
||||
GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
|
||||
|
||||
const int N = n_tokens;
|
||||
|
||||
const auto & model = lctx.model;
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const auto & kv_self = lctx.kv_self;
|
||||
|
||||
GGML_ASSERT(!!kv_self.ctx);
|
||||
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const int64_t n_layer = hparams.n_layer;
|
||||
const int64_t n_ctx = hparams.n_ctx;
|
||||
const int64_t n_head = hparams.n_head;
|
||||
const int64_t n_head_kv = hparams.n_head_kv;
|
||||
const int64_t n_embd_head = hparams.n_embd_head();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
const float norm_eps = hparams.f_norm_eps;
|
||||
|
||||
auto & buf_compute = lctx.buf_compute;
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_compute.size,
|
||||
/*.mem_buffer =*/ buf_compute.data,
|
||||
/*.no_alloc =*/ false,
|
||||
};
|
||||
|
||||
params.no_alloc = true;
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
|
||||
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * token;
|
||||
struct ggml_tensor * position;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
if (tokens) {
|
||||
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
|
||||
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
||||
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
|
||||
}
|
||||
ggml_set_name(inp_tokens, "inp_tokens");
|
||||
|
||||
token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
||||
} else {
|
||||
#ifdef GGML_USE_MPI
|
||||
GGML_ASSERT(false && "not implemented");
|
||||
#endif
|
||||
|
||||
token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
|
||||
|
||||
ggml_allocr_alloc(lctx.alloc, token);
|
||||
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||
memcpy(token->data, embd, N * n_embd * ggml_element_size(inpL));
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
// Compute position embeddings.
|
||||
struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
ggml_allocr_alloc(lctx.alloc, inp_positions);
|
||||
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||
for (int i = 0; i < N; ++i) {
|
||||
((int32_t *) inp_positions->data)[i] = n_past + i;
|
||||
}
|
||||
}
|
||||
ggml_set_name(inp_positions, "inp_positions");
|
||||
|
||||
position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
|
||||
}
|
||||
|
||||
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
||||
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
||||
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
||||
}
|
||||
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
||||
|
||||
inpL = ggml_add(ctx0, token, position);
|
||||
ggml_set_name(inpL, "inpL");
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
{
|
||||
// Norm
|
||||
cur = ggml_norm(ctx0, inpL, norm_eps);
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
|
||||
}
|
||||
|
||||
{
|
||||
// Self Attention
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
|
||||
|
||||
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
|
||||
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd);
|
||||
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
|
||||
|
||||
struct ggml_tensor * Qcur = tmpq;
|
||||
struct ggml_tensor * Kcur = tmpk;
|
||||
|
||||
{
|
||||
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
|
||||
ggml_set_name(Vcur, "Vcur");
|
||||
|
||||
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
|
||||
ggml_set_name(k, "k");
|
||||
|
||||
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
|
||||
( n_ctx)*ggml_element_size(kv_self.v),
|
||||
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
||||
}
|
||||
|
||||
struct ggml_tensor * Q =
|
||||
ggml_permute(ctx0,
|
||||
ggml_cpy(ctx0,
|
||||
Qcur,
|
||||
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, N)),
|
||||
0, 2, 1, 3);
|
||||
ggml_set_name(Q, "Q");
|
||||
|
||||
struct ggml_tensor * K =
|
||||
ggml_view_3d(ctx0, kv_self.k,
|
||||
n_embd_head, n_past + N, n_head_kv,
|
||||
ggml_element_size(kv_self.k)*n_embd_gqa,
|
||||
ggml_element_size(kv_self.k)*n_embd_head,
|
||||
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
||||
ggml_set_name(K, "K");
|
||||
|
||||
// K * Q
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
ggml_set_name(KQ, "KQ");
|
||||
|
||||
// KQ_scaled = KQ / sqrt(n_embd_head)
|
||||
// KQ_scaled shape [n_past + N, N, n_head, 1]
|
||||
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
||||
ggml_set_name(KQ_scaled, "KQ_scaled");
|
||||
|
||||
// KQ_masked = mask_past(KQ_scaled)
|
||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
||||
ggml_set_name(KQ_masked, "KQ_masked");
|
||||
|
||||
// KQ = soft_max(KQ_masked)
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
||||
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
||||
|
||||
// split cached V into n_head heads
|
||||
struct ggml_tensor * V =
|
||||
ggml_view_3d(ctx0, kv_self.v,
|
||||
n_past + N, n_embd_head, n_head_kv,
|
||||
ggml_element_size(kv_self.v)*n_ctx,
|
||||
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
||||
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
||||
ggml_set_name(V, "V");
|
||||
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
||||
ggml_set_name(KQV, "KQV");
|
||||
|
||||
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||
ggml_set_name(KQV_merged, "KQV_merged");
|
||||
|
||||
// cur = KQV_merged.contiguous().view(n_embd, N)
|
||||
cur = ggml_cpy(ctx0,
|
||||
KQV_merged,
|
||||
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
||||
ggml_set_name(cur, "KQV_merged_contiguous");
|
||||
}
|
||||
|
||||
// Projection
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
|
||||
|
||||
// Add the input
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
|
||||
struct ggml_tensor * inpFF = cur;
|
||||
|
||||
// FF
|
||||
{
|
||||
// Norm
|
||||
{
|
||||
cur = ggml_norm(ctx0, inpFF, norm_eps);
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
|
||||
|
||||
// GELU activation
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
|
||||
// Projection
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
|
||||
}
|
||||
|
||||
inpL = ggml_add(ctx0, cur, inpFF);
|
||||
}
|
||||
|
||||
// Output Norm
|
||||
{
|
||||
cur = ggml_norm(ctx0, inpL, norm_eps);
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
|
||||
}
|
||||
ggml_set_name(cur, "result_norm");
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
ggml_set_name(cur, "result_output");
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
ggml_free(ctx0);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph(
|
||||
llama_context & lctx,
|
||||
const llama_token * tokens,
|
||||
|
@ -3338,6 +3689,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|||
{
|
||||
result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
|
||||
} break;
|
||||
case LLM_ARCH_STARCODER:
|
||||
{
|
||||
result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past);
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
};
|
||||
|
@ -3951,7 +4306,7 @@ struct llama_grammar_candidate {
|
|||
|
||||
// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
|
||||
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
|
||||
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
||||
static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
||||
const char * src,
|
||||
llama_partial_utf8 partial_start) {
|
||||
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
|
||||
|
@ -5552,7 +5907,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
}
|
||||
|
||||
// TODO: after the GGUF PR, this likely won't work and needs to be updated
|
||||
int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
|
||||
static int llama_apply_lora_from_file_internal(
|
||||
const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
|
||||
) {
|
||||
LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
||||
|
||||
const int64_t t_start_lora_us = ggml_time_us();
|
||||
|
@ -6099,7 +6456,7 @@ struct llama_context * llama_new_context_with_model(
|
|||
return ctx;
|
||||
}
|
||||
|
||||
struct llama_context * llama_init_from_file(
|
||||
static struct llama_context * llama_init_from_file(
|
||||
const char * path_model,
|
||||
struct llama_context_params params) {
|
||||
struct llama_model * model = llama_load_model_from_file(path_model, params);
|
||||
|
@ -6304,7 +6661,7 @@ struct llama_data_file_context : llama_data_context {
|
|||
* llama_copy_state_data(ctx, &data_ctx);
|
||||
*
|
||||
*/
|
||||
void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
|
||||
static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
|
||||
// copy rng
|
||||
{
|
||||
std::stringstream rng_ss;
|
||||
|
@ -6842,7 +7199,9 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
|
|||
}
|
||||
|
||||
// For internal test use
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
|
||||
struct llama_context * ctx
|
||||
) {
|
||||
return ctx->model.tensors_by_name;
|
||||
}
|
||||
|
||||
|
|
4
llama.h
4
llama.h
|
@ -540,7 +540,9 @@ extern "C" {
|
|||
|
||||
struct ggml_tensor;
|
||||
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
|
||||
struct llama_context * ctx
|
||||
);
|
||||
|
||||
#endif // LLAMA_API_INTERNAL
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
typedef int codepoint;
|
||||
|
||||
std::string codepoint_to_utf8(codepoint cp) {
|
||||
static std::string codepoint_to_utf8(codepoint cp) {
|
||||
std::string result;
|
||||
if (0x00 <= cp && cp <= 0x7f) {
|
||||
result.push_back(cp);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue