Compare commits: master ... gg/llama-s

15 commits:

| SHA1 |
|---|
| a97b3621cf |
| afd40ea206 |
| 36803b1902 |
| a59ee7c4eb |
| 10eb87409e |
| f65e3d324d |
| 439e68c1e5 |
| 34889bf810 |
| e159e7751c |
| 9a735ae6d8 |
| 82caffa74e |
| 32e7b9dc99 |
| 0127774ae4 |
| 0bebe45a25 |
| 168324a388 |

47 changed files with 997 additions and 1042 deletions
@@ -13,6 +13,9 @@
 # # with SYCL support
 # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with METAL support
+# GG_BUILD_METAL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
 # # with VULKAN support
 # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
@@ -15,6 +15,14 @@ function(llama_add_compile_flags)

         list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)

+        if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+            list(APPEND CXX_FLAGS -Wshadow)
+
+            if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+                list(APPEND CXX_FLAGS -Wshadow-field-in-constructor)
+            endif()
+        endif()
+
         list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)

         list(APPEND C_FLAGS ${WARNING_FLAGS})
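Note: the new `-Wshadow` flag (plus `-Wshadow-field-in-constructor` on Clang) is what drives most of the renames in the rest of this compare. As a rough illustration, not taken from this diff, this is the pattern the warning rejects and the trailing-underscore convention the branch adopts to avoid it:

```cpp
// Illustrative sketch only: what -Wshadow / -Wshadow-field-in-constructor complain
// about, and the rename pattern used throughout this branch.
struct widget {
    int size;

    // warns: constructor parameter 'size' shadows the field 'size'
    // widget(int size) : size(size) {}

    // accepted: the parameter gets a trailing underscore, so nothing is shadowed
    widget(int size_) : size(size_) {}
};
```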
common/arg.cpp (962 changes): diff suppressed because it is too large.

common/arg.h (46 changes)
@@ -25,43 +25,43 @@ struct common_arg {
     void (*handler_int) (common_params & params, int) = nullptr;

     common_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
+        const std::initializer_list<const char *> & args_,
+        const char * value_hint_,
+        const std::string & help_,
         void (*handler)(common_params & params, const std::string &)
-    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
+    ) : args(args_), value_hint(value_hint_), help(help_), handler_string(handler) {}

     common_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
+        const std::initializer_list<const char *> & args_,
+        const char * value_hint_,
+        const std::string & help_,
         void (*handler)(common_params & params, int)
-    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
+    ) : args(args_), value_hint(value_hint_), help(help_), handler_int(handler) {}

     common_arg(
-        const std::initializer_list<const char *> & args,
-        const std::string & help,
+        const std::initializer_list<const char *> & args_,
+        const std::string & help_,
         void (*handler)(common_params & params)
-    ) : args(args), help(help), handler_void(handler) {}
+    ) : args(args_), help(help_), handler_void(handler) {}

     // support 2 values for arg
     common_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const char * value_hint_2,
-        const std::string & help,
+        const std::initializer_list<const char *> & args_,
+        const char * value_hint_,
+        const char * value_hint_2_,
+        const std::string & help_,
         void (*handler)(common_params & params, const std::string &, const std::string &)
-    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
+    ) : args(args_), value_hint(value_hint_), value_hint_2(value_hint_2_), help(help_), handler_str_str(handler) {}

-    common_arg & set_examples(std::initializer_list<enum llama_example> examples);
-    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
-    common_arg & set_env(const char * env);
+    common_arg & set_examples(std::initializer_list<enum llama_example> vals);
+    common_arg & set_excludes(std::initializer_list<enum llama_example> vals);
+    common_arg & set_env(const char * val);
     common_arg & set_sparam();
     bool in_example(enum llama_example ex);
     bool is_exclude(enum llama_example ex);
-    bool get_value_from_env(std::string & output);
-    bool has_value_from_env();
-    std::string to_string();
+    bool get_value_from_env(std::string & output) const;
+    bool has_value_from_env() const;
+    std::string to_string() const;
 };

 struct common_params_context {
@@ -69,7 +69,7 @@ struct common_params_context {
     common_params & params;
     std::vector<common_arg> options;
     void(*print_usage)(int, char **) = nullptr;
-    common_params_context(common_params & params) : params(params) {}
+    common_params_context(common_params & params_) : params(params_) {}
 };

 // parse input arguments from CLI
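Alongside the shadow-driven parameter renames, the query-style members of `common_arg` are now `const`-qualified. A minimal sketch (hypothetical type, not from the header) of why that matters once options are handled through const references:

```cpp
// Minimal sketch, assuming a hypothetical option type: a const-qualified method
// can be called through a const reference, a non-const one cannot.
#include <string>

struct opt {
    std::string help;
    std::string to_string() const { return help; } // usable on a const opt
};

std::string describe(const opt & o) {
    return o.to_string(); // would not compile if to_string() were non-const
}
```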
@@ -763,9 +763,11 @@ bool fs_create_directory_with_parents(const std::string & path) {
     return true;
 #else
     // if the path already exists, check whether it's a directory
-    struct stat info;
-    if (stat(path.c_str(), &info) == 0) {
-        return S_ISDIR(info.st_mode);
+    {
+        struct stat info;
+        if (stat(path.c_str(), &info) == 0) {
+            return S_ISDIR(info.st_mode);
+        }
     }

     size_t pos_slash = 1; // skip leading slashes for directory creation
@@ -796,7 +798,7 @@ bool fs_create_directory_with_parents(const std::string & path) {
 }

 std::string fs_get_cache_directory() {
-    std::string cache_directory = "";
+    std::string cache_directory;
     auto ensure_trailing_slash = [](std::string p) {
         // Make sure to add trailing slash
         if (p.back() != DIRECTORY_SEPARATOR) {
@@ -1206,7 +1208,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
     {
         typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
         auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
+            common_load_model_from_url_headers * cur = (common_load_model_from_url_headers *) userdata;

             static std::regex header_regex("([^:]+): (.*)\r\n");
             static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1218,9 +1220,9 @@ static bool common_download_file(const std::string & url, const std::string & pa
                 const std::string & key = match[1];
                 const std::string & value = match[2];
                 if (std::regex_match(key, match, etag_regex)) {
-                    headers->etag = value;
+                    cur->etag = value;
                 } else if (std::regex_match(key, match, last_modified_regex)) {
-                    headers->last_modified = value;
+                    cur->last_modified = value;
                 }
             }
             return n_items;
@@ -1292,18 +1294,18 @@ static bool common_download_file(const std::string & url, const std::string & pa
         curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);

         // helper function to hide password in URL
-        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
-            std::size_t protocol_pos = url.find("://");
+        auto llama_download_hide_password_in_url = [](const std::string & url_full) -> std::string {
+            std::size_t protocol_pos = url_full.find("://");
             if (protocol_pos == std::string::npos) {
-                return url; // Malformed URL
+                return url_full; // Malformed URL
             }

-            std::size_t at_pos = url.find('@', protocol_pos + 3);
+            std::size_t at_pos = url_full.find('@', protocol_pos + 3);
             if (at_pos == std::string::npos) {
-                return url; // No password in URL
+                return url_full; // No password in URL
             }

-            return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
+            return url_full.substr(0, protocol_pos + 3) + "********" + url_full.substr(at_pos);
         };

         // start the download
@@ -43,7 +43,7 @@ namespace console {
     static bool simple_io = true;
     static display_t current_display = reset;

-    static FILE* out = stdout;
+    static FILE* fout = stdout;

 #if defined (_WIN32)
     static void* hConsole;
@@ -110,7 +110,7 @@ namespace console {

             tty = fopen("/dev/tty", "w+");
             if (tty != nullptr) {
-                out = tty;
+                fout = tty;
             }
         }

@@ -126,7 +126,7 @@ namespace console {
         // Restore settings on POSIX systems
         if (!simple_io) {
             if (tty != nullptr) {
-                out = stdout;
+                fout = stdout;
                 fclose(tty);
                 tty = nullptr;
             }
@@ -145,19 +145,19 @@ namespace console {
             fflush(stdout);
             switch(display) {
                 case reset:
-                    fprintf(out, ANSI_COLOR_RESET);
+                    fprintf(fout, ANSI_COLOR_RESET);
                     break;
                 case prompt:
-                    fprintf(out, ANSI_COLOR_YELLOW);
+                    fprintf(fout, ANSI_COLOR_YELLOW);
                     break;
                 case user_input:
-                    fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
+                    fprintf(fout, ANSI_BOLD ANSI_COLOR_GREEN);
                     break;
                 case error:
-                    fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
+                    fprintf(fout, ANSI_BOLD ANSI_COLOR_RED);
             }
             current_display = display;
-            fflush(out);
+            fflush(fout);
         }
     }

@@ -233,7 +233,7 @@ namespace console {
             return;
         }
 #endif
-        putc('\b', out);
+        putc('\b', fout);
     }

     static int estimateWidth(char32_t codepoint) {
@@ -274,7 +274,7 @@ namespace console {
 #else
         // We can trust expectedWidth if we've got one
         if (expectedWidth >= 0 || tty == nullptr) {
-            fwrite(utf8_codepoint, length, 1, out);
+            fwrite(utf8_codepoint, length, 1, fout);
             return expectedWidth;
         }

@@ -311,7 +311,7 @@ namespace console {
             pop_cursor();
             put_codepoint(&ch, 1, 1);
 #else
-            fprintf(out, "\b%c", ch);
+            fprintf(fout, "\b%c", ch);
 #endif
         }

@@ -353,7 +353,7 @@ namespace console {
     }

     static bool readline_advanced(std::string & line, bool multiline_input) {
-        if (out != stdout) {
+        if (fout != stdout) {
             fflush(stdout);
         }

@@ -364,7 +364,7 @@ namespace console {

         char32_t input_char;
         while (true) {
-            fflush(out); // Ensure all output is displayed before waiting for input
+            fflush(fout); // Ensure all output is displayed before waiting for input
             input_char = getchar32();

             if (input_char == '\r' || input_char == '\n') {
@@ -432,7 +432,7 @@ namespace console {
                     line.pop_back();
                     if (last == '\\') {
                         line += '\n';
-                        fputc('\n', out);
+                        fputc('\n', fout);
                         has_more = !has_more;
                     } else {
                         // llama will just eat the single space, it won't act as a space
@@ -447,11 +447,11 @@ namespace console {
                         has_more = false;
                     } else {
                         line += '\n';
-                        fputc('\n', out);
+                        fputc('\n', fout);
                     }
                 }

-        fflush(out);
+        fflush(fout);
         return has_more;
     }

@@ -579,8 +579,8 @@ private:
             seq.back().second = false;
         } else {
             std::string literal;
-            auto is_non_literal = [&](char c) {
-                return NON_LITERAL_SET.find(c) != NON_LITERAL_SET.end();
+            auto is_non_literal = [&](char ch) {
+                return NON_LITERAL_SET.find(ch) != NON_LITERAL_SET.end();
             };
             while (i < length) {
                 if (sub_pattern[i] == '\\' && i < length - 1) {
@@ -255,8 +255,8 @@ public:
         thrd = std::thread([this]() {
             while (true) {
                 {
-                    std::unique_lock<std::mutex> lock(mtx);
-                    cv.wait(lock, [this]() { return head != tail; });
+                    std::unique_lock<std::mutex> lock_thrd(mtx);
+                    cv.wait(lock_thrd, [this]() { return head != tail; });

                     cur = entries[head];

@@ -338,16 +338,16 @@ public:
         resume();
     }

-    void set_prefix(bool prefix) {
+    void set_prefix(bool val) {
         std::lock_guard<std::mutex> lock(mtx);

-        this->prefix = prefix;
+        prefix = val;
     }

-    void set_timestamps(bool timestamps) {
+    void set_timestamps(bool val) {
         std::lock_guard<std::mutex> lock(mtx);

-        this->timestamps = timestamps;
+        timestamps = val;
     }
 };

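The `this->` assignments in these setters disappear because the parameters no longer reuse the member names. A small illustrative sketch (not from the file) of the two spellings:

```cpp
// Illustrative sketch: when a parameter shadows a member, 'this->' is the only way
// to reach the member; renaming the parameter removes the shadowing entirely.
struct logger_opts {
    bool prefix = false;

    // old style: parameter shadows the member, so 'this->' is mandatory (and -Wshadow fires)
    // void set_prefix(bool prefix) { this->prefix = prefix; }

    // new style: distinct parameter name, plain assignment, no warning
    void set_prefix(bool val) { prefix = val; }
};
```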
@@ -62,7 +62,7 @@ int main(int argc, char ** argv) {
     llama_batch batch = llama_batch_init(n_kv_max, 0, 1);

     // decode in batches of ctx_params.n_batch tokens
-    auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
+    auto decode_helper = [&ctx, &batch](int32_t n_batch) {
         for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
             const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

@@ -94,7 +94,7 @@ int main(int argc, char ** argv) {
         common_batch_add(batch, 0, i, { 0 }, false);
     }

-    if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+    if (!decode_helper(ctx_params.n_batch)) {
         LOG_ERR("%s: llama_decode() failed\n", __func__);
         return 1;
     }
@@ -134,7 +134,7 @@ int main(int argc, char ** argv) {

         llama_kv_cache_clear(ctx);

-        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+        if (!decode_helper(ctx_params.n_batch)) {
             LOG_ERR("%s: llama_decode() failed\n", __func__);
             return 1;
         }
@@ -156,7 +156,7 @@ int main(int argc, char ** argv) {
                 common_batch_add(batch, 0, pp + i, { j }, true);
             }

-            if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+            if (!decode_helper(ctx_params.n_batch)) {
                 LOG_ERR("%s: llama_decode() failed\n", __func__);
                 return 1;
             }

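The `decode_helper` lambda now captures `ctx` and `batch` by reference instead of taking them as parameters, so the parameter names no longer shadow the locals in `main()` and the call sites only pass the batch size. A simplified sketch of the pattern, using a stand-in type rather than the real llama API:

```cpp
// Simplified sketch of the change: capturing the enclosing locals by reference
// avoids shadowing their names and shortens every call site.
#include <cstdint>

struct toy_batch { int32_t n_tokens = 0; };

int main() {
    toy_batch batch;
    batch.n_tokens = 100;

    // before: auto decode_helper = [](toy_batch & batch, int32_t n_batch) { ... };
    // after:
    auto decode_helper = [&batch](int32_t n_batch) {
        // pretend to process batch.n_tokens in chunks of n_batch
        return batch.n_tokens > 0 && n_batch > 0;
    };

    return decode_helper(32) ? 0 : 1;
}
```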
@@ -471,12 +471,12 @@ struct my_llama_file {
         GGML_ASSERT(ret == 0); // same
     }

-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
+    void read_raw(void * raw_addr, size_t raw_size) {
+        if (raw_size == 0) {
             return;
         }
         errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
+        std::size_t ret = std::fread(raw_addr, raw_size, 1, fp);
         if (ferror(fp)) {
             die_fmt("fread failed: %s", strerror(errno));
         }
@@ -66,7 +66,7 @@ struct file_input {
     float alpha;
     float scale;

-    file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) {
+    file_input(std::string & fname, float scale_): f_in(fname, std::ios::binary), scale(scale_) {
         if (!f_in.is_open()) {
             throw std::runtime_error("failed to open input gguf from " + fname);
         }
@@ -131,7 +131,7 @@ struct lora_merge_ctx {
             std::string & base_fname,
             std::vector<common_adapter_lora_info> & lora_files,
             std::string & outfile,
-            int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
+            int n_threads_) : base_model(base_fname, 0), n_threads(n_threads_), fout(outfile, std::ios::binary) {
         fout.exceptions(std::ofstream::failbit); // fail fast on write errors

         if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) {
@@ -157,7 +157,7 @@ struct lora_merge_ctx {
             allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
     }

-    void check_metadata_lora(file_input * adapter) {
+    void check_metadata_lora(const file_input * adapter) const {
         auto general_type = get_kv_str(adapter->ctx_gguf, "general.type");
         if (general_type != "adapter") {
             throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
@@ -175,7 +175,7 @@ struct lora_merge_ctx {
         }
     }

-    ggml_type get_out_tensor_type(struct ggml_tensor * t) {
+    static ggml_type get_out_tensor_type(struct ggml_tensor * t) {
         if (t->type == GGML_TYPE_F32) {
             return GGML_TYPE_F32;
         } else {
@@ -60,13 +60,6 @@ int main(int argc, char** argv) {
     const std::string grammar_filename = argv[1];
     const std::string input_filename = argv[2];

-    // Read the GBNF grammar file
-    FILE* grammar_file = fopen(grammar_filename.c_str(), "r");
-    if (!grammar_file) {
-        fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str());
-        return 1;
-    }
-
     std::string grammar_str;
     {
         std::ifstream grammar_file(grammar_filename);
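The deleted block was a leftover C-style `fopen` check; the grammar file is read immediately below through a `std::ifstream`, which handles open-failure and cleanup on its own. For reference, a typical read-whole-file-with-ifstream helper looks roughly like this (a sketch, not code from the example):

```cpp
// Sketch only: reading an entire file into a std::string via ifstream, with no
// manual fopen/fclose bookkeeping.
#include <fstream>
#include <sstream>
#include <string>

static std::string read_file(const std::string & path) {
    std::ifstream in(path);   // closed automatically when 'in' goes out of scope
    std::ostringstream ss;
    ss << in.rdbuf();         // stream the full contents into the buffer
    return ss.str();
}
```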
@@ -204,14 +204,14 @@ struct split_strategy {
     // temporary buffer for reading in tensor data
     std::vector<uint8_t> read_buf;

-    split_strategy(const split_params & params,
-            std::ifstream & f_input,
-            struct gguf_context * ctx_gguf,
-            struct ggml_context * ctx_meta) :
-        params(params),
-        f_input(f_input),
-        ctx_gguf(ctx_gguf),
-        ctx_meta(ctx_meta),
+    split_strategy(const split_params & params_,
+            std::ifstream & f_input_,
+            struct gguf_context * ctx_gguf_,
+            struct ggml_context * ctx_meta_) :
+        params(params_),
+        f_input(f_input_),
+        ctx_gguf(ctx_gguf_),
+        ctx_meta(ctx_meta_),
         n_tensors(gguf_get_n_tensors(ctx_gguf)) {

         // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
@@ -204,13 +204,15 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
             __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);

         // print first 10 elements
-        const float * data = (const float *) cur->data;
+        {
+            const float * data = (const float *) cur->data;

             printf("%s data[:10] : ", name);
             for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
                 printf("%f ", data[j]);
+            }
+            printf("\n\n");
         }
-        printf("\n\n");

         // check data
         if (check_data) {
@@ -294,7 +294,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
 bool IMatrixCollector::load_imatrix(const char * fname) {
     std::ifstream in(fname, std::ios::binary);
     if (!in) {
-        LOG_ERR("%s: failed to open %s\n",__func__, fname);
+        LOG_ERR("%s: failed to open %s\n", __func__, fname);
         return false;
     }
     int n_entries;
@@ -308,7 +308,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
         std::vector<char> name_as_vec(len+1);
         in.read((char *)name_as_vec.data(), len);
         if (in.fail()) {
-            LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
+            LOG_ERR("%s: failed reading name for entry %d from %s\n", __func__, i + 1, fname);
             return false;
         }
         name_as_vec[len] = 0;
@@ -319,7 +319,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
         int nval;
         in.read((char *)&nval, sizeof(nval));
         if (in.fail() || nval < 1) {
-            LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
+            LOG_ERR("%s: failed reading number of values for entry %d\n", __func__, i);
             m_stats = {};
             return false;
         }
@@ -332,15 +332,15 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
         std::vector<float> tmp(nval);
         in.read((char*)tmp.data(), nval*sizeof(float));
         if (in.fail()) {
-            LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
+            LOG_ERR("%s: failed reading data for entry %d\n", __func__, i);
             m_stats = {};
             return false;
         }

         // Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
-        for (int i = 0; i < nval; i++) {
-            e.values[i] += tmp[i];
-            e.counts[i] += ncall;
+        for (int j = 0; j < nval; j++) {
+            e.values[j] += tmp[j];
+            e.counts[j] += ncall;
         }
         e.ncall += ncall;

@@ -488,12 +488,10 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
         logits.reserve((size_t)n_ctx * n_vocab);
     }

-    for (int i = 0; i < n_chunk; ++i) {
-        const int start = i * n_ctx;
+    for (int ich = 0; ich < n_chunk; ++ich) {
+        const int start = ich * n_ctx;
         const int end = start + n_ctx;

-        std::vector<float> logits;
-
         const auto t_start = std::chrono::high_resolution_clock::now();

         // clear the KV cache
@@ -537,7 +535,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {

         const auto t_end = std::chrono::high_resolution_clock::now();

-        if (i == 0) {
+        if (ich == 0) {
             const float t_total = std::chrono::duration<float>(t_end - t_start).count();
             LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
             int total_seconds = (int)(t_total * n_chunk);
@@ -555,7 +553,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
                 workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
             count += n_ctx - first - 1;

-            LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
+            LOG("[%d]%.4lf,", ich + 1, std::exp(nll / count));
             fflush(stdout);

             logits.clear();
@@ -462,14 +462,14 @@ int main(int argc, char ** argv) {
                 }

                 // tokenize new prefix and suffix
-                std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
-                std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
+                std::vector<llama_token> inp_pfx_cur = common_tokenize(ctx, params.input_prefix, false);
+                std::vector<llama_token> inp_sfx_cur = common_tokenize(ctx, params.input_suffix, false);

-                inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
-                inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
+                inp_pfx_cur.insert(inp_pfx_cur.begin(), llama_vocab_fim_pre(vocab));
+                inp_sfx_cur.insert(inp_sfx_cur.begin(), llama_vocab_fim_suf(vocab));

-                embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
-                embd_end = params.spm_infill ? inp_pfx : inp_sfx;
+                embd_inp = params.spm_infill ? inp_sfx_cur : inp_pfx_cur;
+                embd_end = params.spm_infill ? inp_pfx_cur : inp_sfx_cur;
                 if (add_bos) {
                     embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
                 }
@@ -548,11 +548,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             GGML_ASSERT(split_arg.size() <= llama_max_devices());

             std::vector<float> tensor_split(llama_max_devices());
-            for (size_t i = 0; i < llama_max_devices(); ++i) {
-                if (i < split_arg.size()) {
-                    tensor_split[i] = std::stof(split_arg[i]);
+            for (size_t is = 0; is < llama_max_devices(); ++is) {
+                if (is < split_arg.size()) {
+                    tensor_split[is] = std::stof(split_arg[is]);
                 } else {
-                    tensor_split[i] = 0.0f;
+                    tensor_split[is] = 0.0f;
                 }
             }
             params.tensor_split.push_back(tensor_split);
@@ -1039,41 +1039,40 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         }

         { // attention
-            int hidden_size = 4096;
-            const int d_head = 128;
-            int n_head = hidden_size/d_head;
+            int hidden_size_cur = 4096;
             int num_query = 96;
             if (ctx->minicpmv_version == 2) {
-                hidden_size = 4096;
-                n_head = hidden_size/d_head;
+                hidden_size_cur = 4096;
                 num_query = 96;
             }
             else if (ctx->minicpmv_version == 3) {
-                hidden_size = 3584;
-                n_head = hidden_size/d_head;
+                hidden_size_cur = 3584;
                 num_query = 64;
             }

+            const int d_head_cur = 128;
+            const int n_head_cur = hidden_size_cur/d_head_cur;
+
             struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
-            Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
+            Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head_cur));
             struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
             struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
             // permute
-            Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
+            Q = ggml_reshape_4d(ctx0, Q, d_head_cur, n_head_cur, num_query, batch_size);
             Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
-            Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
-            K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+            Q = ggml_reshape_3d(ctx0, Q, d_head_cur, num_query, n_head_cur * batch_size);
+            K = ggml_reshape_4d(ctx0, K, d_head_cur, n_head_cur, num_positions, batch_size);
             K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
-            K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
-            V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
+            K = ggml_reshape_3d(ctx0, K, d_head_cur, num_positions, n_head_cur * batch_size);
+            V = ggml_reshape_4d(ctx0, V, d_head_cur, n_head_cur, num_positions, batch_size);
             V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
-            V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
+            V = ggml_reshape_3d(ctx0, V, num_positions, d_head_cur, n_head_cur * batch_size);
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
             KQ = ggml_soft_max_inplace(ctx0, KQ);
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
-            KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
+            KQV = ggml_reshape_4d(ctx0, KQV, d_head_cur, num_query, n_head_cur, batch_size);
             KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-            KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
+            KQV = ggml_cont_3d(ctx0, KQV, hidden_size_cur, num_query, batch_size);

             embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
         }
@@ -1113,12 +1112,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     struct ggml_context * meta = NULL;

-    struct gguf_init_params params = {
+    struct gguf_init_params params_meta = {
         /*.no_alloc = */ true,
         /*.ctx = */ &meta,
     };

-    struct gguf_context * ctx = gguf_init_from_file(fname, params);
+    struct gguf_context * ctx = gguf_init_from_file(fname, params_meta);
     if (!ctx) {
         throw std::runtime_error(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
     }
@@ -1310,13 +1309,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     // load tensors
     {
         std::vector<uint8_t> read_buf;
-        struct ggml_init_params params = {
+        struct ggml_init_params params_data = {
             /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(),
             /*.mem_buffer =*/ NULL,
             /*.no_alloc =*/ true,
         };

-        new_clip->ctx_data = ggml_init(params);
+        new_clip->ctx_data = ggml_init(params_data);
         if (!new_clip->ctx_data) {
             LOG_ERR("%s: ggml_init() failed\n", __func__);
             clip_free(new_clip);
@@ -2083,7 +2082,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
     }
     else if (ctx->has_qwen2vl_merger) {
         clip_image_u8 * resized = clip_image_u8_init();
-        auto patch_size = clip_patch_size(ctx) * 2;
+        auto patch_size = clip_get_patch_size(ctx) * 2;
         int nx = ceil((float)img->nx / patch_size) * patch_size;
         int ny = ceil((float)img->ny / patch_size) * patch_size;
         bicubic_resize(*img, *resized, nx, ny);
@@ -2294,15 +2293,15 @@ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w
     return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }

-int32_t clip_image_size(const struct clip_ctx * ctx) {
+int32_t clip_get_image_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.image_size;
 }

-int32_t clip_patch_size(const struct clip_ctx * ctx) {
+int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.patch_size;
 }

-int32_t clip_hidden_size(const struct clip_ctx * ctx) {
+int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.hidden_size;
 }

@@ -47,9 +47,9 @@ CLIP_API void clip_free(struct clip_ctx * ctx);
 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
 CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);

-CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
-CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
-CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
+CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
+CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
+CLIP_API int32_t clip_get_hidden_size(const struct clip_ctx * ctx);

 // TODO: should be enum, not string
 CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
@@ -105,8 +105,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
         struct ggml_context * ctx;
     } model;

-    const int32_t image_size = clip_image_size(ctx_clip);
-    const int32_t patch_size = clip_patch_size(ctx_clip);
+    const int32_t image_size = clip_get_image_size(ctx_clip);
+    const int32_t patch_size = clip_get_patch_size(ctx_clip);

     int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)

@@ -353,7 +353,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         img_res_v.size = 0;
         img_res_v.data = nullptr;

-        const int32_t image_size = clip_image_size(ctx_clip);
+        const int32_t image_size = clip_get_image_size(ctx_clip);

         struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);

@@ -348,8 +348,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params

     LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);

-    for (int i = 0; i < n_chunk; ++i) {
-        const int start = i * params.ppl_stride;
+    for (int ich = 0; ich < n_chunk; ++ich) {
+        const int start = ich * params.ppl_stride;
         const int end = start + calc_chunk;

         const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
@@ -400,7 +400,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params

         const auto t_end = std::chrono::high_resolution_clock::now();

-        if (i == 0) {
+        if (ich == 0) {
             const float t_total = std::chrono::duration<float>(t_end - t_start).count();
             LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
             int total_seconds = (int)(t_total * n_chunk);
@@ -427,9 +427,9 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
         }
         // perplexity is e^(average negative log-likelihood)
         if (params.ppl_output_type == 0) {
-            LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
+            LOG("[%d]%.4lf,", ich + 1, std::exp(nll / count));
         } else {
-            LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
+            LOG("%8d %.4lf\n", ich*params.ppl_stride, std::exp(nll / count));
         }
     }
     LOG("\n");
@@ -659,7 +659,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &

 static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int n_batch, int n_vocab) {
     int prev_outputs = 0;
-    for (int i = 0; i < (int) batch.n_tokens; i += n_batch) {
+    for (int i = 0; i < batch.n_tokens; i += n_batch) {
         const int n_tokens = std::min<int>(n_batch, batch.n_tokens - i);

         llama_batch batch_view = {
@@ -679,8 +679,8 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
         }

         int n_outputs = 0;
-        for (int i = 0; i < n_tokens; ++i) {
-            n_outputs += batch_view.logits[i] != 0;
+        for (int iv = 0; iv < n_tokens; ++iv) {
+            n_outputs += batch_view.logits[iv] != 0;
         }

         memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float));
@@ -1752,14 +1752,14 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
     auto kld_ptr = kld_values.data();
     auto p_diff_ptr = p_diff_values.data();

-    for (int i = 0; i < n_chunk; ++i) {
-        const int start = i * n_ctx;
+    for (int ich = 0; ich < n_chunk; ++ich) {
+        const int start = ich * n_ctx;
         const int end = start + n_ctx;

         const auto t_start = std::chrono::high_resolution_clock::now();

         if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
-            LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
+            LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, ich);
             return;
         }

@@ -1804,7 +1804,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {

         const auto t_end = std::chrono::high_resolution_clock::now();

-        if (i == 0) {
+        if (ich == 0) {
             const float t_total = std::chrono::duration<float>(t_end - t_start).count();
             LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
             int total_seconds = (int)(t_total * n_chunk);
@@ -1824,7 +1824,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
         p_diff_ptr += n_ctx - 1 - first;
         kld_ptr += n_ctx - 1 - first;

-        LOG("%4d", i+1);
+        LOG("%4d", ich + 1);

         auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
         const double ppl_val = exp(log_ppl.first);
@@ -3,3 +3,12 @@ add_executable(${TARGET} run.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+# TMP
+if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    target_compile_options(${TARGET} PRIVATE -Wno-shadow)
+
+    if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+        target_compile_options(${TARGET} PRIVATE -Wno-shadow-field-in-constructor)
+    endif()
+endif()
@@ -122,9 +122,9 @@ struct slot_params {
             samplers.emplace_back(common_sampler_type_to_str(sampler));
         }

-        json lora = json::array();
-        for (size_t i = 0; i < this->lora.size(); ++i) {
-            lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
+        json json_lora = json::array();
+        for (size_t i = 0; i < lora.size(); ++i) {
+            json_lora.push_back({{"id", i}, {"scale", lora[i].scale}});
         }

         return json {
@@ -167,7 +167,7 @@ struct slot_params {
             {"speculative.p_min", speculative.p_min},
             {"timings_per_token", timings_per_token},
             {"post_sampling_probs", post_sampling_probs},
-            {"lora", lora},
+            {"lora", json_lora},
         };
     }
 };
@@ -200,7 +200,7 @@ struct server_task {
     // used by SERVER_TASK_TYPE_SET_LORA
     std::vector<common_adapter_lora_info> set_lora;

-    server_task(server_task_type type) : type(type) {}
+    server_task(server_task_type type_) : type(type_) {}

     static slot_params params_from_json_cmpl(
             const llama_context * ctx,
@@ -1641,7 +1641,7 @@ struct server_context {

     llama_context_params cparams_dft;

-    llama_batch batch = {};
+    llama_batch batch_main = {};

     bool clean_kv_cache = true;
     bool add_bos_token = true;
@@ -1676,7 +1676,7 @@ struct server_context {
             llama_batch_free(slot.batch_spec);
         }

-        llama_batch_free(batch);
+        llama_batch_free(batch_main);
     }

     bool load_model(const common_params & params) {
@@ -1797,7 +1797,7 @@ struct server_context {
             const int32_t n_batch = llama_n_batch(ctx);

             // only a single seq_id per token is needed
-            batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
+            batch_main = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
         }

         metrics.init();
@@ -2655,7 +2655,7 @@ struct server_context {
         }

         // start populating the batch for this iteration
-        common_batch_clear(batch);
+        common_batch_clear(batch_main);

         // track if given slot can be batched with slots already in the batch
         server_slot * slot_batched = nullptr;
@@ -2673,9 +2673,9 @@ struct server_context {
                 continue;
             }

-            slot.i_batch = batch.n_tokens;
+            slot.i_batch = batch_main.n_tokens;

-            common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
+            common_batch_add(batch_main, slot.sampled, slot.n_past, { slot.id }, true);

             slot.n_past += 1;

@@ -2692,7 +2692,7 @@ struct server_context {
         int32_t n_ubatch = llama_n_ubatch(ctx);

         // next, batch any pending prompts without exceeding n_batch
-        if (params_base.cont_batching || batch.n_tokens == 0) {
+        if (params_base.cont_batching || batch_main.n_tokens == 0) {
             for (auto & slot : slots) {
                 // check if we can batch this slot with the previous one
                 if (slot.is_processing()) {
@@ -2858,7 +2858,7 @@ struct server_context {
                 // non-causal tasks require to fit the entire prompt in the physical batch
                 if (slot.is_non_causal()) {
                     // cannot fit the prompt in the current batch - will try next iter
-                    if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
+                    if (batch_main.n_tokens + slot.n_prompt_tokens > n_batch) {
                         continue;
                     }
                 }
@@ -2878,11 +2878,11 @@ struct server_context {
                 slot.cache_tokens.resize(slot.n_past);

                 // add prompt tokens for processing in the current batch
-                while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
+                while (slot.n_past < slot.n_prompt_tokens && batch_main.n_tokens < n_batch) {
                     // without pooling, we want to output the embeddings for all the tokens in the batch
                     const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;

-                    common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd);
+                    common_batch_add(batch_main, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd);

                     if (slot.params.cache_prompt) {
                         slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
@@ -2892,13 +2892,13 @@ struct server_context {
                     slot.n_past++;
                 }

-                SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
+                SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch_main.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);

                 // entire prompt has been processed
                 if (slot.n_past == slot.n_prompt_tokens) {
                     slot.state = SLOT_STATE_DONE_PROMPT;

-                    GGML_ASSERT(batch.n_tokens > 0);
+                    GGML_ASSERT(batch_main.n_tokens > 0);

                     common_sampler_reset(slot.smpl);

@@ -2908,27 +2908,27 @@ struct server_context {
                     }

                     // extract the logits only for the last token
-                    batch.logits[batch.n_tokens - 1] = true;
+                    batch_main.logits[batch_main.n_tokens - 1] = true;

                     slot.n_decoded = 0;
-                    slot.i_batch = batch.n_tokens - 1;
+                    slot.i_batch = batch_main.n_tokens - 1;

-                    SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens);
+                    SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch_main.n_tokens);
                 }
             }

-            if (batch.n_tokens >= n_batch) {
+            if (batch_main.n_tokens >= n_batch) {
                 break;
             }
         }
     }

-    if (batch.n_tokens == 0) {
+    if (batch_main.n_tokens == 0) {
         SRV_WRN("%s", "no tokens to decode\n");
         return;
     }

-    SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);
+    SRV_DBG("decoding batch, n_tokens = %d\n", batch_main.n_tokens);

     if (slot_batched) {
         // make sure we're in the right embedding mode
@ -2938,17 +2938,17 @@ struct server_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
// process the created batch of tokens
|
// process the created batch of tokens
|
||||||
for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
|
for (int32_t i_batch = 0; i_batch < batch_main.n_tokens; i_batch += n_batch) {
|
||||||
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
|
const int32_t n_tokens = std::min(n_batch, batch_main.n_tokens - i_batch);
|
||||||
|
|
||||||
llama_batch batch_view = {
|
llama_batch batch_view = {
|
||||||
n_tokens,
|
n_tokens,
|
||||||
batch.token + i,
|
batch_main.token + i_batch,
|
||||||
nullptr,
|
nullptr,
|
||||||
batch.pos + i,
|
batch_main.pos + i_batch,
|
||||||
batch.n_seq_id + i,
|
batch_main.n_seq_id + i_batch,
|
||||||
batch.seq_id + i,
|
batch_main.seq_id + i_batch,
|
||||||
batch.logits + i,
|
batch_main.logits + i_batch,
|
||||||
};
|
};
|
||||||
|
|
||||||
const int ret = llama_decode(ctx, batch_view);
|
const int ret = llama_decode(ctx, batch_view);
|
||||||
|
@ -2957,7 +2957,7 @@ struct server_context {
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
if (n_batch == 1 || ret < 0) {
|
if (n_batch == 1 || ret < 0) {
|
||||||
// if you get here, it means the KV cache is full - try increasing it via the context size
|
// if you get here, it means the KV cache is full - try increasing it via the context size
|
||||||
SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
|
SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i_batch = %d, n_batch = %d, ret = %d\n", i_batch, n_batch, ret);
|
||||||
for (auto & slot : slots) {
|
for (auto & slot : slots) {
|
||||||
slot.release();
|
slot.release();
|
||||||
send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size.");
|
send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size.");
|
||||||
|
@ -2967,15 +2967,15 @@ struct server_context {
|
||||||
|
|
||||||
// retry with half the batch size to try to find a free slot in the KV cache
|
// retry with half the batch size to try to find a free slot in the KV cache
|
||||||
n_batch /= 2;
|
n_batch /= 2;
|
||||||
i -= n_batch;
|
i_batch -= n_batch;
|
||||||
|
|
||||||
SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
|
SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i_batch = %d, n_batch = %d, ret = %d\n", i_batch, n_batch, ret);
|
||||||
|
|
||||||
continue; // continue loop of n_batch
|
continue; // continue loop of n_batch
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto & slot : slots) {
|
for (auto & slot : slots) {
|
||||||
if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
|
if (slot.i_batch < (int) i_batch || slot.i_batch >= (int) (i_batch + n_tokens)) {
|
||||||
continue; // continue loop of slots
|
continue; // continue loop of slots
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3001,7 +3001,7 @@ struct server_context {
|
||||||
continue; // continue loop of slots
|
continue; // continue loop of slots
|
||||||
}
|
}
|
||||||
|
|
||||||
const int tok_idx = slot.i_batch - i;
|
const int tok_idx = slot.i_batch - i_batch;
|
||||||
|
|
||||||
llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
|
llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
|
||||||
|
|
||||||
|
@ -3687,8 +3687,8 @@ int main(int argc, char ** argv) {
|
||||||
} else {
|
} else {
|
||||||
// multiple results (multitask)
|
// multiple results (multitask)
|
||||||
json arr = json::array();
|
json arr = json::array();
|
||||||
for (auto & res : results) {
|
for (auto & result : results) {
|
||||||
arr.push_back(res->to_json());
|
arr.push_back(result->to_json());
|
||||||
}
|
}
|
||||||
res_ok(res, arr);
|
res_ok(res, arr);
|
||||||
}
|
}
|
||||||
|
@ -3702,8 +3702,8 @@ int main(int argc, char ** argv) {
|
||||||
ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool {
|
ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool {
|
||||||
json res_json = result->to_json();
|
json res_json = result->to_json();
|
||||||
if (res_json.is_array()) {
|
if (res_json.is_array()) {
|
||||||
for (const auto & res : res_json) {
|
for (const auto & item : res_json) {
|
||||||
if (!server_sent_event(sink, "data", res)) {
|
if (!server_sent_event(sink, "data", item)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3973,9 +3973,9 @@ int main(int argc, char ** argv) {
|
||||||
std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
|
std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
|
||||||
|
|
||||||
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
||||||
for (auto & res : results) {
|
for (auto & result : results) {
|
||||||
GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
|
GGML_ASSERT(dynamic_cast<server_task_result_embd*>(result.get()) != nullptr);
|
||||||
responses.push_back(res->to_json());
|
responses.push_back(result->to_json());
|
||||||
}
|
}
|
||||||
}, [&](const json & error_data) {
|
}, [&](const json & error_data) {
|
||||||
res_error(res, error_data);
|
res_error(res, error_data);
|
||||||
|
@ -4063,9 +4063,9 @@ int main(int argc, char ** argv) {
|
||||||
std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
|
std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
|
||||||
|
|
||||||
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
|
||||||
for (auto & res : results) {
|
for (auto & result : results) {
|
||||||
GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
|
GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(result.get()) != nullptr);
|
||||||
responses.push_back(res->to_json());
|
responses.push_back(result->to_json());
|
||||||
}
|
}
|
||||||
}, [&](const json & error_data) {
|
}, [&](const json & error_data) {
|
||||||
res_error(res, error_data);
|
res_error(res, error_data);
|
||||||
|
|
|
@@ -129,15 +129,15 @@ static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_
     if (p.is_string()) {
         auto s = p.template get<std::string>();
 
-        llama_tokens p;
+        llama_tokens ids;
         if (first) {
-            p = common_tokenize(vocab, s, add_special, parse_special);
+            ids = common_tokenize(vocab, s, add_special, parse_special);
             first = false;
         } else {
-            p = common_tokenize(vocab, s, false, parse_special);
+            ids = common_tokenize(vocab, s, false, parse_special);
         }
 
-        prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+        prompt_tokens.insert(prompt_tokens.end(), ids.begin(), ids.end());
     } else {
         if (first) {
             first = false;

@@ -110,9 +110,8 @@ int main(int argc, char ** argv) {
     llama_token new_token_id;
     while (true) {
         // check if we have enough space in the context to evaluate this batch
-        int n_ctx = llama_n_ctx(ctx);
         int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
-        if (n_ctx_used + batch.n_tokens > n_ctx) {
+        if (n_ctx_used + batch.n_tokens > (int) llama_n_ctx(ctx)) {
             printf("\033[0m\n");
             fprintf(stderr, "context size exceeded\n");
             exit(0);

@@ -544,26 +544,26 @@ int main(int argc, char ** argv) {
         for (int is = 0; is < (int) sa.size(); ++is) {
             const llama_token id = cur_p->data[is].id;
 
-            const int s = sa[is];
+            const int sd = sa[is];
 
-            common_sampler_accept(drafts[s].smpl, id, true);
+            common_sampler_accept(drafts[sd].smpl, id, true);
 
-            drafts[s].tokens.push_back(id);
-            // save cur_p.data into drafts[s].dists
-            drafts[s].dists.push_back({cur_p->data, cur_p->data + cur_p->size});
+            drafts[sd].tokens.push_back(id);
+            // save cur_p.data into drafts[sd].dists
+            drafts[sd].dists.push_back({cur_p->data, cur_p->data + cur_p->size});
 
             // add unique drafted tokens to the target batch
-            drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
+            drafts[sd].i_batch_tgt.push_back(batch_tgt.n_tokens);
 
-            common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
+            common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { sd }, true);
 
             // add the token to the batch for batched decoding with the draft model
-            drafts[s].i_batch_dft = batch_dft.n_tokens;
+            drafts[sd].i_batch_dft = batch_dft.n_tokens;
 
-            common_batch_add(batch_dft, id, n_past_cur, { s }, true);
+            common_batch_add(batch_dft, id, n_past_cur, { sd }, true);
 
             if (batch_tgt.n_tokens > n_draft) {
-                drafts[s].drafting = false;
+                drafts[sd].drafting = false;
             }
         }
     }

@@ -323,7 +323,7 @@ extern "C" {
     // Utils
     //
 
-    struct ggml_backend_graph_copy {
+    struct ggml_backend_graph_copy_state {
         ggml_backend_buffer_t buffer;
         struct ggml_context * ctx_allocated;
         struct ggml_context * ctx_unallocated;
@@ -331,8 +331,8 @@ extern "C" {
     };
 
     // Copy a graph to a different backend
-    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
-    GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
+    GGML_API struct ggml_backend_graph_copy_state ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
+    GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy_state copy);
 
     typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
 

@@ -1724,7 +1724,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_
     }
 }
 
-struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
+struct ggml_backend_graph_copy_state ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
     struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
     struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
     bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
@@ -1805,14 +1805,14 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     };
 }
 
-void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
+void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy_state copy) {
     ggml_backend_buffer_free(copy.buffer);
     ggml_free(copy.ctx_allocated);
     ggml_free(copy.ctx_unallocated);
 }
 
 bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
-    struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
+    struct ggml_backend_graph_copy_state copy = ggml_backend_graph_copy(backend2, graph);
     if (copy.buffer == NULL) {
         return false;
     }
 
@@ -55,7 +55,7 @@ struct llama_adapter_lora_weight {
     }
 
     llama_adapter_lora_weight() = default;
-    llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
+    llama_adapter_lora_weight(struct ggml_tensor * a_, struct ggml_tensor * b_) : a(a_), b(b_) {}
 };
 
 struct llama_adapter_lora {

@@ -1443,7 +1443,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 };
 
-LLM_KV::LLM_KV(llm_arch arch) : arch(arch) {}
+LLM_KV::LLM_KV(llm_arch arch_) : arch(arch_) {}
 
 std::string LLM_KV::operator()(llm_kv kv) const {
     return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));

@@ -374,7 +374,7 @@ struct LLM_TN_IMPL {
 };
 
 struct LLM_TN {
-    LLM_TN(llm_arch arch) : arch(arch) {}
+    LLM_TN(llm_arch arch_) : arch(arch_) {}
 
     llm_arch arch;
 

@@ -7,9 +7,9 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
     // clear empty sequences
     // the previous ubatch is assumed to be gone,
     // so nothing should refer to values in these sequences anymore.
-    for (size_t i = seq.size(); i-- > 0;) {
-        if (seq[i].length == 0) {
-            seq.pop_back();
+    for (size_t i = seqs.size(); i-- > 0;) {
+        if (seqs[i].length == 0) {
+            seqs.pop_back();
         } else {
             break;
         }
@@ -36,48 +36,48 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
 }
 
 void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & seq, size_t length) {
-    GGML_ASSERT(batch != nullptr);
+    GGML_ASSERT(batch_ptr != nullptr);
     GGML_ASSERT(length <= seq.length);
     // Can only add sequences of equal lengths to a batch,
     // otherwise it isn't clear to which sequence a token belongs
     GGML_ASSERT(seq.n_seq_id == 0 || ubatch.n_seqs == 0 || length == (size_t) ubatch.n_tokens / ubatch.n_seqs);
     GGML_ASSERT((seq.n_seq_id != 0) == ubatch.equal_seqs);
     // NOTE: loops are separated for cache-friendliness
-    if (batch->token) {
+    if (batch_ptr->token) {
         if (ubatch.equal_seqs) {
             for (size_t i = 0; i < length; ++i) {
-                ubatch.token[ubatch.n_tokens + i] = batch->token[ids[seq.offset + i]];
+                ubatch.token[ubatch.n_tokens + i] = batch_ptr->token[ids[seq.offset + i]];
             }
         } else {
             // simple split
-            ubatch.token = batch->token + seq.offset;
+            ubatch.token = batch_ptr->token + seq.offset;
         }
     } else {
         ubatch.token = nullptr;
     }
-    if (batch->embd) {
+    if (batch_ptr->embd) {
         if (ubatch.equal_seqs) {
             for (size_t i = 0; i < length; ++i) {
                 memcpy(
                     ubatch.embd + (n_embd * (ubatch.n_tokens + i)),
-                    batch->embd + (n_embd * ids[seq.offset + i]),
+                    batch_ptr->embd + (n_embd * ids[seq.offset + i]),
                     n_embd * sizeof(float)
                 );
             }
         } else {
             // simple split
-            ubatch.embd = batch->embd + (n_embd * seq.offset);
+            ubatch.embd = batch_ptr->embd + (n_embd * seq.offset);
         }
     } else {
         ubatch.embd = nullptr;
     }
     if (ubatch.equal_seqs) {
         for (size_t i = 0; i < length; ++i) {
-            ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
+            ubatch.pos[ubatch.n_tokens + i] = batch_ptr->pos[ids[seq.offset + i]];
         }
     } else {
         // simple split
-        ubatch.pos = batch->pos + seq.offset;
+        ubatch.pos = batch_ptr->pos + seq.offset;
     }
     if (ubatch.equal_seqs) {
         ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id;
@@ -86,15 +86,15 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s
         }
     } else {
         // simple split
-        if (batch->n_seq_id) {
-            ubatch.n_seq_id = batch->n_seq_id + seq.offset;
+        if (batch_ptr->n_seq_id) {
+            ubatch.n_seq_id = batch_ptr->n_seq_id + seq.offset;
         } else {
             for (size_t i = 0; i < length; ++i) {
                 ubatch.n_seq_id[ubatch.n_seqs + i] = 1;
             }
         }
-        if (batch->seq_id) {
-            ubatch.seq_id = batch->seq_id + seq.offset;
+        if (batch_ptr->seq_id) {
+            ubatch.seq_id = batch_ptr->seq_id + seq.offset;
         }
     }
     if (logits_all) {
@@ -102,17 +102,17 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s
             ubatch.output[ubatch.n_tokens + i] = 1;
             out_ids.push_back(ids[seq.offset + i]);
         }
-    } else if (batch->logits) {
+    } else if (batch_ptr->logits) {
         if (ubatch.equal_seqs) {
             for (size_t i = 0; i < length; ++i) {
                 size_t id = ids[seq.offset + i];
-                int8_t is_output = batch->logits[id];
+                int8_t is_output = batch_ptr->logits[id];
                 ubatch.output[ubatch.n_tokens + i] = is_output;
                 if (is_output) { out_ids.push_back(id); }
             }
         } else {
             // simple split
-            ubatch.output = batch->logits + seq.offset;
+            ubatch.output = batch_ptr->logits + seq.offset;
             for (size_t i = 0; i < length; ++i) {
                 if (ubatch.output[i] != 0) { out_ids.push_back(seq.offset + i); }
             }
@@ -139,12 +139,12 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s
 
 llama_ubatch llama_sbatch::split_simple(size_t n_ubatch) {
     n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
-    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
+    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr);
     ubatch.equal_seqs = false;
-    if (!seq.empty()) {
-        llama_sbatch_seq & s = seq[0];
+    if (!seqs.empty()) {
+        llama_sbatch_seq & s = seqs[0];
         size_t length = s.length < n_ubatch ? s.length : n_ubatch;
-        GGML_ASSERT(seq.size() == 1 && s.n_seq_id == 0); // don't mix with other splits
+        GGML_ASSERT(seqs.size() == 1 && s.n_seq_id == 0); // don't mix with other splits
         add_seq_to_ubatch(ubatch, s, length);
     }
     return ubatch;
@@ -152,15 +152,15 @@ llama_ubatch llama_sbatch::split_simple(size_t n_ubatch) {
 
 llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) {
     n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
-    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
-    if (!seq.empty()) {
+    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr);
+    if (!seqs.empty()) {
         size_t length = 0;
         size_t n_tokens_in_ubatch = 0;
-        GGML_ASSERT(seq[0].n_seq_id > 0); // should not be mixed with simple splits
+        GGML_ASSERT(seqs[0].n_seq_id > 0); // should not be mixed with simple splits
         // smallest first, because it's easier to split this way;
         // starting from the end to pop in constant time.
-        for (size_t i = seq.size(); i-- > 0;) {
-            llama_sbatch_seq & s = seq[i];
+        for (size_t i = seqs.size(); i-- > 0;) {
+            llama_sbatch_seq & s = seqs[i];
             GGML_ASSERT(s.length > 0);
             if (length == 0) {
                 length = s.length < n_ubatch ? s.length : n_ubatch;
@@ -179,9 +179,9 @@ llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) {
 
 llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
     n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
-    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
-    if (!seq.empty()) {
-        llama_sbatch_seq & s = seq[seq.size() - 1];
+    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr);
+    if (!seqs.empty()) {
+        llama_sbatch_seq & s = seqs.back();
         size_t length = s.length < n_ubatch ? s.length : n_ubatch;
         GGML_ASSERT(s.n_seq_id > 0); // should not be mixed with simple splits
         add_seq_to_ubatch(ubatch, s, length);
@@ -189,23 +189,24 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
     return ubatch;
 }
 
-void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd_cur, bool simple_split, bool logits_all_cur) {
     GGML_ASSERT(batch.n_tokens >= 0);
-    this->batch = &batch;
-    this->n_embd = n_embd;
-    this->logits_all = logits_all;
+
+    batch_ptr = &batch;
+    n_embd = n_embd_cur;
+    logits_all = logits_all_cur;
 
     n_tokens = batch.n_tokens;
     ids.resize(n_tokens);
     out_ids.clear();
-    // TODO: reserve out_ids and seq
+    // TODO: reserve out_ids and seqs
 
     for (size_t i = 0; i < n_tokens; ++i) {
         ids[i] = i;
     }
     if (simple_split) {
-        seq.resize(1);
-        llama_sbatch_seq & s = seq[0];
+        seqs.resize(1);
+        llama_sbatch_seq & s = seqs[0];
         s.n_seq_id = 0;
         s.seq_id = nullptr;
         s.offset = 0;
@@ -259,11 +260,11 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
             }
         }
         llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1};
-        seqs.push_back(new_seq);
-        last_seq = &seq.back();
+        seqs.push_back(new_seq);
+        last_seq = &seqs.back();
     }
     // keep shared prompts first at the end, then sort by length descending.
-    std::sort(seq.begin(), seq.end(),
+    std::sort(seqs.begin(), seqs.end(),
         [](llama_sbatch_seq & a, llama_sbatch_seq & b) {
             if (a.n_seq_id == b.n_seq_id) {
                 return a.length > b.length;

@@ -45,9 +45,9 @@ struct llama_sbatch {
     std::vector<size_t> ids;
     // batch indices of the output
     std::vector<size_t> out_ids;
-    std::vector<llama_sbatch_seq> seq;
+    std::vector<llama_sbatch_seq> seqs;
 
-    const llama_batch * batch = nullptr;
+    const llama_batch * batch_ptr = nullptr;
 
     // buffers for the ubatch
     std::vector<llama_token> ubatch_token;

@@ -916,8 +916,8 @@ struct llama_data_write {
         write(&n_seq_id, sizeof(n_seq_id));
 
         if (n_seq_id) {
-            for (auto seq_id : cell.seq_id) {
-                write(&seq_id, sizeof(seq_id));
+            for (auto sid : cell.seq_id) {
+                write(&sid, sizeof(sid));
             }
         }
     }

@@ -15,8 +15,8 @@
 #include <set>
 
 struct llama_context {
-    llama_context(const llama_model & model)
-        : model(model)
+    llama_context(const llama_model & model_)
+        : model(model_)
         , t_start_us(model.t_start_us)
         , t_load_us(model.t_load_us) {}
 

@@ -490,7 +490,7 @@ const char * llama_grammar_parser::parse_sequence(
     pos = parse_space(pos + 1, is_nested);
 
     if (is_digit_char(*pos)) {
-        const char * int_end = parse_int(pos);
+        int_end = parse_int(pos);
         max_times = std::stoul(std::string(pos, int_end - pos));
         pos = parse_space(int_end, is_nested);
     }

@@ -17,7 +17,7 @@ struct llama_logger_state {
 
 static llama_logger_state g_logger_state;
 
-time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+time_meas::time_meas(int64_t & t_acc_, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc_) {}
 
 time_meas::~time_meas() {
     if (t_start_us >= 0) {

@@ -58,12 +58,12 @@ struct llama_kv_cache {
     std::vector<ggml_backend_buffer_ptr> bufs;
 
     size_t total_size() const {
-        size_t size = 0;
+        size_t size_all = 0;
         for (const auto & buf : bufs) {
-            size += ggml_backend_buffer_get_size(buf.get());
+            size_all += ggml_backend_buffer_get_size(buf.get());
         }
 
-        return size;
+        return size_all;
     }
 
     // TODO: better data structures to reduce the cost of this operation

@@ -454,8 +454,8 @@ struct llama_mlock::impl {
         return (size_t) sysconf(_SC_PAGESIZE);
     }
 
-    bool raw_lock(const void * addr, size_t size) const {
-        if (!mlock(addr, size)) {
+    bool raw_lock(const void * lock_addr, size_t lock_len) const {
+        if (!mlock(lock_addr, lock_len)) {
             return true;
         }
 
@@ -475,12 +475,12 @@ struct llama_mlock::impl {
         if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
             suggest = false;
         }
-        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
+        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + lock_len)) {
             suggest = false;
         }
 
         LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
-            size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
+            lock_len, size, errmsg, suggest ? MLOCK_SUGGESTION : "");
         return false;
     }
 
@@ -535,7 +535,7 @@ struct llama_mlock::impl {
         return (size_t) 65536;
     }
 
-    bool raw_lock(const void * addr, size_t len) const {
+    bool raw_lock(const void * lock_addr, size_t lock_len) const {
         LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
         return false;
     }

@@ -413,7 +413,7 @@ namespace GGUFMeta {
 template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
 
-llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
+llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap_cur, bool check_tensors_cur, const struct llama_model_kv_override * param_overrides_p) {
     int trace = 0;
     if (getenv("LLAMA_TRACE")) {
         trace = atoi(getenv("LLAMA_TRACE"));
@@ -626,11 +626,11 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
 
     if (!llama_mmap::SUPPORTED) {
         LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
-        use_mmap = false;
+        use_mmap_cur = false;
     }
 
-    this->use_mmap = use_mmap;
-    this->check_tensors = check_tensors;
+    use_mmap = use_mmap_cur;
+    check_tensors = check_tensors_cur;
 }
 
 std::string llama_model_loader::get_arch_name() const {
@@ -887,15 +887,15 @@ bool llama_model_loader::load_all_data(
 
     // If the backend is supported, create pinned memory buffers and events for synchronisation.
     for (size_t idx = 0; idx < n_buffers; ++idx) {
-        auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
-        if (!buf) {
+        auto * buf_new = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
+        if (!buf_new) {
             LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
                 ggml_backend_dev_name(dev));
             return nullptr;
         }
 
-        host_buffers.emplace_back(buf);
-        host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));
+        host_buffers.emplace_back(buf_new);
+        host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf_new));
 
         auto * event = ggml_backend_event_new(dev);
         if (!event) {

@@ -31,7 +31,7 @@ struct llama_model_loader {
 
     ggml_tensor * tensor;
 
-    llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+    llama_tensor_weight(const llama_file * file, uint16_t idx_, const struct gguf_context * gguf_ctx, ggml_tensor * tensor_) : idx(idx_), tensor(tensor_) {
         const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor));
         if (tensor_idx < 0) {
             throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));
@@ -90,7 +90,7 @@ struct llama_model_loader {
     size_t size_data = 0;
     std::vector<std::pair<size_t, size_t>> mmaps_used;
 
-    llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p);
+    llama_model_loader(const std::string & fname, bool use_mmap_cur, bool check_tensors_cur, const struct llama_model_kv_override * param_overrides_p);
 
     template<typename T>
     typename std::enable_if<std::is_integral<T>::value, bool>::type

@@ -311,9 +311,9 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_m
         ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
     if (ggml_backend_split_buffer_type_fn) {
         size_t dev_index = [&]() {
-            auto * reg = ggml_backend_dev_backend_reg(dev);
-            for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
-                if (ggml_backend_reg_dev_get(reg, i) == dev) {
+            ggml_backend_reg_t reg_dev = ggml_backend_dev_backend_reg(dev);
+            for (size_t i = 0; i < ggml_backend_reg_dev_count(reg_dev); ++i) {
+                if (ggml_backend_reg_dev_get(reg_dev, i) == dev) {
                     return i;
                 }
             }
@@ -340,7 +340,8 @@ struct llama_model::impl {
 
     size_t n_bytes = 0;
 
-    std::string desc_str;
+    std::string name_str = "n/a";
+    std::string desc_str = "n/a";
 
     // model memory mapped files
     llama_mmaps mappings;
@@ -368,7 +369,7 @@ struct llama_model::impl {
     std::vector<layer_dev> dev_layer;
 };
 
-llama_model::llama_model(const struct llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
+llama_model::llama_model(const struct llama_model_params & params_) : params(params_), pimpl(std::make_unique<impl>()) {
 }
 
 llama_model::~llama_model() {}
@@ -390,17 +391,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
-        enum gguf_type type = gguf_get_kv_type(ctx, i);
-        if (type == GGUF_TYPE_ARRAY) {
+        gguf_type type_cur = gguf_get_kv_type(ctx, i);
+        if (type_cur == GGUF_TYPE_ARRAY) {
             continue;
         }
-        const char * name = gguf_get_key(ctx, i);
-        const std::string value = gguf_kv_to_str(ctx, i);
-        gguf_kv.emplace(name, value);
+        const char * name_cur = gguf_get_key(ctx, i);
+        const std::string value_cur = gguf_kv_to_str(ctx, i);
+        gguf_kv.emplace(name_cur, value_cur);
     }
 
     // get general kv
-    ml.get_key(LLM_KV_GENERAL_NAME, name, false);
+    ml.get_key(LLM_KV_GENERAL_NAME, pimpl->name_str, false);
 
     // everything past this point is not vocab-related
     if (hparams.vocab_only) {
@@ -1303,7 +1304,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
-            return {cpu_dev, &pimpl->cpu_buft_list};
+            return { cpu_dev, &pimpl->cpu_buft_list };
         }
         const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
         auto * dev = devices.at(layer_gpu);
@@ -1333,13 +1334,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
-            ggml_init_params params = {
+            ggml_init_params params_cur = {
                 /*.mem_size =*/ ctx_size,
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc =*/ true,
             };
 
-            ggml_context * ctx = ggml_init(params);
+            ggml_context * ctx = ggml_init(params_cur);
             if (!ctx) {
                 throw std::runtime_error(format("failed to create ggml context"));
             }
@@ -1452,7 +1453,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     // avoid using a host buffer when using mmap
     auto * buft_dev = ggml_backend_buft_get_device(buft);
     if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
-        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
         buft = ggml_backend_dev_buffer_type(cpu_dev);
     }
 
@@ -1557,31 +1557,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
     for (int i = 0; i < n_layer; ++i) {
         auto & layer = layers[i];
-        const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
-        const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
-        const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
-        const int64_t n_ff = hparams.n_ff(i);
-        const int64_t n_head = hparams.n_head(i);
-        const int64_t n_head_kv = hparams.n_head_kv(i);
+        const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
+        const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
+        const int64_t n_embd_gqa_i = hparams.n_embd_v_gqa(i);
+        const int64_t n_ff_i = hparams.n_ff(i);
+        const int64_t n_head_i = hparams.n_head(i);
+        const int64_t n_head_kv_i = hparams.n_head_kv(i);
 
-        if (n_head_kv == 0 && n_head > 0) {
+        if (n_head_kv_i == 0 && n_head_i > 0) {
             // linear attention for DeciLMCausalModel
             layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
             layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
         }
-        else if (n_head_kv > 0) {
+        else if (n_head_kv_i > 0) {
             layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
+            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
+            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
         }
 
         // optional bias tensors
         layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
-        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa_i}, TENSOR_NOT_REQUIRED);
+        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa_i}, TENSOR_NOT_REQUIRED);
         layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
 
         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -1594,14 +1594,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
             }
 
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff_i}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff_i, n_embd}, 0);
+            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_i}, 0);
 
             // optional MLP bias
-            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff_i}, TENSOR_NOT_REQUIRED);
             layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff_i}, TENSOR_NOT_REQUIRED);
         }
     } break;
     case LLM_ARCH_MINICPM3:
@@ -2653,23 +2653,23 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
 
         for (int i = 0; i < n_layer; ++i) {
-            const int64_t n_head = hparams.n_head(i);
-            const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
-            const int64_t n_ff = hparams.n_ff(i);
+            const int64_t n_head_i = hparams.n_head(i);
+            const int64_t n_head_qkv_i = 2*hparams.n_head_kv(i) + n_head_i;
+            const int64_t n_ff_i = hparams.n_ff(i);
 
             auto & layer = layers[i];
 
             layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
+            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv_i*n_embd_head_k}, 0);
             layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
             layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head_i*n_embd_head_k, n_embd}, 0);
 
             layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff_i}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_i, n_embd}, 0);
+            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_i}, 0);
         }
     } break;
     case LLM_ARCH_GPTNEOX:
@@ -3167,11 +3167,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
     output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
 
     const int time_mix_extra_dim = hparams.time_mix_extra_dim;
     const int time_decay_extra_dim = hparams.time_decay_extra_dim;
     const int head_size = hparams.wkv_head_size;
     const int attn_hidden_size = n_embd;
-    const int n_head_kv = hparams.n_head_kv();
     int attn_key_value_size;
     if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
         attn_key_value_size = attn_hidden_size;
@@ -3254,7 +3254,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
     // posnet
     {
-        const int64_t n_embd = hparams.posnet.n_embd;
+        const int64_t n_embd_cur = hparams.posnet.n_embd;
 
         for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
             auto & layer = layers[i].posnet;
@@ -3274,39 +3274,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case 3:
         case 4:
             {
-                layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
-                layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
+                layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd_cur}, 0);
+                layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd_cur}, 0);
 
-                layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
-                layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
+                layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd_cur, n_embd_cur}, 0);
+                layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd_cur}, 0);
 
-                layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
-                layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
+                layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd_cur}, 0);
+                layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd_cur}, 0);
 
-                layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
-                layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
+                layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd_cur, n_embd_cur}, 0);
+                layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd_cur}, 0);
             } break;
         case 2:
             {
-                layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
-                layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
+                layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd_cur}, 0);
+                layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd_cur}, 0);
 
-                layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
-                layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
+                layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd_cur, n_embd_cur}, 0);
+                layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd_cur}, 0);
 
-                layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
-                layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
+                layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd_cur, n_embd_cur}, 0);
+                layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd_cur}, 0);
 
-                layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
-                layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
+                layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd_cur, n_embd_cur}, 0);
+                layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd_cur}, 0);
 
-                layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
-                layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
+                layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd_cur, n_embd_cur}, 0);
+                layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd_cur}, 0);
             } break;
         case 5:
             {
-                layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
-                layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
+                layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd_cur}, 0);
+                layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd_cur}, 0);
             } break;
         default: GGML_ABORT("unknown posnet layer");
     };
@@ -3320,29 +3320,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

     // convnext
     {
-        const int64_t n_embd = hparams.convnext.n_embd;
+        const int64_t n_embd_cur = hparams.convnext.n_embd;

         for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
             auto & layer = layers[i].convnext;

-            layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
+            layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd_cur}, 0);
-            layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
+            layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd_cur}, 0);

-            layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
+            layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd_cur}, 0);
-            layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
+            layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd_cur}, 0);

-            layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
+            layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd_cur, n_ff}, 0);
             layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);

-            layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
+            layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd_cur}, 0);
-            layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
+            layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd_cur}, 0);

-            layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
+            layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd_cur}, 0);
         }

         // output
-        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd_cur}, 0);
-        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd_cur}, 0);
     }

     output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
@@ -3601,7 +3601,7 @@ void llama_model::print_info() const {
     }

     // general kv
-    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
+    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, pimpl->name_str.c_str());

     if (arch == LLM_ARCH_DEEPSEEK) {
         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
@@ -3696,8 +3696,8 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const {

 const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
     auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
-            [name](const std::pair<std::string, struct ggml_tensor *> & it) {
-                return it.first == name;
+            [name](const std::pair<std::string, struct ggml_tensor *> & entry) {
+                return entry.first == name;
             });
     if (it == tensors_by_name.end()) {
         return nullptr;
@@ -290,8 +290,6 @@ struct llama_model {
     llm_type type = LLM_TYPE_UNKNOWN;
     llm_arch arch = LLM_ARCH_UNKNOWN;

-    std::string name = "n/a";
-
     llama_hparams hparams = {};
     llama_vocab vocab;
@@ -41,9 +41,9 @@ struct quantize_state_impl {
     // used to figure out if a model shares tok_embd with the output weight
     bool has_output = false;

-    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
-        : model(model)
-        , params(params)
+    quantize_state_impl(const llama_model & model_, const llama_model_quantize_params * params_)
+        : model(model_)
+        , params(params_)
     {}
 };

@@ -130,17 +130,17 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
     };
     const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
-    auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
+    auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name_layer) {
         if (n_expert > 1) {
             // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
             // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
             // for getting the current layer as I initially thought, and we need to resort to parsing the
             // tensor name.
-            if (sscanf(name, "blk.%d.", &i_layer) != 1) {
-                throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
+            if (sscanf(name_layer, "blk.%d.", &i_layer) != 1) {
+                throw std::runtime_error(format("Failed to determine layer for tensor %s", name_layer));
             }
             if (i_layer < 0 || i_layer >= n_layer) {
-                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
+                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name_layer, n_layer));
             }
         }
         return std::make_pair(i_layer, n_layer);
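The layer_info lambda above recovers the layer index by parsing the tensor name, because expert tensors are not laid out consecutively. A stand-alone sketch of that parsing step, assuming nothing beyond the standard library (the helper name and example tensor name are illustrative only, not part of this change):

    // parse_layer_index.cpp - extract the layer index from names like "blk.12.ffn_down.weight"
    #include <cstdio>
    #include <stdexcept>
    #include <string>

    static int parse_layer_index(const std::string & tensor_name, int n_layer) {
        int i_layer = -1;
        // the layer index is encoded as "blk.<N>." at the start of the tensor name
        if (sscanf(tensor_name.c_str(), "blk.%d.", &i_layer) != 1) {
            throw std::runtime_error("failed to determine layer for tensor " + tensor_name);
        }
        if (i_layer < 0 || i_layer >= n_layer) {
            throw std::runtime_error("bad layer index for tensor " + tensor_name);
        }
        return i_layer;
    }

    int main() {
        printf("%d\n", parse_layer_index("blk.12.ffn_down.weight", 32)); // prints 12
        return 0;
    }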
@@ -423,8 +423,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
     int64_t counter = 0;
     size_t new_size = 0;
     bool valid = true;
-    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
-        nrows, n_per_row, imatrix]() {
+    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix]() {
         const int64_t nrows_per_chunk = chunk_size / n_per_row;
         size_t local_size = 0;
         while (true) {

@@ -437,6 +436,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
                 break;
             }
             lock.unlock();

             const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
             size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
             local_size += this_size;

@@ -445,7 +445,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
                 const size_t row_size = ggml_row_size(new_type, n_per_row);
                 void * this_data = (char *) new_data + first_row * row_size;
                 if (!ggml_validate_row_data(new_type, this_data, this_size)) {
-                    std::unique_lock<std::mutex> lock(mutex);
+                    lock.lock();
                     valid = false;
                     break;
                 }
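The compute lambda above is the body of a small worker pool: each thread repeatedly takes the next chunk of rows from a mutex-guarded counter, quantizes it, and folds its local byte count back into the shared total. A simplified, self-contained sketch of that chunking pattern (all names and numbers here are invented for illustration, not the actual quantization code):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <mutex>
    #include <thread>
    #include <vector>

    int main() {
        const int64_t nrows = 1000, nrows_per_chunk = 64;
        std::mutex mutex;
        int64_t counter = 0; // next row to hand out, guarded by mutex
        int64_t total   = 0; // accumulated result, guarded by mutex

        auto worker = [&]() {
            int64_t local = 0;
            while (true) {
                std::unique_lock<std::mutex> lock(mutex);
                const int64_t first_row = counter;
                counter += nrows_per_chunk;
                if (first_row >= nrows) {
                    total += local; // publish the local tally before exiting
                    break;
                }
                lock.unlock();
                const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
                local += this_nrow; // stand-in for the real per-chunk work
            }
        };

        std::vector<std::thread> threads;
        for (int i = 0; i < 4; ++i) threads.emplace_back(worker);
        for (auto & t : threads) t.join();
        printf("%lld\n", (long long) total); // prints 1000
        return 0;
    }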
@@ -589,15 +589,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }

     // make a list of weights
-    std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
-    tensors.reserve(ml.weights_map.size());
+    std::vector<const llama_model_loader::llama_tensor_weight *> tensor_weights;
+    tensor_weights.reserve(ml.weights_map.size());
     for (const auto & it : ml.weights_map) {
-        tensors.push_back(&it.second);
+        tensor_weights.push_back(&it.second);
     }

     // keep_split requires that the weights are sorted by split index
     if (params->keep_split) {
-        std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
+        std::sort(tensor_weights.begin(), tensor_weights.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
             if (a->idx == b->idx) {
                 return a->offs < b->offs;
             }

@@ -605,8 +605,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }

-    for (const auto * it : tensors) {
-        const struct ggml_tensor * tensor = it->tensor;
+    for (const auto * tw : tensor_weights) {
+        const ggml_tensor * tensor = tw->tensor;

         const std::string name = ggml_get_name(tensor);

@@ -650,17 +650,17 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

     // Assume split index is continuous
     if (params->keep_split) {
-        for (const auto * it : tensors) {
-            n_split = std::max(uint16_t(it->idx + 1), n_split);
+        for (const auto * tw : tensor_weights) {
+            n_split = std::max(uint16_t(tw->idx + 1), n_split);
         }
     }
     std::vector<gguf_context_ptr> ctx_outs(n_split);
     ctx_outs[0] = std::move(ctx_out);

-    // populate the original tensors so we get an initial meta data
-    for (const auto * it : tensors) {
-        uint16_t i_split = params->keep_split ? it->idx : 0;
-        struct ggml_tensor * tensor = it->tensor;
+    // populate the original tensor_weights so we get an initial meta data
+    for (const auto * tw : tensor_weights) {
+        uint16_t i_split = params->keep_split ? tw->idx : 0;
+        ggml_tensor * tensor = tw->tensor;
         if (!ctx_outs[i_split]) {
             ctx_outs[i_split].reset(gguf_init_empty());
         }

@@ -707,12 +707,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

     const auto tn = LLM_TN(model.arch);
     new_ofstream(0);
-    for (const auto * it : tensors) {
-        const auto & weight = *it;
-        struct ggml_tensor * tensor = weight.tensor;
-        if (weight.idx != cur_split && params->keep_split) {
+    for (const auto * tw : tensor_weights) {
+        ggml_tensor * tensor = tw->tensor;
+        if (tw->idx != cur_split && params->keep_split) {
             close_ofstream();
-            new_ofstream(weight.idx);
+            new_ofstream(tw->idx);
         }

         const std::string name = ggml_get_name(tensor);
@@ -412,8 +412,8 @@ static void llama_sampler_chain_accept(struct llama_sampler * smpl, llama_token

     time_meas tm(chain->t_sample_us, chain->params.no_perf);

-    for (auto * smpl : chain->samplers) {
-        llama_sampler_accept(smpl, token);
+    for (auto * cur : chain->samplers) {
+        llama_sampler_accept(cur, token);
     }

     chain->n_sample++;

@@ -424,16 +424,16 @@ static void llama_sampler_chain_apply(struct llama_sampler * smpl, llama_token_d

     time_meas tm(chain->t_sample_us, chain->params.no_perf);

-    for (auto * smpl : chain->samplers) {
-        llama_sampler_apply(smpl, cur_p);
+    for (auto * cur : chain->samplers) {
+        llama_sampler_apply(cur, cur_p);
     }
 }

 static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
     auto * chain = (llama_sampler_chain *) smpl->ctx;

-    for (auto * smpl : chain->samplers) {
-        llama_sampler_reset(smpl);
+    for (auto * cur : chain->samplers) {
+        llama_sampler_reset(cur);
     }

     chain->t_sample_us = 0;

@@ -445,8 +445,8 @@ static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampl

     auto * result = llama_sampler_chain_init(chain_src->params);

-    for (auto * smpl : chain_src->samplers) {
-        llama_sampler_chain_add(result, llama_sampler_clone(smpl));
+    for (auto * cur : chain_src->samplers) {
+        llama_sampler_chain_add(result, llama_sampler_clone(cur));
     }

     return result;

@@ -455,8 +455,8 @@ static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampl
 static void llama_sampler_chain_free(struct llama_sampler * smpl) {
     auto * chain = (llama_sampler_chain *) smpl->ctx;

-    for (auto * smpl : chain->samplers) {
-        llama_sampler_free(smpl);
+    for (auto * cur : chain->samplers) {
+        llama_sampler_free(cur);
     }

     delete chain;
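The renames in this file all follow one pattern: the loop variable smpl shadowed the smpl function parameter, so the inner name becomes cur. A minimal illustration of that kind of shadowing and the rename that avoids it (the types and functions here are invented for the example, not the actual llama.cpp declarations):

    #include <cstdio>
    #include <vector>

    struct sampler { int id; };

    // before: the loop variable shadows the parameter, which a shadow warning reports
    // void accept_all(sampler * smpl, const std::vector<sampler *> & chain) {
    //     for (auto * smpl : chain) { /* inner smpl hides the outer one */ }
    // }

    // after: the inner variable gets its own name, so both stay visible
    void accept_all(sampler * smpl, const std::vector<sampler *> & chain) {
        for (auto * cur : chain) {
            printf("outer %d, inner %d\n", smpl->id, cur->id);
        }
    }

    int main() {
        sampler outer{0}, a{1}, b{2};
        std::vector<sampler *> chain = {&a, &b};
        accept_all(&outer, chain);
        return 0;
    }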
@@ -24,41 +24,48 @@
 struct naive_trie {
     naive_trie() : has_value(false), value(0) {
     }
-    void insert(const char * key, size_t len, int32_t value = 0) {
+
+    void insert(const char * key, size_t len, int32_t val = 0) {
         if (len == 0) {
-            this->has_value = true;
-            this->value = value;
+            has_value = true;
+            value = val;

             return;
         }

         char c = key[0];
-        auto res = children.find(c);
-        if (res != children.end()) {
-            res->second.insert(key + 1, len - 1, value);
+        auto child = children.find(c);
+        if (child != children.end()) {
+            child->second.insert(key + 1, len - 1, val);
         } else {
-            auto res = children.insert(std::make_pair(c, naive_trie()));
-            res.first->second.insert(key + 1, len - 1, value);
+            auto child_new = children.insert(std::make_pair(c, naive_trie()));
+            child_new.first->second.insert(key + 1, len - 1, val);
         }
     }

     std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
         if (len == 0 || offset == len) {
             return std::make_pair(key, offset);
         }

         char c = key[offset];
-        auto res = children.find(c);
-        if (res != children.end()) {
-            return res->second.get_longest_prefix(key, len, offset + 1);
+        auto child = children.find(c);
+        if (child != children.end()) {
+            return child->second.get_longest_prefix(key, len, offset + 1);
         }

         return std::make_pair(key, offset);
     }

     const struct naive_trie * traverse(const char c) const {
-        auto res = children.find(c);
-        if (res != children.end()) {
-            return &res->second;
+        auto child = children.find(c);
+        if (child != children.end()) {
+            return &child->second;
         }

         return NULL;
     }

     std::map<char, struct naive_trie> children;
     bool has_value;
     llama_token value;
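The naive_trie above maps byte strings to token ids and is queried byte by byte during tokenization. A small usage sketch, assuming the struct definition from the hunk above is pasted in at the marked spot and with llama_token replaced by a plain int32_t alias so the snippet stands alone:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <map>
    #include <utility>

    using llama_token = int32_t; // stand-in for the real typedef

    // ... the naive_trie definition from the hunk above goes here ...

    int main() {
        naive_trie trie;
        trie.insert("hello", 5, /*val=*/42);
        trie.insert("help", 4, /*val=*/7);

        // longest path match over the raw bytes of the input: "help" is the deepest walk
        const char * text = "helpful";
        auto res = trie.get_longest_prefix(text, strlen(text));
        printf("matched %zu leading bytes of '%s'\n", res.second, text); // prints 4

        // walk the trie one byte at a time
        const naive_trie * node = trie.traverse('h');
        printf("node for 'h' %s\n", node ? "exists" : "missing");
        return 0;
    }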
@@ -108,7 +115,7 @@ struct llm_tokenizer_spm : llm_tokenizer {
 };

 struct llm_tokenizer_spm_session {
-    llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}
+    llm_tokenizer_spm_session(const llama_vocab & vocab_) : vocab(vocab_) {}

     void tokenize(const std::string & text, std::vector<llama_token> & output) {
         // split string into utf8 chars

@@ -408,7 +415,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
 };

 struct llm_tokenizer_bpe_session {
-    llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
+    llm_tokenizer_bpe_session(const llama_vocab & vocab_, const llm_tokenizer_bpe & tokenizer_) : vocab(vocab_), tokenizer(tokenizer_) {}

     static void append(const llama_token token_id, std::vector<llama_token> & output) {
         output.push_back(token_id);

@@ -596,7 +603,7 @@ struct llm_tokenizer_wpm : llm_tokenizer {
 };

 struct llm_tokenizer_wpm_session {
-    llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}
+    llm_tokenizer_wpm_session(const llama_vocab & vocab_) : vocab(vocab_) {}

     void tokenize(const std::string & text, std::vector<llama_token> & output) {
         // normalize and split by whitespace

@@ -775,7 +782,7 @@ struct llm_tokenizer_ugm : llm_tokenizer {
 };

 struct llm_tokenizer_ugm_session {
-    llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
+    llm_tokenizer_ugm_session(const llama_vocab & vocab_, const llm_tokenizer_ugm & tokenizer_) : vocab(vocab_), tokenizer(tokenizer_) {}

     /* This implementation is based on SentencePiece optimized Viterbi algorithm for
      * unigram language models. The general idea is to:

@@ -942,7 +949,7 @@ private:
  */
 struct xcda_array_view {
 public:
-    xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) {
+    xcda_array_view(const uint32_t * xcda_array_, size_t xcda_array_size_) : xcda_array(xcda_array_), xcda_array_size(xcda_array_size_) {
     }
     uint32_t get_base(size_t index) {
         uint32_t packed_node = get_node(index);

@@ -1128,7 +1135,7 @@ struct llm_tokenizer_rwkv : llm_tokenizer {
 };

 struct llm_tokenizer_rwkv_session {
-    llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
+    llm_tokenizer_rwkv_session(const llama_vocab & vocab_, const llm_tokenizer_rwkv & tokenizer_) : vocab(vocab_), tokenizer(tokenizer_) {}

     void tokenize(const std::string & text, std::vector<llama_token> & output) {
         uint32_t position = 0;

@@ -1255,7 +1262,7 @@ struct llama_vocab::impl {

     std::vector<char> precompiled_charsmap;

-    impl(const llama_vocab & vocab) : vocab(vocab) {
+    impl(const llama_vocab & vocab_) : vocab(vocab_) {
     }

     ~impl() = default;

@@ -1278,7 +1285,7 @@ struct llama_vocab::impl {

     llama_token_attr token_get_attr(llama_token id) const;

-    void init_tokenizer(enum llama_vocab_type type);
+    void init_tokenizer();

     void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const;

@@ -1668,7 +1675,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
     }
     GGML_ASSERT(id_to_token.size() == token_to_id.size());

-    init_tokenizer(type);
+    init_tokenizer();

     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (type == LLAMA_VOCAB_TYPE_SPM) {

@@ -2109,7 +2116,7 @@ llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const {
     return id_to_token.at(id).attr;
 }

-void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
+void llama_vocab::impl::init_tokenizer() {
     LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type);

     switch (type) {

@@ -2489,15 +2496,15 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t

     // copy piece chars to output text buffer
     // skip up to 'lstrip' leading spaces before copying
-    auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
-        for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
-            token++;
+    auto _try_copy = [=] (const char * text, size_t size) -> int32_t {
+        for (int32_t i = 0; i < lstrip && size && *text == ' '; ++i) {
+            text++;
             size--;
         }
         if (length < (int32_t)size) {
             return -(int32_t) size;
         }
-        memcpy(buf, token, size);
+        memcpy(buf, text, size);
         return (int32_t) size;
     };
175 src/llama.cpp
@@ -1089,16 +1089,16 @@ struct llm_build_context {

     // TODO: consider making the entire interface noexcept
     llm_build_context(
-        llama_context & lctx,
-        const llama_ubatch & ubatch,
-        const llm_build_cb & cb,
+        llama_context & lctx_,
+        const llama_ubatch & ubatch_,
+        const llm_build_cb & cb_,
         bool worst_case) :
-        model (lctx.model),
-        lctx (lctx),
+        model (lctx_.model),
+        lctx (lctx_),
         hparams (model.hparams),
-        cparams (lctx.cparams),
-        ubatch (ubatch),
-        kv_self (lctx.kv_self),
+        cparams (lctx_.cparams),
+        ubatch (ubatch_),
+        kv_self (lctx_.kv_self),
         n_embd (hparams.n_embd),
         n_layer (hparams.n_layer),
         n_rot (hparams.n_rot),

@@ -1119,17 +1119,17 @@ struct llm_build_context {
         beta_slow (cparams.yarn_beta_slow),
         norm_eps (hparams.f_norm_eps),
         norm_rms_eps (hparams.f_norm_rms_eps),
-        n_tokens (ubatch.n_tokens),
+        n_tokens (ubatch_.n_tokens),
         n_kv (worst_case ? kv_self.size : kv_self.n),
-        n_outputs (worst_case ? n_tokens : lctx.n_outputs),
-        n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd),
+        n_outputs (worst_case ? n_tokens : lctx_.n_outputs),
+        n_outputs_enc (worst_case ? n_tokens : lctx_.embd_enc.size() / hparams.n_embd),
         kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
         n_ctx_orig (cparams.n_ctx_orig_yarn),
         flash_attn (cparams.flash_attn),
         pooling_type (cparams.pooling_type),
         rope_type (hparams.rope_type),
-        cb (cb),
-        buf_compute_meta (lctx.buf_compute_meta) {
+        cb (cb_),
+        buf_compute_meta (lctx_.buf_compute_meta) {
         // all initializations should be done in init()
     }
@@ -1174,14 +1174,15 @@ struct llm_build_context {
         ggml_set_input(lctx.inp_K_shift);

         for (int il = 0; il < n_layer; ++il) {
-            const int64_t n_head_kv = hparams.n_head_kv(il);
-            const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+            const int64_t n_head_kv_i = hparams.n_head_kv(il);
+            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(il);

             struct ggml_tensor * rope_factors = build_rope_factors(il);
             struct ggml_tensor * k =
                 ggml_view_3d(ctx0, kv_self.k_l[il],
-                    n_embd_head_k, n_head_kv, n_ctx,
+                    n_embd_head_k, n_head_kv_i, n_ctx,
                     ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
-                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i),
                     0);

             struct ggml_tensor * tmp;
@@ -1231,18 +1232,18 @@
         }

         for (int il = 0; il < n_layer; ++il) {
-            const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-            const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(il);
+            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(il);

             ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
-                n_embd_k_gqa, nm,
-                ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+                n_embd_k_gqa_i, nm,
+                ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i),
+                ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i*i));

             ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
-                n_embd_k_gqa, nm,
-                ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+                n_embd_k_gqa_i, nm,
+                ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i),
+                ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i*id));

             ggml_tensor * view_v_src;
             ggml_tensor * view_v_dst;

@@ -1250,22 +1251,22 @@
             if (flash_attn) {
                 // NOTE: the V cache is not transposed when using flash attention
                 view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                    n_embd_v_gqa, nm,
-                    ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                    ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+                    n_embd_v_gqa_i, nm,
+                    ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i),
+                    ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i*i));

                 view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                    n_embd_v_gqa, nm,
-                    ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                    ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+                    n_embd_v_gqa_i, nm,
+                    ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i),
+                    ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i*id));
             } else {
                 view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                    nm, n_embd_v_gqa,
+                    nm, n_embd_v_gqa_i,
                     ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
                     ggml_row_size(kv_self.v_l[il]->type, i));

                 view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                    nm, n_embd_v_gqa,
+                    nm, n_embd_v_gqa_i,
                     ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
                     ggml_row_size(kv_self.v_l[il]->type, id));
             }
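The view offsets above address whole KV cells: the stride argument is the byte width of one cell and the final argument is the byte offset of cell i (or id). A tiny arithmetic sketch of how those byte values come out, approximating ggml_row_size(type, n) as n times the element size, which holds for non-quantized cache types (all numbers here are made up for illustration):

    #include <cstddef>
    #include <cstdio>

    int main() {
        // toy per-layer K cache laid out as n_embd_k_gqa elements per cell, F16 storage
        const size_t type_size    = 2;    // bytes per F16 element (illustrative)
        const size_t n_embd_k_gqa = 1024; // per-layer K width for this layer
        const size_t i  = 7;              // source cell index
        const size_t nm = 16;             // number of cells being moved

        // for a non-quantized type, ggml_row_size(type, n) is simply n * type_size
        const size_t row_stride = n_embd_k_gqa * type_size;     // byte stride of one cell
        const size_t src_offset = n_embd_k_gqa * i * type_size; // byte offset of cell i

        printf("view covers %zu bytes starting at byte %zu\n", nm * row_stride, src_offset);
        return 0;
    }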
@@ -1459,7 +1460,6 @@
     }

     struct ggml_tensor * llm_build_inp_embd_enc() {
-        const int64_t n_embd = hparams.n_embd;
         lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc);
         ggml_set_input(lctx.inp_embd_enc);
         cb(lctx.inp_embd_enc, "embd_enc", -1);
@@ -1476,9 +1476,6 @@
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -1553,7 +1550,6 @@
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -1642,9 +1638,6 @@
     struct ggml_cgraph * build_deci() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -1663,10 +1656,10 @@
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
-            const int64_t n_head_kv = hparams.n_head_kv(il);
-            const int64_t n_head = hparams.n_head(il);
+            const int64_t n_head_kv_i = hparams.n_head_kv(il);
+            const int64_t n_head_i = hparams.n_head(il);

-            if (n_head == 0) {
+            if (n_head_i == 0) {
                 // attention-free layer of Llama-3_1-Nemotron-51B
                 cur = inpL;
             } else {

@@ -1677,11 +1670,11 @@
                 cb(cur, "attn_norm", il);
             }

-            if (n_head > 0 && n_head_kv == 0) {
+            if (n_head_i > 0 && n_head_kv_i == 0) {
                 // "linear attention" of Llama-3_1-Nemotron-51B
                 cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
                 cb(cur, "wo", il);
-            } else if (n_head > 0) {
+            } else if (n_head_i > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
                 struct ggml_tensor * rope_factors = build_rope_factors(il);

@@ -1709,14 +1702,14 @@
                 }

                 Qcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head_i, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv_i, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );

@@ -1730,7 +1723,6 @@
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

@@ -1742,7 +1734,7 @@

             // modified to support attention-free layer of Llama-3_1-Nemotron-51B
             struct ggml_tensor * ffn_inp = cur;
-            if (n_head > 0) {
+            if (n_head_i > 0) {
                 ffn_inp = ggml_add(ctx0, cur, inpSA);
                 cb(ffn_inp, "ffn_inp", il);
             }
@@ -2141,9 +2133,6 @@
     struct ggml_cgraph * build_grok() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -2218,7 +2207,6 @@
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

@@ -2300,9 +2288,6 @@
     struct ggml_cgraph * build_dbrx() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

@@ -2370,7 +2355,6 @@
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -2659,7 +2643,7 @@

         // iterate layers
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * cur = inpL;
+            cur = inpL;

             struct ggml_tensor * Qcur;
             struct ggml_tensor * Kcur;
@@ -3553,9 +3537,6 @@
     struct ggml_cgraph * build_qwen2moe() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -3620,7 +3601,6 @@
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -4737,8 +4717,6 @@
     struct ggml_cgraph * build_gemma() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        const int64_t n_embd_head_k = hparams.n_embd_head_k;
-
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;

@@ -4845,8 +4823,6 @@
     struct ggml_cgraph * build_gemma2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        const int64_t n_embd_head_k = hparams.n_embd_head_k;
-
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;

@@ -4982,6 +4958,7 @@
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

         const int64_t n_embd_head = hparams.n_embd_head_v;

         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -5440,9 +5417,6 @@
     struct ggml_cgraph * build_olmo() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -5513,7 +5487,6 @@
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

@@ -5564,9 +5537,6 @@
     struct ggml_cgraph * build_olmo2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -5637,7 +5607,6 @@
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

@@ -5692,9 +5661,6 @@
     struct ggml_cgraph * build_olmoe() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -5764,7 +5730,6 @@
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -5832,9 +5797,9 @@
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

         for (int il = 0; il < n_layer; ++il) {
-            const int64_t n_head = hparams.n_head(il);
-            const int64_t n_head_kv = hparams.n_head_kv(il);
-            const int64_t n_head_qkv = 2*n_head_kv + n_head;
+            const int64_t n_head_i = hparams.n_head(il);
+            const int64_t n_head_kv_i = hparams.n_head_kv(il);
+            const int64_t n_head_qkv_i = 2*n_head_kv_i + n_head_i;

             cur = inpL;
             struct ggml_tensor * residual = cur;

@@ -5850,15 +5815,15 @@
             cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
             cb(cur, "wqkv", il);

-            cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
+            cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv_i, n_tokens);

-            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0));
+            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_i, n_tokens, cur->nb[1], cur->nb[2], 0));
             cb(Qcur, "Qcur", il);

-            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head));
+            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head_i));
             cb(Kcur, "Kcur", il);

-            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
+            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head_i+n_head_kv_i)));
             cb(Vcur, "Vcur", il);

             Qcur = llm_build_norm(ctx0, Qcur, hparams,

@@ -5883,7 +5848,7 @@
             );
             cb(Kcur, "Kcur", il);

-            Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens);
+            Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv_i, n_tokens);
             cb(Qcur, "Vcur", il);

             cur = llm_build_kv(ctx0, lctx, kv_self, gf,
@@ -6085,9 +6050,6 @@
     struct ggml_cgraph * build_arctic() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -6146,7 +6108,6 @@
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

@@ -6219,9 +6180,6 @@
     struct ggml_cgraph * build_deepseek() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -6295,7 +6253,6 @@
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

@@ -6376,9 +6333,6 @@
     struct ggml_cgraph * build_deepseek2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         bool is_lite = (hparams.n_layer == 27);

         // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.

@@ -6527,7 +6481,6 @@
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

@@ -6757,9 +6710,6 @@
     struct ggml_cgraph * build_t5_enc() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

@@ -6833,7 +6783,6 @@
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                n_tokens = n_outputs;
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

@@ -6889,9 +6838,6 @@
     struct ggml_cgraph * build_t5_dec() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        // mutable variable, needed during the last layer of the computation to skip unused tokens
-        int32_t n_tokens = this->n_tokens;
-
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

@@ -7033,7 +6979,6 @@
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
|
// skip computing output for unused tokens
|
||||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||||
n_tokens = n_outputs;
|
|
||||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||||
inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
|
inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
|
||||||
|
@ -7421,9 +7366,6 @@ struct llm_build_context {
|
||||||
struct ggml_cgraph * build_exaone() {
|
struct ggml_cgraph * build_exaone() {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
||||||
|
|
||||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
|
||||||
int32_t n_tokens = this->n_tokens;
|
|
||||||
|
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||||
|
@ -7497,7 +7439,6 @@ struct llm_build_context {
|
||||||
if (il == n_layer - 1) {
|
if (il == n_layer - 1) {
|
||||||
// skip computing output for unused tokens
|
// skip computing output for unused tokens
|
||||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||||
n_tokens = n_outputs;
|
|
||||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||||
}
|
}
|
||||||
|
@ -7551,9 +7492,9 @@ struct llm_build_context {
|
||||||
// Token shift state dimensions should be 2 * n_emb
|
// Token shift state dimensions should be 2 * n_emb
|
||||||
GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
|
GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
|
||||||
|
|
||||||
const int64_t n_seqs = ubatch.n_seqs;
|
const int64_t n_seqs = ubatch.n_seqs;
|
||||||
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||||
const int64_t n_tokens = ubatch.n_tokens;
|
|
||||||
GGML_ASSERT(n_seqs != 0);
|
GGML_ASSERT(n_seqs != 0);
|
||||||
GGML_ASSERT(ubatch.equal_seqs);
|
GGML_ASSERT(ubatch.equal_seqs);
|
||||||
GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
|
GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
|
||||||
|
@ -7664,9 +7605,9 @@ struct llm_build_context {
|
||||||
|
|
||||||
GGML_ASSERT(n_embd == hparams.n_embd_k_s());
|
GGML_ASSERT(n_embd == hparams.n_embd_k_s());
|
||||||
|
|
||||||
const int64_t n_seqs = ubatch.n_seqs;
|
const int64_t n_seqs = ubatch.n_seqs;
|
||||||
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||||
const int64_t n_tokens = ubatch.n_tokens;
|
|
||||||
GGML_ASSERT(n_seqs != 0);
|
GGML_ASSERT(n_seqs != 0);
|
||||||
GGML_ASSERT(ubatch.equal_seqs);
|
GGML_ASSERT(ubatch.equal_seqs);
|
||||||
GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
|
GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
|
||||||
|
@ -7779,9 +7720,6 @@ struct llm_build_context {
|
||||||
struct ggml_cgraph * build_chameleon() {
|
struct ggml_cgraph * build_chameleon() {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
||||||
|
|
||||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
|
||||||
int32_t n_tokens = this->n_tokens;
|
|
||||||
|
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||||
|
@ -7878,7 +7816,6 @@ struct llm_build_context {
|
||||||
if (il == n_layer - 1) {
|
if (il == n_layer - 1) {
|
||||||
// skip computing output for unused tokens
|
// skip computing output for unused tokens
|
||||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||||
n_tokens = n_outputs;
|
|
||||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||||
}
|
}
|
||||||
|
|
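The recurring change in these hunks drops the mutable local that copied the build context's n_tokens member (and was reassigned to n_outputs at the last layer), leaving the row reduction to the ggml_get_rows calls on inp_out_ids. Below is a minimal, self-contained C++ sketch of that shadowing pattern and its replacement; the struct name, member values, and helper functions are hypothetical stand-ins for illustration, not code from this patch.

// sketch.cpp - illustrative only; hypothetical stand-in for the builder pattern above
#include <cstdint>
#include <cstdio>

struct build_context_sketch {
    int32_t n_tokens  = 8; // tokens in the current ubatch (member)
    int32_t n_outputs = 2; // tokens whose outputs are actually needed

    // before: a mutable local copies and shadows the member, then is reassigned
    int32_t rows_before(int il, int n_layer) const {
        int32_t n_tokens = this->n_tokens;   // shadows build_context_sketch::n_tokens
        if (il == n_layer - 1) {
            n_tokens = n_outputs;            // skip computing output for unused tokens
        }
        return n_tokens;
    }

    // after: no shadowing local; the last layer reads n_outputs directly
    int32_t rows_after(int il, int n_layer) const {
        return (il == n_layer - 1) ? n_outputs : n_tokens;
    }
};

int main() {
    build_context_sketch ctx;
    // both variants agree on how many rows remain at the last layer
    std::printf("before=%d after=%d\n",
                (int) ctx.rows_before(31, 32), (int) ctx.rows_after(31, 32));
    return 0;
}

Since the gather through inp_out_ids already shrinks cur and inpSA to the output rows, the reassigned copy of n_tokens presumably carried no extra information, which is why it can be removed in each builder.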