Merge 'origin/master' into hipblas

commit b67cc50dad
Henri Vasserman 2023-05-03 15:04:51 +03:00
16 changed files with 351 additions and 180 deletions


@@ -77,21 +77,19 @@ option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 # Build info header
 #
 
-# Write header template to binary dir to keep source directory clean
-file(WRITE "${CMAKE_BINARY_DIR}/BUILD_INFO.h.in" "\
-#ifndef BUILD_INFO_H\n\
-#define BUILD_INFO_H\n\
-\n\
-#define BUILD_NUMBER @BUILD_NUMBER@\n\
-#define BUILD_COMMIT \"@BUILD_COMMIT@\"\n\
-\n\
-#endif // BUILD_INFO_H\n\
-")
-
 # Generate initial build-info.h
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
 
 if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
+    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git")
+
+    # Is git submodule
+    if(NOT IS_DIRECTORY "${GIT_DIR}")
+        file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
+        string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
+        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}")
+    endif()
+
     # Add a custom target for build-info.h
     add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")

@@ -101,7 +99,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
         COMMENT "Generating build details from Git"
         COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/.git/index"
+        DEPENDS "${GIT_DIR}/index"
         VERBATIM
     )
 else()

@@ -389,8 +387,11 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
             add_compile_options(-mavx512vnni)
         endif()
     endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+    message(STATUS "PowerPC detected")
+    add_compile_options(-mcpu=native -mtune=native)
+    #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
 else()
-    # TODO: support PowerPC
     message(STATUS "Unknown architecture")
 endif()


@@ -226,7 +226,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
-	@scripts/build-info.sh > $@.tmp
+	@sh scripts/build-info.sh > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
 		mv $@.tmp $@; \
 	else \


@@ -38,9 +38,9 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {
 
 #define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
 
-#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5ld x %5ld x %5ld, nb = (%5li, %5li, %5li) - ", #TENSOR, \
+#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
     TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
-    TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
+    (int) TENSOR->ne[0], (int) TENSOR->ne[1], (int) TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
     { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
 
 struct benchmark_params_struct {

@@ -138,7 +138,7 @@ int main(int argc, char ** argv) {
     ctx = ggml_init(params);
     if (!ctx) {
         fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-        return false;
+        return 1;
     }
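The two benchmark fixes above are small but instructive: `ggml_tensor::ne` holds `int64_t` values, so printing them with `%ld` only happens to work on LP64 platforms, and `return false;` from `main()` silently becomes `return 0;`. A standalone sketch (not taken from the repository) of the portable ways to print the dimensions:

```cpp
// Minimal sketch: printing int64_t tensor dimensions portably.
#include <cinttypes>
#include <cstdio>

int main() {
    int64_t ne[3] = {4096, 11008, 1};   // hypothetical tensor shape

    // Option 1: <cinttypes> macros expand to the right conversion specifier everywhere.
    std::printf("ne = %" PRId64 " x %" PRId64 " x %" PRId64 "\n", ne[0], ne[1], ne[2]);

    // Option 2 (what the benchmark now does): cast to int, fine for small dimensions.
    std::printf("ne = %5d x %5d x %5d\n", (int) ne[0], (int) ne[1], (int) ne[2]);
    return 0;
}
```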


@@ -66,6 +66,33 @@ int32_t get_num_physical_cores() {
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }
 
+std::string process_escapes(const char* input) {
+    std::string output;
+
+    if (input != nullptr) {
+        std::size_t input_len = std::strlen(input);
+        output.reserve(input_len);
+
+        for (std::size_t i = 0; i < input_len; ++i) {
+            if (input[i] == '\\' && i + 1 < input_len) {
+                switch (input[++i]) {
+                    case 'n':  output.push_back('\n'); break;
+                    case 't':  output.push_back('\t'); break;
+                    case '\'': output.push_back('\''); break;
+                    case '\"': output.push_back('\"'); break;
+                    case '\\': output.push_back('\\'); break;
+                    default:   output.push_back('\\');
+                               output.push_back(input[i]); break;
+                }
+            } else {
+                output.push_back(input[i]);
+            }
+        }
+    }
+
+    return output;
+}
+
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     bool invalid_param = false;
     std::string arg;
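The practical effect is that escape sequences typed on the command line now reach the model as real characters: `-p "Hello\nWorld"` produces a prompt containing an actual newline. A minimal sketch of the behaviour (hypothetical test program; it assumes the `process_escapes` definition above, i.e. common.cpp, is compiled in, since the helper is not declared in a header):

```cpp
#include <cassert>
#include <string>

std::string process_escapes(const char* input);  // defined in common.cpp above

int main() {
    // On the shell, -p "Hello\nWorld\t!" arrives here as the literal characters '\' 'n' etc.
    assert(process_escapes("Hello\\nWorld\\t!") == "Hello\nWorld\t!");
    // Unrecognised escapes are passed through untouched.
    assert(process_escapes("C:\\models") == "C:\\models");
    // A null pointer yields an empty string.
    assert(process_escapes(nullptr).empty());
    return 0;
}
```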
@@ -91,7 +118,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.prompt = argv[i];
+            params.prompt = process_escapes(argv[i]);
         } else if (arg == "--session") {
             if (++i >= argc) {
                 invalid_param = true;

@@ -324,7 +351,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "                        run in interactive mode and poll user input upon seeing PROMPT (can be\n");
     fprintf(stderr, "                        specified more than once for multiple prompts).\n");
     fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
-    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for <= 0)\n");
+    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
     fprintf(stderr, "                        prompt to start generation with (default: empty)\n");

@@ -405,6 +432,39 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
     return res;
 }
 
+struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
+    auto lparams = llama_context_default_params();
+
+    lparams.n_ctx      = params.n_ctx;
+    lparams.n_parts    = params.n_parts;
+    lparams.seed       = params.seed;
+    lparams.f16_kv     = params.memory_f16;
+    lparams.use_mmap   = params.use_mmap;
+    lparams.use_mlock  = params.use_mlock;
+    lparams.logits_all = params.perplexity;
+    lparams.embedding  = params.embedding;
+
+    llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
+
+    if (lctx == NULL) {
+        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+        return NULL;
+    }
+
+    if (!params.lora_adapter.empty()) {
+        int err = llama_apply_lora_from_file(lctx,
+                                             params.lora_adapter.c_str(),
+                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+                                             params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            return NULL;
+        }
+    }
+
+    return lctx;
+}
+
 /* Keep track of current color of output, and emit ANSI code if it changes. */
 void set_console_color(console_state & con_st, console_color_t color) {
     if (con_st.use_color && con_st.color != color) {
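This helper removes the copy-pasted model-loading blocks from the examples (see the embedding, main and perplexity changes below). A sketch of a typical caller, with a hypothetical setup and error handling trimmed:

```cpp
#include "common.h"
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    // One call now covers llama_context_params setup, llama_init_from_file()
    // and the optional LoRA adapter from --lora / --lora-base.
    llama_context * ctx = llama_init_from_gpt_params(params);
    if (ctx == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

    // ... tokenize, evaluate, sample ...

    llama_free(ctx);
    return 0;
}
```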


@@ -77,6 +77,12 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
 
+//
+// Model utils
+//
+
+struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
+
 //
 // Console utils
 //


@@ -21,7 +21,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
-    if (params.seed <= 0) {
+    if (params.seed < 0) {
         params.seed = time(NULL);
     }

@@ -35,24 +35,10 @@ int main(int argc, char ** argv) {
     llama_context * ctx;
 
     // load the model
-    {
-        auto lparams = llama_context_default_params();
-
-        lparams.n_ctx      = params.n_ctx;
-        lparams.n_parts    = params.n_parts;
-        lparams.seed       = params.seed;
-        lparams.f16_kv     = params.memory_f16;
-        lparams.logits_all = params.perplexity;
-        lparams.use_mmap   = params.use_mmap;
-        lparams.use_mlock  = params.use_mlock;
-        lparams.embedding  = params.embedding;
-
-        ctx = llama_init_from_file(params.model.c_str(), lparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
+    ctx = llama_init_from_gpt_params(params);
+    if (ctx == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        return 1;
     }
 
     // print system information


@@ -130,7 +130,7 @@ It is important to note that the generated text may be shorter than the specified
 
 - `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1).
 
-The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than or equal to 0, a random seed will be used, which will result in different outputs on each run.
+The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
 
 ### Temperature


@@ -22,6 +22,9 @@
 #include <signal.h>
 #include <unistd.h>
 #elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
 #include <signal.h>
 #endif

@@ -84,7 +87,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
-    if (params.seed <= 0) {
+    if (params.seed < 0) {
         params.seed = time(NULL);
     }

@@ -101,34 +104,11 @@ int main(int argc, char ** argv) {
     llama_context * ctx;
     g_ctx = &ctx;
 
-    // load the model
-    {
-        auto lparams = llama_context_default_params();
-
-        lparams.n_ctx      = params.n_ctx;
-        lparams.n_parts    = params.n_parts;
-        lparams.seed       = params.seed;
-        lparams.f16_kv     = params.memory_f16;
-        lparams.use_mmap   = params.use_mmap;
-        lparams.use_mlock  = params.use_mlock;
-
-        ctx = llama_init_from_file(params.model.c_str(), lparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-    }
-
-    if (!params.lora_adapter.empty()) {
-        int err = llama_apply_lora_from_file(ctx,
-                                             params.lora_adapter.c_str(),
-                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            return 1;
-        }
+    // load the model and apply lora adapter, if any
+    ctx = llama_init_from_gpt_params(params);
+    if (ctx == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        return 1;
     }
 
     // print system information
@@ -263,7 +243,10 @@ int main(int argc, char ** argv) {
         sigint_action.sa_flags = 0;
         sigaction(SIGINT, &sigint_action, NULL);
 #elif defined (_WIN32)
-        signal(SIGINT, sigint_handler);
+        auto console_ctrl_handler = [](DWORD ctrl_type) -> BOOL {
+            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
+        };
+        SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
 
         fprintf(stderr, "%s: interactive mode on.\n", __func__);
@@ -298,7 +281,7 @@ int main(int argc, char ** argv) {
     }
 
     bool is_antiprompt = false;
-    bool input_noecho  = false;
+    bool input_echo    = true;
 
     // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
     // if we loaded a session with at least 75% similarity. It's currently just used to speed up the

@@ -306,9 +289,9 @@ int main(int argc, char ** argv) {
     bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);
 
     int n_past             = 0;
     int n_remain           = params.n_predict;
     int n_consumed         = 0;
     int n_session_consumed = 0;
 
     // the first thing we will do is to output the prompt, so set color accordingly

@@ -413,7 +396,7 @@ int main(int argc, char ** argv) {
             llama_token id = 0;
 
             {
                 auto logits  = llama_get_logits(ctx);
                 auto n_vocab = llama_n_vocab(ctx);
 
                 // Apply params.logit_bias map

@@ -485,7 +468,7 @@ int main(int argc, char ** argv) {
             embd.push_back(id);
 
             // echo this to console
-            input_noecho = false;
+            input_echo = true;
 
             // decrement remaining sampling budget
             --n_remain;

@@ -503,14 +486,14 @@ int main(int argc, char ** argv) {
             }
         }
 
         // display text
-        if (!input_noecho) {
+        if (input_echo) {
             for (auto id : embd) {
                 printf("%s", llama_token_to_str(ctx, id));
             }
             fflush(stdout);
         }
 
         // reset color to default if we there is no pending user input
-        if (!input_noecho && (int)embd_inp.size() == n_consumed) {
+        if (input_echo && (int) embd_inp.size() == n_consumed) {
             set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
         }

@@ -542,11 +525,6 @@ int main(int argc, char ** argv) {
             // potentially set color to indicate we are taking user input
             set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
 
-#if defined (_WIN32)
-            // Windows: must reactivate sigint handler after each signal
-            signal(SIGINT, sigint_handler);
-#endif
-
             if (params.instruct) {
                 printf("\n> ");
             }

@@ -605,7 +583,7 @@ int main(int argc, char ** argv) {
                 n_remain -= line_inp.size();
             }
 
-            input_noecho = true; // do not echo this again
+            input_echo = false; // do not echo this again
         }
 
         if (n_past > 0) {

@@ -630,10 +608,6 @@ int main(int argc, char ** argv) {
         }
     }
 
-#if defined (_WIN32)
-    signal(SIGINT, SIG_DFL);
-#endif
-
     llama_print_timings(ctx);
     llama_free(ctx);


@@ -109,7 +109,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
-    if (params.seed <= 0) {
+    if (params.seed < 0) {
         params.seed = time(NULL);
     }

@@ -122,36 +122,11 @@ int main(int argc, char ** argv) {
     llama_context * ctx;
 
-    // load the model
-    {
-        auto lparams = llama_context_default_params();
-
-        lparams.n_ctx      = params.n_ctx;
-        lparams.n_parts    = params.n_parts;
-        lparams.seed       = params.seed;
-        lparams.f16_kv     = params.memory_f16;
-        lparams.logits_all = params.perplexity;
-        lparams.use_mmap   = params.use_mmap;
-        lparams.use_mlock  = params.use_mlock;
-        lparams.embedding  = params.embedding;
-
-        ctx = llama_init_from_file(params.model.c_str(), lparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-    }
-
-    if (!params.lora_adapter.empty()) {
-        int err = llama_apply_lora_from_file(ctx,
-                                             params.lora_adapter.c_str(),
-                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            return 1;
-        }
+    // load the model and apply lora adapter, if any
+    ctx = llama_init_from_gpt_params(params);
+    if (ctx == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        return 1;
     }
 
     // print system information

ggml.c

@@ -671,35 +671,91 @@ float vmaxvq_f32(float32x4_t v) {
 }
 
 int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
-    return vget_low_s8(vcombine_s8(a, b));
+    int8x8_t res;
+
+    res[0] = a[0]; res[1] = b[0];
+    res[2] = a[1]; res[3] = b[1];
+    res[4] = a[2]; res[5] = b[2];
+    res[6] = a[3]; res[7] = b[3];
+
+    return res;
 }
 
 int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
-    return vget_high_s8(vcombine_s8(a, b));
+    int8x8_t res;
+
+    res[0] = a[4]; res[1] = b[4];
+    res[2] = a[5]; res[3] = b[5];
+    res[4] = a[6]; res[5] = b[6];
+    res[6] = a[7]; res[7] = b[7];
+
+    return res;
 }
 
 uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
-    return vget_low_u8(vcombine_u8(a, b));
+    uint8x8_t res;
+
+    res[0] = a[0]; res[1] = b[0];
+    res[2] = a[1]; res[3] = b[1];
+    res[4] = a[2]; res[5] = b[2];
+    res[6] = a[3]; res[7] = b[3];
+
+    return res;
 }
 
 uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
-    return vget_high_u8(vcombine_u8(a, b));
+    uint8x8_t res;
+
+    res[0] = a[4]; res[1] = b[4];
+    res[2] = a[5]; res[3] = b[5];
+    res[4] = a[6]; res[5] = b[6];
+    res[6] = a[7]; res[7] = b[7];
+
+    return res;
 }
 
 int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
-    return vcombine_s8(vget_low_s8(a), vget_low_s8(b));
+    int8x16_t res;
+
+    res[0]  = a[0]; res[1]  = b[0]; res[2]  = a[1]; res[3]  = b[1];
+    res[4]  = a[2]; res[5]  = b[2]; res[6]  = a[3]; res[7]  = b[3];
+    res[8]  = a[4]; res[9]  = b[4]; res[10] = a[5]; res[11] = b[5];
+    res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
+
+    return res;
 }
 
 int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
-    return vcombine_s8(vget_high_s8(a), vget_high_s8(b));
+    int8x16_t res;
+
+    res[0]  = a[8];  res[1]  = b[8];  res[2]  = a[9];  res[3]  = b[9];
+    res[4]  = a[10]; res[5]  = b[10]; res[6]  = a[11]; res[7]  = b[11];
+    res[8]  = a[12]; res[9]  = b[12]; res[10] = a[13]; res[11] = b[13];
+    res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
+
+    return res;
 }
 
 uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
-    return vcombine_u8(vget_low_u8(a), vget_low_u8(b));
+    uint8x16_t res;
+
+    res[0]  = a[0]; res[1]  = b[0]; res[2]  = a[1]; res[3]  = b[1];
+    res[4]  = a[2]; res[5]  = b[2]; res[6]  = a[3]; res[7]  = b[3];
+    res[8]  = a[4]; res[9]  = b[4]; res[10] = a[5]; res[11] = b[5];
+    res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
+
+    return res;
 }
 
 uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
-    return vcombine_u8(vget_high_u8(a), vget_high_u8(b));
+    uint8x16_t res;
+
+    res[0]  = a[8];  res[1]  = b[8];  res[2]  = a[9];  res[3]  = b[9];
+    res[4]  = a[10]; res[5]  = b[10]; res[6]  = a[11]; res[7]  = b[11];
+    res[8]  = a[12]; res[9]  = b[12]; res[10] = a[13]; res[11] = b[13];
+    res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
+
+    return res;
 }
 
 int32x4_t vcvtnq_s32_f32(float32x4_t v) {
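These helpers are only compiled as fallbacks for toolchains whose `<arm_neon.h>` lacks the `vzip` intrinsics; the old one-liners returned the wrong lanes (`vget_low_s8(vcombine_s8(a, b))` is simply `a`). A hypothetical sanity check of the interleave semantics, assuming an ARM target where the real intrinsics (or these shims) are available:

```cpp
#include <arm_neon.h>
#include <cassert>
#include <cstdint>

int main() {
    uint8_t a[16], b[16], out[16];
    for (int i = 0; i < 16; ++i) { a[i] = uint8_t(i); b[i] = uint8_t(100 + i); }

    // vzip1q_u8 interleaves the LOW halves: a0,b0,a1,b1,...,a7,b7
    vst1q_u8(out, vzip1q_u8(vld1q_u8(a), vld1q_u8(b)));
    for (int i = 0; i < 8; ++i) {
        assert(out[2*i + 0] == a[i]);
        assert(out[2*i + 1] == b[i]);
    }

    // vzip2q_u8 interleaves the HIGH halves: a8,b8,...,a15,b15
    vst1q_u8(out, vzip2q_u8(vld1q_u8(a), vld1q_u8(b)));
    for (int i = 0; i < 8; ++i) {
        assert(out[2*i + 0] == a[8 + i]);
        assert(out[2*i + 1] == b[8 + i]);
    }
    return 0;
}
```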
@@ -826,6 +882,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         float max = 0.0f;
         float min = 0.0f;
 
+        vector float asrcv [8];
         vector float srcv [8];
         vector float maxv[8];
         vector float minv[8];

@@ -4541,6 +4598,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_cycles  =*/ 0,
         /*.perf_time_us =*/ 0,
         /*.data         =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
+        /*.name         =*/ { 0 },
         /*.pad          =*/ { 0 },
     };

@@ -4895,6 +4953,15 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
     return (float *)(tensor->data);
 }
 
+const char * ggml_get_name(const struct ggml_tensor * tensor) {
+    return tensor->name;
+}
+
+void ggml_set_name(struct ggml_tensor * tensor, const char * name) {
+    strncpy(tensor->name, name, sizeof(tensor->name));
+    tensor->name[sizeof(tensor->name) - 1] = '\0';
+}
+
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
         const struct ggml_tensor * src) {
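The name field is mainly diagnostic: it is what now shows up in `ggml_graph_dump_dot()` output and in the tensor labels llama.cpp sets below (`"Qcur"`, `"KQ_scaled"`, ...). A small sketch of the new API (assumes a ggml build that includes this patch; buffer size is arbitrary):

```cpp
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = { /* mem_size   */ 16 * 1024 * 1024,
                                       /* mem_buffer */ NULL,
                                       /* no_alloc   */ false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    ggml_set_name(t, "my_activations");   // longer names are truncated to 31 chars + NUL

    std::printf("tensor name: %s\n", ggml_get_name(t));

    ggml_free(ctx);
    return 0;
}
```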
@@ -5994,6 +6061,7 @@ struct ggml_tensor * ggml_diag_mask_inf(
     //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
     struct ggml_tensor * b = ggml_new_i32(ctx, n_past);
+    ggml_set_name(b, "n_past");
 
     result->op   = GGML_OP_DIAG_MASK_INF;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ -6051,6 +6119,7 @@ struct ggml_tensor * ggml_rope(
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
+    ggml_set_name(b, "n_past, n_dims, mode");
 
     result->op   = GGML_OP_ROPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -12118,10 +12187,16 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             snprintf(color, sizeof(color), "white");
         }
 
-        fprintf(fp, "  \"%p\" [ \
-style = filled; fillcolor = %s; shape = record; \
-label=\"%d [%" PRId64 ", %" PRId64 "] | <x>%s",
-                (void *) node, color,
+        fprintf(fp, "  \"%p\" [ "
+                    "style = filled; fillcolor = %s; shape = record; "
+                    "label=\"",
+                (void *) node, color);
+
+        if (strlen(node->name) > 0) {
+            fprintf(fp, "%s |", node->name);
+        }
+
+        fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s",
                 i, node->ne[0], node->ne[1],
                 GGML_OP_SYMBOL[node->op]);

@@ -12137,18 +12212,26 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             snprintf(color, sizeof(color), "pink");
         }
 
-        if (ggml_nelements(node) == 1) {
-            fprintf(fp, "  \"%p\" [ \
-style = filled; fillcolor = %s; shape = record; \
-label=\"<x>%.1e\"; ]\n",
-                    (void *) node, color, (double)ggml_get_f32_1d(node, 0));
-        } else {
-            fprintf(fp, "  \"%p\" [ \
-style = filled; fillcolor = %s; shape = record; \
-label=\"<x>CONST %d [%" PRId64 ", %" PRId64 "]\"; ]\n",
-                    (void *) node, color,
-                    i, node->ne[0], node->ne[1]);
+        fprintf(fp, "  \"%p\" [ "
+                    "style = filled; fillcolor = %s; shape = record; "
+                    "label=\"<x>",
+                (void *) node, color);
+
+        if (strlen(node->name) > 0) {
+            fprintf(fp, "%s | ", node->name);
+        }
+
+        if (ggml_nelements(node) == 1) {
+            if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
+                fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
+            }
+            else {
+                fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
+            }
+        }
+        else {
+            fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
         }
+
+        fprintf(fp, "\"; ]\n");
     }
 
     for (int i = 0; i < gb->n_nodes; i++) {

ggml.h

@@ -350,7 +350,10 @@ extern "C" {
         int64_t perf_time_us;
 
         void * data;
-        char padding[8];
+
+        char name[32];
+
+        char padding[8]; // TODO: remove and add padding to name?
     };
 
     // computation graph

@@ -473,6 +476,9 @@ extern "C" {
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
+    GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
+    GGML_API void         ggml_set_name(struct ggml_tensor * tensor, const char * name);
+
     //
     // operations on tensors with backpropagation
     //

llama.cpp

@@ -659,6 +659,7 @@ struct llama_model_loader {
             LLAMA_ASSERT(lt.ne.size() == 1);
             tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
         }
+        ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;

@@ -798,6 +799,8 @@ static bool kv_cache_init(
     cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
     cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    ggml_set_name(cache.k, "cache_k");
+    ggml_set_name(cache.v, "cache_v");
 
     return true;
 }

@@ -806,7 +809,7 @@ struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx      =*/ 512,
         /*.n_parts    =*/ -1,
-        /*.seed       =*/ 0,
+        /*.seed       =*/ -1,
         /*.f16_kv     =*/ false,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,

@@ -1084,6 +1087,7 @@ static bool llama_eval_internal(
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);

@@ -1110,6 +1114,8 @@ static bool llama_eval_internal(
             // compute Q and K and RoPE them
             struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
             struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            ggml_set_name(Qcur, "Qcur");
+            ggml_set_name(Kcur, "Kcur");
 
             // store key and value to memory
             {

@@ -1130,6 +1136,7 @@ static bool llama_eval_internal(
                 ggml_permute(ctx0,
                         Qcur,
                         0, 2, 1, 3);
+            ggml_set_name(Q, "Q");
 
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
@@ -1137,21 +1144,26 @@ static bool llama_eval_internal(
                         ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
                         n_embd/n_head, n_head, n_past + N),
                     0, 2, 1, 3);
+            ggml_set_name(K, "K");
 
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
-                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
+            struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
+            ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
+
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+            ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
             // split cached V into n_head heads
             struct ggml_tensor * V =

@@ -1160,9 +1172,11 @@ static bool llama_eval_internal(
                         n_ctx*ggml_element_size(kv_self.v),
                         n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
                         il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+            ggml_set_name(V, "V");
 
 #if 1
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
 #else
             // make V contiguous in memory to speed up the matmul, however we waste time on the copy
             // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation

@@ -1173,11 +1187,13 @@ static bool llama_eval_internal(
             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");
 
             // cur = KQV_merged.contiguous().view(n_embd, N)
             cur = ggml_cpy(ctx0,
                     KQV_merged,
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            ggml_set_name(cur, "KQV_merged_contiguous");
 
             // projection (no bias)
             cur = ggml_mul_mat(ctx0,

@@ -1269,6 +1285,9 @@ static bool llama_eval_internal(
     //embd_w.resize(n_vocab*N);
     //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
 
+    // update kv token count
+    lctx.model.kv_self.n = n_past + N;
+
     // extract logits
     {
         auto & logits_out = lctx.logits;
@@ -1686,7 +1705,7 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
     }
 }
 
-void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty) {
+void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
     if (last_tokens_size == 0 || penalty == 1.0f) {
         return;
     }

@@ -1715,7 +1734,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
     }
 }
 
-void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
+void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
     if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
         return;
     }

@@ -2037,7 +2056,7 @@ struct llama_context * llama_init_from_file(
     llama_context * ctx = new llama_context;
 
-    if (params.seed <= 0) {
+    if (params.seed < 0) {
         params.seed = time(NULL);
     }

@@ -2379,13 +2398,13 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
 #define LLAMA_MAX_RNG_STATE 64*1024
 
 void llama_set_rng_seed(struct llama_context * ctx, int seed) {
-    if (seed <= 0) {
+    if (seed < 0) {
         seed = time(NULL);
     }
     ctx->rng.seed(seed);
 }
 
-// Returns the size of the state
+// Returns the *maximum* size of the state
 size_t llama_get_state_size(const struct llama_context * ctx) {
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
     // for reference, std::mt19937(1337) serializes to 6701 bytes.
@@ -2464,21 +2483,51 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
     // copy kv cache
     {
-        const size_t kv_size = ctx->model.kv_self.buf.size;
+        const auto & kv_self = ctx->model.kv_self;
+        const auto & hparams = ctx->model.hparams;
+        const int    n_layer = hparams.n_layer;
+        const int    n_embd  = hparams.n_embd;
+        const int    n_ctx   = hparams.n_ctx;
+
+        const size_t kv_size = kv_self.buf.size;
         const int    kv_ntok = llama_get_kv_cache_token_count(ctx);
 
         memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
         memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
 
         if (kv_size) {
-            memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
+            const size_t elt_size = ggml_element_size(kv_self.k);
+            char buffer[4096];
+            ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+            ggml_cgraph gf{};
+            gf.n_threads = 1;
+
+            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+            kout3d->data = out;
+            out += ggml_nbytes(kout3d);
+
+            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+            vout3d->data = out;
+            out += ggml_nbytes(vout3d);
+
+            ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
+                n_embd, kv_ntok, n_layer,
+                elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+
+            ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
+                kv_ntok, n_embd, n_layer,
+                elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+
+            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+            ggml_graph_compute(cpy_ctx, &gf);
         }
     }
 
     const size_t written  = out - dest;
-    const size_t expected = llama_get_state_size(ctx);
+    const size_t max_size = llama_get_state_size(ctx);
 
-    LLAMA_ASSERT(written == expected);
+    LLAMA_ASSERT(written <= max_size);
 
     return written;
 }
@@ -2536,6 +2585,12 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
     // set kv cache
     {
+        const auto & kv_self = ctx->model.kv_self;
+        const auto & hparams = ctx->model.hparams;
+        const int    n_layer = hparams.n_layer;
+        const int    n_embd  = hparams.n_embd;
+        const int    n_ctx   = hparams.n_ctx;
+
         size_t kv_size;
         int    kv_ntok;

@@ -2543,25 +2598,42 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
 
         if (kv_size) {
-            LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
+            LLAMA_ASSERT(kv_self.buf.size == kv_size);
 
-            void * k_data = ctx->model.kv_self.k->data; // remember data pointers
-            void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
+            const size_t elt_size = ggml_element_size(kv_self.k);
+            char buffer[4096];
+            ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+            ggml_cgraph gf{};
+            gf.n_threads = 1;
 
-            memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
+            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+            kin3d->data = (void *) in;
+            in += ggml_nbytes(kin3d);
 
-            ctx->model.kv_self.k->data = k_data; // restore correct data pointers
-            ctx->model.kv_self.v->data = v_data;
+            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+            vin3d->data = (void *) in;
+            in += ggml_nbytes(vin3d);
 
+            ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
+                n_embd, kv_ntok, n_layer,
+                elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+
+            ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
+                kv_ntok, n_embd, n_layer,
+                elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+
+            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+            ggml_graph_compute(cpy_ctx, &gf);
         }
 
         ctx->model.kv_self.n = kv_ntok;
     }
 
     const size_t nread    = in - src;
-    const size_t expected = llama_get_state_size(ctx);
+    const size_t max_size = llama_get_state_size(ctx);
 
-    LLAMA_ASSERT(nread == expected);
+    LLAMA_ASSERT(nread <= max_size);
 
     return nread;
 }
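Because the serialized blob now packs only the `kv_ntok` tokens actually present in the KV cache, `llama_get_state_size()` becomes an upper bound rather than the exact size; callers should keep the value returned by `llama_copy_state_data()`. A minimal save/restore sketch against the public API (buffer handling only, everything else assumed set up):

```cpp
#include "llama.h"
#include <vector>

// Round-trip the context state through a byte buffer. After this change
// llama_get_state_size() is a maximum, so remember what was actually written.
std::vector<uint8_t> save_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx));     // upper bound
    const size_t written = llama_copy_state_data(ctx, buf.data());
    buf.resize(written);                                      // compacted size
    return buf;
}

void restore_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data());
}
```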
@@ -2604,14 +2676,14 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
     // restore the context state
     {
         const size_t n_state_size_cur = file.size - file.tell();
-        const size_t n_state_size_exp = llama_get_state_size(ctx);
+        const size_t n_state_size_max = llama_get_state_size(ctx);
 
-        if (n_state_size_cur != n_state_size_exp) {
-            fprintf(stderr, "%s : the state size in session file didn't match! expected %zu, got %zu\n", __func__, n_state_size_exp, n_state_size_cur);
+        if (n_state_size_cur > n_state_size_max) {
+            fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
             return false;
         }
 
-        std::vector<uint8_t> state_data(n_state_size_cur);
+        std::vector<uint8_t> state_data(n_state_size_max);
         file.read_raw(state_data.data(), n_state_size_cur);
 
         llama_set_state_data(ctx, state_data.data());

@@ -2634,12 +2706,12 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
     // save the context state
     {
-        const size_t n_state_size = llama_get_state_size(ctx);
+        const size_t n_state_size_max = llama_get_state_size(ctx);
 
-        std::vector<uint8_t> state_data(n_state_size);
-        llama_copy_state_data(ctx, state_data.data());
+        std::vector<uint8_t> state_data(n_state_size_max);
+        const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
 
-        file.write_raw(state_data.data(), n_state_size);
+        file.write_raw(state_data.data(), n_state_size_cur);
     }
 
     return true;

llama.h

@@ -23,7 +23,7 @@
 #define LLAMA_FILE_MAGIC             'ggjt'
 #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
 #define LLAMA_SESSION_MAGIC          'ggsn'
-#define LLAMA_SESSION_VERSION        0
+#define LLAMA_SESSION_VERSION        1
 
 #ifdef __cplusplus
 extern "C" {

@@ -56,7 +56,7 @@ extern "C" {
     struct llama_context_params {
         int n_ctx;   // text context
         int n_parts; // -1 for default
-        int seed;    // RNG seed, 0 for random
+        int seed;    // RNG seed, -1 for random
 
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one

@@ -127,7 +127,8 @@ extern "C" {
     // Sets the current rng seed.
     LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
 
-    // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
+    // Returns the maximum size in bytes of the state (rng, logits, embedding
+    // and kv_cache) - will often be smaller after compacting tokens
     LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
 
     // Copies the state to the specified destination address.

@@ -192,10 +193,10 @@ extern "C" {
     // Sampling functions
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty);
+    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
 
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
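With `const` in the prototypes, token history can be passed straight from a const container; the old `(llama_token *)` casts (still visible in the tests below) are no longer required. A small sketch of a caller, assuming `ctx` and `candidates_p` are already set up elsewhere:

```cpp
#include "llama.h"
#include <vector>

void apply_penalties(llama_context * ctx, llama_token_data_array * candidates_p,
                     const std::vector<llama_token> & last_tokens) {
    // No const_cast needed any more: the API now takes const llama_token *.
    llama_sample_repetition_penalty(ctx, candidates_p,
                                    last_tokens.data(), last_tokens.size(),
                                    /*penalty =*/ 1.1f);
    llama_sample_frequency_and_presence_penalties(ctx, candidates_p,
                                    last_tokens.data(), last_tokens.size(),
                                    /*alpha_frequency =*/ 0.1f,
                                    /*alpha_presence  =*/ 0.1f);
}
```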


@@ -1,4 +1,4 @@
-set(TEMPLATE_FILE "${CMAKE_BINARY_DIR}/BUILD_INFO.h.in")
+set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in")
 set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
 set(BUILD_NUMBER 0)
 set(BUILD_COMMIT "unknown")

scripts/build-info.h.in (new file)

@@ -0,0 +1,7 @@
+#ifndef BUILD_INFO_H
+#define BUILD_INFO_H
+
+#define BUILD_NUMBER @BUILD_NUMBER@
+#define BUILD_COMMIT "@BUILD_COMMIT@"
+
+#endif // BUILD_INFO_H


@@ -131,7 +131,7 @@ void test_repetition_penalty(
     llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
     llama_sample_softmax(nullptr, &candidates_p);
     DUMP(&candidates_p);
-    llama_sample_repetition_penalty(nullptr, &candidates_p, (llama_token *)last_tokens.data(), last_tokens.size(), penalty);
+    llama_sample_repetition_penalty(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), penalty);
     llama_sample_softmax(nullptr, &candidates_p);
     DUMP(&candidates_p);

@@ -160,7 +160,7 @@ void test_frequency_presence_penalty(
     llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
     llama_sample_softmax(nullptr, &candidates_p);
     // DUMP(&candidates_p);
-    llama_sample_frequency_and_presence_penalties(nullptr, &candidates_p, (llama_token *)last_tokens.data(), last_tokens.size(), alpha_frequency, alpha_presence);
+    llama_sample_frequency_and_presence_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), alpha_frequency, alpha_presence);
     llama_sample_softmax(nullptr, &candidates_p);
     // DUMP(&candidates_p);