Merge branch 'master' into concedo_experimental
# Conflicts:
#	.github/workflows/build.yml

commit d8f7a7077a
8 changed files with 242 additions and 111 deletions
@@ -167,6 +167,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
+            // store the external file name in params
+            params.prompt_file = argv[i];
             std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
             if (params.prompt.back() == '\n') {
                 params.prompt.pop_back();
@@ -1020,10 +1022,11 @@ llama_token llama_sample_token(
             id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
         } else {
             // Temperature sampling
-            llama_sample_top_k      (ctx, &cur_p, top_k, 1);
-            llama_sample_tail_free  (ctx, &cur_p, tfs_z, 1);
-            llama_sample_typical    (ctx, &cur_p, typical_p, 1);
-            llama_sample_top_p      (ctx, &cur_p, top_p, 1);
+            size_t min_keep = std::max(1, params.n_probs);
+            llama_sample_top_k      (ctx, &cur_p, top_k, min_keep);
+            llama_sample_tail_free  (ctx, &cur_p, tfs_z, min_keep);
+            llama_sample_typical    (ctx, &cur_p, typical_p, min_keep);
+            llama_sample_top_p      (ctx, &cur_p, top_p, min_keep);
             llama_sample_temp(ctx, &cur_p, temp);

             {
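Note on the hunk above: min_keep tells each truncation sampler to keep at least params.n_probs candidates, so callers that report per-token probabilities still have enough entries left after top-k/tail-free/typical/top-p filtering. A minimal sketch of the effect, with illustrative values only:

    // illustrative only: with n_probs = 5, even an aggressive top_p keeps >= 5 candidates
    const size_t min_keep = std::max(1, /*params.n_probs=*/5);
    llama_sample_top_p(ctx, &cur_p, /*top_p=*/0.10f, min_keep);   // cur_p.size stays >= 5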
@@ -79,6 +79,7 @@ struct gpt_params {
     std::string model_draft  = "";  // draft model for speculative decoding
     std::string model_alias  = "unknown"; // model alias
     std::string prompt       = "";
+    std::string prompt_file  = "";  // store the external prompt file name
     std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
     std::string input_prefix = "";  // string to prefix user inputs with
     std::string input_suffix = "";  // string to suffix user inputs with
@@ -10,6 +10,7 @@
 #include <cstdio>
 #include <string>
 #include <vector>
+#include <ctime>

 // trim whitespace from the beginning and end of a string
 static std::string trim(const std::string & str) {
@@ -70,6 +71,26 @@ struct client {
     std::vector<llama_token> tokens_prev;
 };

+static void print_date_time() {
+    std::time_t current_time = std::time(nullptr);
+    std::tm* local_time = std::localtime(&current_time);
+    char buffer[80];
+    strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
+
+    printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
+}
+
+// Define a split string function to ...
+static std::vector<std::string> split_string(const std::string& input, char delimiter) {
+    std::vector<std::string> tokens;
+    std::istringstream stream(input);
+    std::string token;
+    while (std::getline(stream, token, delimiter)) {
+        tokens.push_back(token);
+    }
+    return tokens;
+}
+
 int main(int argc, char ** argv) {
     srand(1234);

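The split_string helper added above is a plain std::getline splitter; a small usage sketch (the strings here are only for illustration):

    // illustrative only: one prompt per line, as used for the external prompt file below
    std::vector<std::string> prompts = split_string("Who are you?\nWhat is art?", '\n');
    // prompts[0] == "Who are you?", prompts[1] == "What is art?"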
@@ -104,6 +125,23 @@ int main(int argc, char ** argv) {
     params.logits_all = true;
     std::tie(model, ctx) = llama_init_from_gpt_params(params);

+    // load the prompts from an external file if there are any
+    if (params.prompt.empty()) {
+        printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
+    } else {
+        // Output each line of the input params.prompts vector and copy to k_prompts
+        int index = 0;
+        printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
+
+        std::vector<std::string> prompts = split_string(params.prompt, '\n');
+        for (const auto& prompt : prompts) {
+            k_prompts.resize(index + 1);
+            k_prompts[index] = prompt;
+            index++;
+            printf("%3d prompt: %s\n", index, prompt.c_str());
+        }
+    }
+
     fprintf(stderr, "\n\n");
     fflush(stderr);

@@ -233,7 +271,7 @@ int main(int argc, char ** argv) {
             client.n_decoded = 0;
             client.i_batch   = batch.n_tokens - 1;

-            LOG_TEE("\033[1mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
+            LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);

             g_seq_id += 1;

@@ -336,8 +374,8 @@ int main(int argc, char ** argv) {

                 const auto t_main_end = ggml_time_us();

-                LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\nResponse: %s\n\n",
-                        client.id, client.seq_id, client.n_prompt, client.n_decoded,
+                LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
+                        client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
                         (t_main_end - client.t_start_prompt) / 1e6,
                         (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
                         n_cache_miss,
@@ -357,13 +395,21 @@ int main(int argc, char ** argv) {

     const auto t_main_end = ggml_time_us();

-    LOG_TEE("\n\n");
+    print_date_time();
+
+    LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
+    if (params.prompt_file.empty()) {
+        params.prompt_file = "used built-in defaults";
+    }
+    LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
+    LOG_TEE("Model and path used:  \033[32m%s\033[0m\n\n", params.model.c_str());
+
     LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt              ) / (t_main_end - t_main_start) * 1e6);
     LOG_TEE("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen,    (double) (n_total_gen                 ) / (t_main_end - t_main_start) * 1e6);
     LOG_TEE("Total speed (AVG):   %6s  speed: %5.2f t/s\n", "",             (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
     LOG_TEE("Cache misses:        %6d\n", n_cache_miss);

-    LOG_TEE("\n\n");
+    LOG_TEE("\n");

     llama_print_timings(ctx);

@@ -534,99 +534,21 @@ struct llama_server_context
             return result;
         }

+        {
             // out of user input, sample next token
-        const float temp = params.temp;
-        const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(model) : params.top_k;
-        const float top_p = params.top_p;
-        const float tfs_z = params.tfs_z;
-        const float typical_p = params.typical_p;
-        const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
-        const float repeat_penalty = params.repeat_penalty;
-        const float alpha_presence = params.presence_penalty;
-        const float alpha_frequency = params.frequency_penalty;
-        const int mirostat = params.mirostat;
-        const float mirostat_tau = params.mirostat_tau;
-        const float mirostat_eta = params.mirostat_eta;
-        const bool penalize_nl = params.penalize_nl;
-        const int32_t n_probs = params.n_probs;
-
-        {
-            auto *logits = llama_get_logits(ctx);
-            auto n_vocab = llama_n_vocab(model);
-
-            // Apply params.logit_bias map
-            for (const auto &it : params.logit_bias)
-            {
-                logits[it.first] += it.second;
-            }
-
             std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++)
-            {
-                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-            }
+            candidates.reserve(llama_n_vocab(model));
+
+            result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates);

             llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

-            // Apply penalties
-            float nl_logit = logits[llama_token_nl(ctx)];
-            auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
-            llama_sample_repetition_penalty(ctx, &candidates_p,
-                last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                last_n_repeat, repeat_penalty);
-            llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
-                last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                last_n_repeat, alpha_frequency, alpha_presence);
-            if (!penalize_nl)
-            {
-                logits[llama_token_nl(ctx)] = nl_logit;
-            }
-
-            if (grammar != nullptr) {
-                llama_sample_grammar(ctx, &candidates_p, grammar);
-            }
-
-            if (temp <= 0)
-            {
-                // Greedy sampling
-                result.tok = llama_sample_token_greedy(ctx, &candidates_p);
-                if (n_probs > 0)
+            const int32_t n_probs = params.n_probs;
+            if (params.temp <= 0 && n_probs > 0)
             {
+                // For llama_sample_token_greedy we need to sort candidates
                 llama_sample_softmax(ctx, &candidates_p);
             }
-            }
-            else
-            {
-                if (mirostat == 1)
-                {
-                    static float mirostat_mu = 2.0f * mirostat_tau;
-                    const int mirostat_m = 100;
-                    llama_sample_temp(ctx, &candidates_p, temp);
-                    result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-                }
-                else if (mirostat == 2)
-                {
-                    static float mirostat_mu = 2.0f * mirostat_tau;
-                    llama_sample_temp(ctx, &candidates_p, temp);
-                    result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-                }
-                else
-                {
-                    // Temperature sampling
-                    size_t min_keep = std::max(1, n_probs);
-                    llama_sample_top_k(ctx, &candidates_p, top_k, min_keep);
-                    llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep);
-                    llama_sample_typical(ctx, &candidates_p, typical_p, min_keep);
-                    llama_sample_top_p(ctx, &candidates_p, top_p, min_keep);
-                    llama_sample_temp(ctx, &candidates_p, temp);
-                    result.tok = llama_sample_token(ctx, &candidates_p);
-                }
-            }
-
-            if (grammar != nullptr) {
-                llama_grammar_accept_token(ctx, grammar, result.tok);
-            }

             for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
             {
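With this hunk the server stops duplicating the sampling pipeline and delegates to the shared llama_sample_token helper from common (the same function whose min_keep handling changed in the first hunk). For orientation, its declaration in common at this revision looks roughly like the following; treat the exact parameter list as an assumption to verify against the tree:

    // approximate declaration (common/common.h); ctx_guidance and grammar may be NULL,
    // matching the server call above
    llama_token llama_sample_token(
            struct llama_context * ctx,
            struct llama_context * ctx_guidance,
            struct llama_grammar * grammar,
            const struct gpt_params & params,
            const std::vector<llama_token> & last_tokens,
            std::vector<llama_token_data> & candidates,
            int idx = 0);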
llama.cpp (91 changed lines)
@@ -126,6 +126,27 @@ static void replace_all(std::string & s, const std::string & search, const std::
     }
     s = std::move(result);
 }

+static bool is_float_close(float a, float b, float abs_tol) {
+    // Check for non-negative tolerance
+    if (abs_tol < 0.0) {
+        throw std::invalid_argument("Tolerance must be non-negative");
+    }
+
+    // Exact equality check
+    if (a == b) {
+        return true;
+    }
+
+    // Check for infinities
+    if (std::isinf(a) || std::isinf(b)) {
+        return false;
+    }
+
+    // Regular comparison using the provided absolute tolerance
+    return std::fabs(b - a) <= abs_tol;
+}
+
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
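A quick illustration of why is_float_close is preferred over raw bit comparison for the hparams checks in the next hunk (the values are only for the example):

    // illustrative only: equal floats can still differ bit-wise
    float a = 0.0f, b = -0.0f;
    memcmp(&a, &b, sizeof(float)) == 0;   // false: bit patterns 0x00000000 vs 0x80000000
    is_float_close(a, b, 1e-9f);          // true: the a == b fast path fires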
@@ -974,7 +995,24 @@ struct llama_hparams {
     float rope_freq_scale_train;

     bool operator!=(const llama_hparams & other) const {
-        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
+        if (this->vocab_only  != other.vocab_only)  return true;
+        if (this->n_vocab     != other.n_vocab)     return true;
+        if (this->n_ctx_train != other.n_ctx_train) return true;
+        if (this->n_embd      != other.n_embd)      return true;
+        if (this->n_head      != other.n_head)      return true;
+        if (this->n_head_kv   != other.n_head_kv)   return true;
+        if (this->n_layer     != other.n_layer)     return true;
+        if (this->n_rot       != other.n_rot)       return true;
+        if (this->n_ff        != other.n_ff)        return true;
+
+        const float EPSILON = 1e-9;
+
+        if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+
+        return false;
     }

     uint32_t n_gqa() const {
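The field-wise operator!= above also sidesteps a second problem with memcmp on a struct: padding bytes take part in the comparison even though no field differs. A generic illustration (layout and padding are implementation-defined, so this is an assumption about a typical ABI):

    // illustrative only: 'flag' is typically followed by padding before 'n'
    struct S { bool flag; uint32_t n; };
    S x, y;                                  // padding bytes left uninitialized
    x.flag = y.flag = true; x.n = y.n = 32;
    // memcmp(&x, &y, sizeof(S)) may be nonzero even though every field matches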
@@ -1049,6 +1087,9 @@ struct llama_kv_cell {
 struct llama_kv_cache {
     bool has_shift = false;

+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_internal also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
     uint32_t head = 0;
     uint32_t size = 0;

@@ -1306,6 +1347,8 @@ static bool llama_kv_cache_init(

 // find an empty slot of size "n_tokens" in the cache
 // updates the cache head
+// Note: On success, it's important that cache.head points
+// to the first cell of the slot.
 static bool llama_kv_cache_find_slot(
            struct llama_kv_cache & cache,
         const struct llama_batch & batch) {
@@ -1321,8 +1364,8 @@ static bool llama_kv_cache_find_slot(

     while (true) {
         if (cache.head + n_tokens > n_ctx) {
-            cache.head = 0;
             n_tested += n_ctx - cache.head;
+            cache.head = 0;
             continue;
         }

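The swap above matters because n_tested has to grow by the number of cells actually skipped at the end of the buffer, which depends on the head value before it is reset; with illustrative numbers:

    // illustrative only: n_ctx = 512, cache.head = 500, n_tokens = 32
    // old order: cache.head = 0 first, then n_tested += 512 - 0    -> over-counts by 500
    // new order: n_tested += 512 - 500 (= 12), then cache.head = 0 -> counts only the skipped tail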
@@ -1373,6 +1416,9 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
         cache.cells[i].pos = -1;
         cache.cells[i].seq_id.clear();
     }
+
+    // Searching for a free slot can start here since we know it will be empty.
+    cache.head = uint32_t(c0);
 }

 static void llama_kv_cache_seq_rm(
@@ -1380,6 +1426,8 @@ static void llama_kv_cache_seq_rm(
                  llama_seq_id   seq_id,
                     llama_pos   p0,
                     llama_pos   p1) {
+    uint32_t new_head = cache.size;
+
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

@@ -1388,9 +1436,13 @@ static void llama_kv_cache_seq_rm(
             cache.cells[i].seq_id.erase(seq_id);
             if (cache.cells[i].seq_id.empty()) {
                 cache.cells[i].pos = -1;
+                if (new_head == cache.size) new_head = i;
             }
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != cache.size) cache.head = new_head;
 }

 static void llama_kv_cache_seq_cp(
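The new_head bookkeeping introduced here repeats in seq_keep and seq_shift below: remember the first cell freed during the pass and point cache.head at it, so the next llama_kv_cache_find_slot call starts searching there. A compressed sketch of the idiom (should_free is a hypothetical placeholder, not a real function):

    uint32_t new_head = cache.size;                     // sentinel: nothing freed yet
    for (uint32_t i = 0; i < cache.size; ++i) {
        if (should_free(cache.cells[i])) {              // hypothetical predicate
            cache.cells[i].pos = -1;
            cache.cells[i].seq_id.clear();
            if (new_head == cache.size) new_head = i;   // remember the first freed cell
        }
    }
    if (new_head != cache.size) cache.head = new_head;  // next slot search starts here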
@@ -1402,6 +1454,8 @@ static void llama_kv_cache_seq_cp(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

+    cache.head = 0;
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.cells[i].seq_id.insert(seq_id_dst);
@@ -1410,12 +1464,18 @@ static void llama_kv_cache_seq_cp(
 }

 static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+    uint32_t new_head = cache.size;
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (!cache.cells[i].has_seq_id(seq_id)) {
             cache.cells[i].pos = -1;
             cache.cells[i].seq_id.clear();
+            if (new_head == cache.size) new_head = i;
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != cache.size) cache.head = new_head;
 }

 static void llama_kv_cache_seq_shift(
@@ -1424,6 +1484,8 @@ static void llama_kv_cache_seq_shift(
                     llama_pos   p0,
                     llama_pos   p1,
                     llama_pos   delta) {
+    uint32_t new_head = cache.size;
+
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

@@ -1433,12 +1495,17 @@ static void llama_kv_cache_seq_shift(
             if (cache.cells[i].pos < 0) {
                 cache.cells[i].pos = -1;
                 cache.cells[i].seq_id.clear();
+                if (new_head == cache.size) new_head = i;
             } else {
                 cache.has_shift = true;
                 cache.cells[i].delta = delta;
             }
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    // Otherwise we just start the next search from the beginning.
+    cache.head = new_head != cache.size ? new_head : 0;
 }

 //
@@ -4460,10 +4527,6 @@ static int llama_decode_internal(
         batch.seq_id = seq_id.data();
     }

-    // we always start to search for a free slot from the start of the cache
-    // TODO: better strategies can be implemented
-    kv_self.head = 0;
-
     if (!llama_kv_cache_find_slot(kv_self, batch)) {
         return 1;
     }
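Dropping the unconditional kv_self.head = 0 means llama_kv_cache_find_slot now resumes from wherever head was last left (by the cache operations above or by the previous decode), turning the slot search into a ring-buffer style scan:

    // illustrative only: with size = 1024 and ~1000 cells already occupied at the front,
    // starting at head (just past the last allocation) finds a slot in roughly O(n_tokens)
    // instead of rescanning the ~1000 occupied cells from index 0 on every decode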
@@ -4549,8 +4612,12 @@ static int llama_decode_internal(
 #endif

     // update the kv ring buffer
-    lctx.kv_self.head     += n_tokens;
     lctx.kv_self.has_shift = false;
+    lctx.kv_self.head += n_tokens;
+    // Ensure kv cache head points to a valid index.
+    if (lctx.kv_self.head >= lctx.kv_self.size) {
+        lctx.kv_self.head = 0;
+    }

 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
@@ -8190,14 +8257,14 @@ void llama_print_timings(struct llama_context * ctx) {
     const llama_timings timings = llama_get_timings(ctx);

     LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s:        load time = %8.2f ms\n", __func__, timings.t_load_ms);
-    LLAMA_LOG_INFO("%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
+    LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-    LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s:       total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+    LLAMA_LOG_INFO("%s:       total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }

 void llama_reset_timings(struct llama_context * ctx) {
prompts/LLM-questions.txt (new file, 49 lines)
@@ -0,0 +1,49 @@
+In the context of LLMs, what is "Attention"?
+In the context of LLMs, what is a completion?
+In the context of LLMs, what is a prompt?
+In the context of LLMs, what is GELU?
+In the context of LLMs, what is RELU?
+In the context of LLMs, what is softmax?
+In the context of LLMs, what is decoding?
+In the context of LLMs, what is encoding?
+In the context of LLMs, what is tokenizing?
+In the context of LLMs, what is an embedding?
+In the context of LLMs, what is quantization?
+In the context of LLMs, what is a tensor?
+In the context of LLMs, what is a sparse tensor?
+In the context of LLMs, what is a vector?
+In the context of LLMs, how is attention implemented?
+In the context of LLMs, why is attention all you need?
+In the context of LLMs, what is "RoPe" and what is it used for?
+In the context of LLMs, what is "LoRA" and what is it used for?
+In the context of LLMs, what are weights?
+In the context of LLMs, what are biases?
+In the context of LLMs, what are checkpoints?
+In the context of LLMs, what is "perplexity"?
+In the context of LLMs, what are models?
+In the context of machine-learning, what is "catastrophic forgetting"?
+In the context of machine-learning, what is "elastic weight consolidation (EWC)"?
+In the context of neural nets, what is a hidden layer?
+In the context of neural nets, what is a convolution?
+In the context of neural nets, what is dropout?
+In the context of neural nets, what is cross-entropy?
+In the context of neural nets, what is over-fitting?
+In the context of neural nets, what is under-fitting?
+What is the difference between an interpreted computer language and a compiled computer language?
+In the context of software development, what is a debugger?
+When processing using a GPU, what is off-loading?
+When processing using a GPU, what is a batch?
+When processing using a GPU, what is a block?
+When processing using a GPU, what is the difference between a batch and a block?
+When processing using a GPU, what is a scratch tensor?
+When processing using a GPU, what is a layer?
+When processing using a GPU, what is a cache?
+When processing using a GPU, what is unified memory?
+When processing using a GPU, what is VRAM?
+When processing using a GPU, what is a kernel?
+When processing using a GPU, what is "metal"?
+In the context of LLMs, what are "Zero-Shot", "One-Shot" and "Few-Shot" learning models?
+In the context of LLMs, what is the "Transformer-model" architecture?
+In the context of LLMs, what is "Multi-Head Attention"?
+In the context of LLMs, what is "Self-Attention"?
+In the context of transformer-model architectures, how do attention mechanisms use masks?
prompts/parallel-questions.txt (new file, 43 lines)
@@ -0,0 +1,43 @@
+What do you know about Hobbits?
+What is quantum field theory?
+Why did the chicken cross the road?
+Who is the president of the United States?
+How do I run CMake on MacOS?
+Do you agree that C++ is a really finicky language compared with Python3?
+Is it a good idea to invest in technology?
+Do you like Wagner's Ring?
+Do you think this file input option is really neat?
+What should we all do about climate change?
+Is time-travel possible within the laws of current physics?
+Is it like anything to be a bat?
+Once the chicken has crossed the road, does it try to go back?
+Who is the greatest of all musical composers?
+What is art?
+Is there life elsewhere in the universe?
+What is intelligence?
+What is the difference between knowledge and intelligence?
+Will religion ever die?
+Do we understand ourselves?
+What is the best way to cook eggs?
+If you cannot see things, on what basis do you evaluate them?
+Explain the role of the np junction in photovoltaic cells?
+Is professional sport a good or bad influence on human behaviour?
+Is capital punishment immoral?
+Should we care about other people?
+Who are you?
+Which sense would you surrender if you could?
+Was Henry Ford a hero or a villain?
+Do we need leaders?
+What is nucleosynthesis?
+Who is the greatest scientist of all time?
+Who first observed what came to be known as the photovoltaic effect?
+What is nuclear fusion and why does it release energy?
+Can you know that you exist?
+What is an exoplanet?
+Do you like cream?
+What is the difference?
+Can I know that I exist while I'm dreaming that I'm Descartes?
+Who said "I didn't know I thought that until I heard myself saying it"?
+Does anything really matter?
+Can you explain the unreasonable effectiveness of mathematics?