Merge branch 'master' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml

Commit d8f7a7077a: 8 changed files with 242 additions and 111 deletions
@@ -167,6 +167,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
+            // store the external file name in params
+            params.prompt_file = argv[i];
             std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
             if (params.prompt.back() == '\n') {
                 params.prompt.pop_back();
@@ -1020,10 +1022,11 @@ llama_token llama_sample_token(
             id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
         } else {
             // Temperature sampling
-            llama_sample_top_k      (ctx, &cur_p, top_k,     1);
-            llama_sample_tail_free  (ctx, &cur_p, tfs_z,     1);
-            llama_sample_typical    (ctx, &cur_p, typical_p, 1);
-            llama_sample_top_p      (ctx, &cur_p, top_p,     1);
+            size_t min_keep = std::max(1, params.n_probs);
+            llama_sample_top_k      (ctx, &cur_p, top_k,     min_keep);
+            llama_sample_tail_free  (ctx, &cur_p, tfs_z,     min_keep);
+            llama_sample_typical    (ctx, &cur_p, typical_p, min_keep);
+            llama_sample_top_p      (ctx, &cur_p, top_p,     min_keep);
             llama_sample_temp(ctx, &cur_p, temp);

             {
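Note (not part of the diff): switching the last argument of the truncating samplers from a hard-coded 1 to min_keep matters when a caller asks for per-token probabilities (params.n_probs); each of top-k, tail-free, typical and top-p is now told to keep at least that many candidates, so the list is not cut down to a single entry before the probabilities can be reported. A minimal standalone sketch of that clamping idea, assuming min_keep simply acts as a lower bound on how many candidates survive (the helper below is illustrative, not the llama.cpp implementation):

#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

// Illustrative stand-in for a truncating sampler: keep the k best-scoring
// candidates, but never fewer than min_keep of them.
static void keep_top_k(std::vector<float> & probs, int k, size_t min_keep) {
    const size_t keep = std::max<size_t>(std::max(k, 1), min_keep);
    std::sort(probs.begin(), probs.end(), std::greater<float>());
    if (probs.size() > keep) {
        probs.resize(keep);
    }
}

int main() {
    std::vector<float> probs = {0.40f, 0.25f, 0.15f, 0.12f, 0.08f};
    // With n_probs = 4 the caller wants at least 4 entries back, even though
    // top_k = 1 on its own would keep a single candidate.
    keep_top_k(probs, /*k=*/1, /*min_keep=*/4);
    std::printf("kept %zu candidates\n", probs.size()); // prints: kept 4 candidates
    return 0;
}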
@@ -79,6 +79,7 @@ struct gpt_params {
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string prompt = "";
+    std::string prompt_file = ""; // store the external prompt file name
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
     std::string input_prefix = ""; // string to prefix user inputs with
     std::string input_suffix = ""; // string to suffix user inputs with
@@ -2,7 +2,7 @@

 This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.

-The jeopardy test can be used to compare the fact knowledge of different models and compare them to eachother. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
+The jeopardy test can be used to compare the fact knowledge of different models and compare them to each other. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.


 Step 1: Open jeopardy.sh and modify the following:
@@ -10,6 +10,7 @@
 #include <cstdio>
 #include <string>
 #include <vector>
+#include <ctime>

 // trim whitespace from the beginning and end of a string
 static std::string trim(const std::string & str) {
@@ -70,6 +71,26 @@ struct client {
     std::vector<llama_token> tokens_prev;
 };

+static void print_date_time() {
+    std::time_t current_time = std::time(nullptr);
+    std::tm* local_time = std::localtime(&current_time);
+    char buffer[80];
+    strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
+
+    printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
+}
+
+// Define a split string function to ...
+static std::vector<std::string> split_string(const std::string& input, char delimiter) {
+    std::vector<std::string> tokens;
+    std::istringstream stream(input);
+    std::string token;
+    while (std::getline(stream, token, delimiter)) {
+        tokens.push_back(token);
+    }
+    return tokens;
+}
+
 int main(int argc, char ** argv) {
     srand(1234);

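Note (not part of the diff): a tiny standalone check of the split_string helper added above, splitting a multi-line prompt blob on '\n' the same way the example later splits params.prompt; the sample strings are taken from prompts/parallel-questions.txt:

#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

// Same splitting logic as the helper introduced in this change.
static std::vector<std::string> split_string(const std::string & input, char delimiter) {
    std::vector<std::string> tokens;
    std::istringstream stream(input);
    std::string token;
    while (std::getline(stream, token, delimiter)) {
        tokens.push_back(token);
    }
    return tokens;
}

int main() {
    const std::string prompt = "What is art?\nWho are you?\nDo we need leaders?";
    const std::vector<std::string> prompts = split_string(prompt, '\n');
    for (size_t i = 0; i < prompts.size(); ++i) {
        std::printf("%3zu prompt: %s\n", i + 1, prompts[i].c_str());
    }
    return 0;
}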
@@ -104,6 +125,23 @@ int main(int argc, char ** argv) {
     params.logits_all = true;
     std::tie(model, ctx) = llama_init_from_gpt_params(params);

+    // load the prompts from an external file if there are any
+    if (params.prompt.empty()) {
+        printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
+    } else {
+        // Output each line of the input params.prompts vector and copy to k_prompts
+        int index = 0;
+        printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
+
+        std::vector<std::string> prompts = split_string(params.prompt, '\n');
+        for (const auto& prompt : prompts) {
+            k_prompts.resize(index + 1);
+            k_prompts[index] = prompt;
+            index++;
+            printf("%3d prompt: %s\n", index, prompt.c_str());
+        }
+    }
+
     fprintf(stderr, "\n\n");
     fflush(stderr);
@@ -233,7 +271,7 @@ int main(int argc, char ** argv) {
             client.n_decoded = 0;
             client.i_batch = batch.n_tokens - 1;

-            LOG_TEE("\033[1mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
+            LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);

             g_seq_id += 1;
@@ -336,8 +374,8 @@ int main(int argc, char ** argv) {

                     const auto t_main_end = ggml_time_us();

-                    LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\nResponse: %s\n\n",
-                            client.id, client.seq_id, client.n_prompt, client.n_decoded,
+                    LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
+                            client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
                             (t_main_end - client.t_start_prompt) / 1e6,
                             (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
                             n_cache_miss,
@@ -357,13 +395,21 @@ int main(int argc, char ** argv) {

     const auto t_main_end = ggml_time_us();

-    LOG_TEE("\n\n");
+    print_date_time();

+    LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
+    if (params.prompt_file.empty()) {
+        params.prompt_file = "used built-in defaults";
+    }
+    LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
+    LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());

     LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
     LOG_TEE("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
     LOG_TEE("Total speed (AVG):   %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
     LOG_TEE("Cache misses:        %6d\n", n_cache_miss);

-    LOG_TEE("\n\n");
+    LOG_TEE("\n");

     llama_print_timings(ctx);
@@ -534,98 +534,20 @@ struct llama_server_context
             return result;
         }

-        // out of user input, sample next token
-        const float temp = params.temp;
-        const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(model) : params.top_k;
-        const float top_p = params.top_p;
-        const float tfs_z = params.tfs_z;
-        const float typical_p = params.typical_p;
-        const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
-        const float repeat_penalty = params.repeat_penalty;
-        const float alpha_presence = params.presence_penalty;
-        const float alpha_frequency = params.frequency_penalty;
-        const int mirostat = params.mirostat;
-        const float mirostat_tau = params.mirostat_tau;
-        const float mirostat_eta = params.mirostat_eta;
-        const bool penalize_nl = params.penalize_nl;
-        const int32_t n_probs = params.n_probs;
-
         {
-            auto *logits = llama_get_logits(ctx);
-            auto n_vocab = llama_n_vocab(model);
-
-            // Apply params.logit_bias map
-            for (const auto &it : params.logit_bias)
-            {
-                logits[it.first] += it.second;
-            }
-
+            // out of user input, sample next token
             std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++)
-            {
-                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-            }
+            candidates.reserve(llama_n_vocab(model));

-            llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
+            result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates);

-            // Apply penalties
-            float nl_logit = logits[llama_token_nl(ctx)];
-            auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
-            llama_sample_repetition_penalty(ctx, &candidates_p,
-                last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                last_n_repeat, repeat_penalty);
-            llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
-                last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                last_n_repeat, alpha_frequency, alpha_presence);
-            if (!penalize_nl)
-            {
-                logits[llama_token_nl(ctx)] = nl_logit;
-            }
+            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

-            if (grammar != nullptr) {
-                llama_sample_grammar(ctx, &candidates_p, grammar);
-            }
-
-            if (temp <= 0)
+            const int32_t n_probs = params.n_probs;
+            if (params.temp <= 0 && n_probs > 0)
             {
-                // Greedy sampling
-                result.tok = llama_sample_token_greedy(ctx, &candidates_p);
-                if (n_probs > 0)
-                {
-                    llama_sample_softmax(ctx, &candidates_p);
-                }
-            }
-            else
-            {
-                if (mirostat == 1)
-                {
-                    static float mirostat_mu = 2.0f * mirostat_tau;
-                    const int mirostat_m = 100;
-                    llama_sample_temp(ctx, &candidates_p, temp);
-                    result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-                }
-                else if (mirostat == 2)
-                {
-                    static float mirostat_mu = 2.0f * mirostat_tau;
-                    llama_sample_temp(ctx, &candidates_p, temp);
-                    result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-                }
-                else
-                {
-                    // Temperature sampling
-                    size_t min_keep = std::max(1, n_probs);
-                    llama_sample_top_k(ctx, &candidates_p, top_k, min_keep);
-                    llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep);
-                    llama_sample_typical(ctx, &candidates_p, typical_p, min_keep);
-                    llama_sample_top_p(ctx, &candidates_p, top_p, min_keep);
-                    llama_sample_temp(ctx, &candidates_p, temp);
-                    result.tok = llama_sample_token(ctx, &candidates_p);
-                }
-            }
-
-            if (grammar != nullptr) {
-                llama_grammar_accept_token(ctx, grammar, result.tok);
+                // For llama_sample_token_greedy we need to sort candidates
+                llama_sample_softmax(ctx, &candidates_p);
             }

             for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
llama.cpp (91 changed lines)
@@ -126,6 +126,27 @@ static void replace_all(std::string & s, const std::string & search, const std::
     }
     s = std::move(result);
 }

+static bool is_float_close(float a, float b, float abs_tol) {
+    // Check for non-negative tolerance
+    if (abs_tol < 0.0) {
+        throw std::invalid_argument("Tolerance must be non-negative");
+    }
+
+    // Exact equality check
+    if (a == b) {
+        return true;
+    }
+
+    // Check for infinities
+    if (std::isinf(a) || std::isinf(b)) {
+        return false;
+    }
+
+    // Regular comparison using the provided absolute tolerance
+    return std::fabs(b - a) <= abs_tol;
+}
+
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
@@ -974,7 +995,24 @@ struct llama_hparams {
     float rope_freq_scale_train;

     bool operator!=(const llama_hparams & other) const {
-        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
+        if (this->vocab_only != other.vocab_only) return true;
+        if (this->n_vocab != other.n_vocab) return true;
+        if (this->n_ctx_train != other.n_ctx_train) return true;
+        if (this->n_embd != other.n_embd) return true;
+        if (this->n_head != other.n_head) return true;
+        if (this->n_head_kv != other.n_head_kv) return true;
+        if (this->n_layer != other.n_layer) return true;
+        if (this->n_rot != other.n_rot) return true;
+        if (this->n_ff != other.n_ff) return true;
+
+        const float EPSILON = 1e-9;
+
+        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+
+        return false;
     }

     uint32_t n_gqa() const {
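Note (not part of the diff): the field-by-field operator!= above replaces a raw memcmp of the whole struct, which is too strict for floating-point members and can also trip over struct padding. A short illustrative check of the tolerance-based helper; the input values are made up for the example:

#include <cmath>
#include <cstdio>
#include <stdexcept>

// Same tolerance-based comparison as the is_float_close helper added in this change.
static bool is_float_close(float a, float b, float abs_tol) {
    if (abs_tol < 0.0) {
        throw std::invalid_argument("Tolerance must be non-negative");
    }
    if (a == b) {
        return true;  // also covers matching infinities
    }
    if (std::isinf(a) || std::isinf(b)) {
        return false;
    }
    return std::fabs(b - a) <= abs_tol;
}

int main() {
    const float EPSILON = 1e-9f;  // same tolerance the hparams comparison uses
    std::printf("%d\n", is_float_close(1.0e-10f, 3.0e-10f, EPSILON)); // 1: within tolerance
    std::printf("%d\n", is_float_close(1.0f, 1.5f, EPSILON));         // 0: too far apart
    std::printf("%d\n", is_float_close(INFINITY, INFINITY, EPSILON)); // 1: exactly equal
    return 0;
}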
@@ -1049,6 +1087,9 @@ struct llama_kv_cell {
 struct llama_kv_cache {
     bool has_shift = false;

+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_internal also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
     uint32_t head = 0;
     uint32_t size = 0;

@@ -1306,6 +1347,8 @@ static bool llama_kv_cache_init(

 // find an empty slot of size "n_tokens" in the cache
 // updates the cache head
+// Note: On success, it's important that cache.head points
+// to the first cell of the slot.
 static bool llama_kv_cache_find_slot(
            struct llama_kv_cache & cache,
         const struct llama_batch & batch) {
@@ -1321,8 +1364,8 @@ static bool llama_kv_cache_find_slot(

     while (true) {
         if (cache.head + n_tokens > n_ctx) {
+            n_tested += n_ctx - cache.head;
             cache.head = 0;
-            n_tested += n_ctx - cache.head;
             continue;
         }

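Note (not part of the diff): the reordering above fixes the n_tested accounting when the slot search wraps; the number of skipped cells has to be computed from the old head before head is reset to zero, otherwise each wrap adds a full n_ctx and the loop can give up too early. A minimal standalone simulation of the wrap logic with a toy occupancy array (illustrative only, not the real cache structure):

#include <cstdint>
#include <cstdio>
#include <vector>

// Toy slot search: find n_tokens consecutive free cells in a ring of cells,
// starting at head, while counting how many cells have been tested.
static bool find_slot(const std::vector<bool> & used, uint32_t & head, uint32_t n_tokens) {
    const uint32_t n_ctx = (uint32_t) used.size();
    if (n_tokens > n_ctx) {
        return false;
    }
    uint32_t n_tested = 0;

    while (true) {
        if (head + n_tokens > n_ctx) {
            n_tested += n_ctx - head; // count only the cells skipped at the end ...
            head = 0;                 // ... then wrap the search to the start
            continue;
        }

        bool found = true;
        for (uint32_t i = 0; i < n_tokens; ++i) {
            if (used[head + i]) {
                found    = false;
                head     += i + 1;
                n_tested += i + 1;
                break;
            }
        }

        if (found) {
            return true;
        }
        if (n_tested >= n_ctx) {
            return false; // searched the whole ring without finding a slot
        }
    }
}

int main() {
    std::vector<bool> used(8, false);
    used[0] = used[1] = true;  // cells 0 and 1 are occupied
    uint32_t head = 6;         // the search starts near the end of the ring
    const bool ok = find_slot(used, head, 4);
    std::printf("found=%d head=%u\n", ok, head); // prints: found=1 head=2
    return 0;
}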
@@ -1373,6 +1416,9 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
         cache.cells[i].pos = -1;
         cache.cells[i].seq_id.clear();
     }
+
+    // Searching for a free slot can start here since we know it will be empty.
+    cache.head = uint32_t(c0);
 }

 static void llama_kv_cache_seq_rm(
@@ -1380,6 +1426,8 @@ static void llama_kv_cache_seq_rm(
         llama_seq_id seq_id,
         llama_pos p0,
         llama_pos p1) {
+    uint32_t new_head = cache.size;
+
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

@@ -1388,9 +1436,13 @@ static void llama_kv_cache_seq_rm(
             cache.cells[i].seq_id.erase(seq_id);
             if (cache.cells[i].seq_id.empty()) {
                 cache.cells[i].pos = -1;
+                if (new_head == cache.size) new_head = i;
             }
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != cache.size) cache.head = new_head;
 }

 static void llama_kv_cache_seq_cp(
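Note (not part of the diff): the new_head bookkeeping added here, and repeated in the seq_keep and seq_shift hunks below, remembers the first cell freed during the sweep so the next slot search can start there instead of wherever head happened to be. A compact sketch of the pattern over a toy cell array (illustrative only):

#include <cstdint>
#include <cstdio>
#include <vector>

struct toy_cell {
    int32_t pos    = -1; // -1 means the cell is free
    int32_t seq_id = -1;
};

// Free every cell belonging to seq_id and point head at the first freed cell,
// mirroring the new_head bookkeeping in the KV-cache sequence operations.
static void seq_rm(std::vector<toy_cell> & cells, uint32_t & head, int32_t seq_id) {
    const uint32_t size = (uint32_t) cells.size();
    uint32_t new_head = size; // sentinel: "nothing freed yet"

    for (uint32_t i = 0; i < size; ++i) {
        if (cells[i].seq_id == seq_id) {
            cells[i].pos    = -1;
            cells[i].seq_id = -1;
            if (new_head == size) new_head = i; // remember the first freed slot
        }
    }

    // If we freed up a slot, set head to it so searching can start there.
    if (new_head != size) head = new_head;
}

int main() {
    std::vector<toy_cell> cells(6);
    for (uint32_t i = 0; i < cells.size(); ++i) {
        cells[i].pos    = (int32_t) i;
        cells[i].seq_id = (i < 2) ? 7 : 3; // cells 0-1 belong to sequence 7
    }
    uint32_t head = 5;
    seq_rm(cells, head, /*seq_id=*/7);
    std::printf("head=%u\n", head); // head=0: the next search starts at the freed cells
    return 0;
}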
@@ -1402,6 +1454,8 @@ static void llama_kv_cache_seq_cp(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

+    cache.head = 0;
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.cells[i].seq_id.insert(seq_id_dst);
@@ -1410,12 +1464,18 @@ static void llama_kv_cache_seq_cp(
 }

 static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+    uint32_t new_head = cache.size;
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (!cache.cells[i].has_seq_id(seq_id)) {
             cache.cells[i].pos = -1;
             cache.cells[i].seq_id.clear();
+            if (new_head == cache.size) new_head = i;
         }
     }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != cache.size) cache.head = new_head;
 }

 static void llama_kv_cache_seq_shift(
@@ -1424,6 +1484,8 @@ static void llama_kv_cache_seq_shift(
         llama_pos p0,
         llama_pos p1,
         llama_pos delta) {
+    uint32_t new_head = cache.size;
+
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

|
||||||
if (cache.cells[i].pos < 0) {
|
if (cache.cells[i].pos < 0) {
|
||||||
cache.cells[i].pos = -1;
|
cache.cells[i].pos = -1;
|
||||||
cache.cells[i].seq_id.clear();
|
cache.cells[i].seq_id.clear();
|
||||||
|
if (new_head == cache.size) new_head = i;
|
||||||
} else {
|
} else {
|
||||||
cache.has_shift = true;
|
cache.has_shift = true;
|
||||||
cache.cells[i].delta = delta;
|
cache.cells[i].delta = delta;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If we freed up a slot, set head to it so searching can start there.
|
||||||
|
// Otherwise we just start the next search from the beginning.
|
||||||
|
cache.head = new_head != cache.size ? new_head : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@@ -4460,10 +4527,6 @@ static int llama_decode_internal(
         batch.seq_id = seq_id.data();
     }

-    // we always start to search for a free slot from the start of the cache
-    // TODO: better strategies can be implemented
-    kv_self.head = 0;
-
     if (!llama_kv_cache_find_slot(kv_self, batch)) {
         return 1;
     }
@@ -4549,8 +4612,12 @@ static int llama_decode_internal(
 #endif

     // update the kv ring buffer
-    lctx.kv_self.head += n_tokens;
     lctx.kv_self.has_shift = false;
+    lctx.kv_self.head += n_tokens;
+    // Ensure kv cache head points to a valid index.
+    if (lctx.kv_self.head >= lctx.kv_self.size) {
+        lctx.kv_self.head = 0;
+    }

 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
@@ -8190,14 +8257,14 @@ void llama_print_timings(struct llama_context * ctx) {
     const llama_timings timings = llama_get_timings(ctx);

     LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
-    LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
+    LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-    LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+    LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }

 void llama_reset_timings(struct llama_context * ctx) {
prompts/LLM-questions.txt (new file, 49 lines)
@@ -0,0 +1,49 @@
+In the context of LLMs, what is "Attention"?
+In the context of LLMs, what is a completion?
+In the context of LLMs, what is a prompt?
+In the context of LLMs, what is GELU?
+In the context of LLMs, what is RELU?
+In the context of LLMs, what is softmax?
+In the context of LLMs, what is decoding?
+In the context of LLMs, what is encoding?
+In the context of LLMs, what is tokenizing?
+In the context of LLMs, what is an embedding?
+In the context of LLMs, what is quantization?
+In the context of LLMs, what is a tensor?
+In the context of LLMs, what is a sparse tensor?
+In the context of LLMs, what is a vector?
+In the context of LLMs, how is attention implemented?
+In the context of LLMs, why is attention all you need?
+In the context of LLMs, what is "RoPe" and what is it used for?
+In the context of LLMs, what is "LoRA" and what is it used for?
+In the context of LLMs, what are weights?
+In the context of LLMs, what are biases?
+In the context of LLMs, what are checkpoints?
+In the context of LLMs, what is "perplexity"?
+In the context of LLMs, what are models?
+In the context of machine-learning, what is "catastrophic forgetting"?
+In the context of machine-learning, what is "elastic weight consolidation (EWC)"?
+In the context of neural nets, what is a hidden layer?
+In the context of neural nets, what is a convolution?
+In the context of neural nets, what is dropout?
+In the context of neural nets, what is cross-entropy?
+In the context of neural nets, what is over-fitting?
+In the context of neural nets, what is under-fitting?
+What is the difference between an interpreted computer language and a compiled computer language?
+In the context of software development, what is a debugger?
+When processing using a GPU, what is off-loading?
+When processing using a GPU, what is a batch?
+When processing using a GPU, what is a block?
+When processing using a GPU, what is the difference between a batch and a block?
+When processing using a GPU, what is a scratch tensor?
+When processing using a GPU, what is a layer?
+When processing using a GPU, what is a cache?
+When processing using a GPU, what is unified memory?
+When processing using a GPU, what is VRAM?
+When processing using a GPU, what is a kernel?
+When processing using a GPU, what is "metal"?
+In the context of LLMs, what are "Zero-Shot", "One-Shot" and "Few-Shot" learning models?
+In the context of LLMs, what is the "Transformer-model" architecture?
+In the context of LLMs, what is "Multi-Head Attention"?
+In the context of LLMs, what is "Self-Attention"?
+In the context of transformer-model architectures, how do attention mechanisms use masks?
prompts/parallel-questions.txt (new file, 43 lines)
@@ -0,0 +1,43 @@
+What do you know about Hobbits?
+What is quantum field theory?
+Why did the chicken cross the road?
+Who is the president of the United States?
+How do I run CMake on MacOS?
+Do you agree that C++ is a really finicky language compared with Python3?
+Is it a good idea to invest in technology?
+Do you like Wagner's Ring?
+Do you think this file input option is really neat?
+What should we all do about climate change?
+Is time-travel possible within the laws of current physics?
+Is it like anything to be a bat?
+Once the chicken has crossed the road, does it try to go back?
+Who is the greatest of all musical composers?
+What is art?
+Is there life elsewhere in the universe?
+What is intelligence?
+What is the difference between knowledge and intelligence?
+Will religion ever die?
+Do we understand ourselves?
+What is the best way to cook eggs?
+If you cannot see things, on what basis do you evaluate them?
+Explain the role of the np junction in photovoltaic cells?
+Is professional sport a good or bad influence on human behaviour?
+Is capital punishment immoral?
+Should we care about other people?
+Who are you?
+Which sense would you surrender if you could?
+Was Henry Ford a hero or a villain?
+Do we need leaders?
+What is nucleosynthesis?
+Who is the greatest scientist of all time?
+Who first observed what came to be known as the photovoltaic effect?
+What is nuclear fusion and why does it release energy?
+Can you know that you exist?
+What is an exoplanet?
+Do you like cream?
+What is the difference?
+Can I know that I exist while I'm dreaming that I'm Descartes?
+Who said "I didn't know I thought that until I heard myself saying it"?
+Does anything really matter?
+Can you explain the unreasonable effectiveness of mathematics?