Add space around * pointers and & references.
commit 9bedaf4c71 (parent abe0829984)
4 changed files with 34 additions and 35 deletions
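
The convention applied throughout: a space on both sides of * and & in declarations, so the pointer or reference symbol is set off from both the type and the name. A before/after illustration taken from the hunks below:

    // before
    llama_context* ctx;
    llama_beam_view& beam_view = beams_state.beam_views[i];

    // after
    llama_context * ctx;
    llama_beam_view & beam_view = beams_state.beam_views[i];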
@@ -29,10 +29,10 @@
 // Used for debugging to print out beam tokens.
 struct ostream_beam_view {
-    llama_context* ctx;
+    llama_context * ctx;
     llama_beam_view beam_view;
 };
-std::ostream& operator<<(std::ostream& os, ostream_beam_view const& obv) {
+std::ostream& operator<<(std::ostream& os, ostream_beam_view const & obv) {
     os << "p(" << obv.beam_view.p << ") eos(" << std::boolalpha << obv.beam_view.eos << ") tokens(";
     for (size_t i=0 ; i<obv.beam_view.n_tokens ; ++i) {
         os << llama_token_to_str(obv.ctx, obv.beam_view.tokens[i]);
@@ -42,11 +42,11 @@ std::ostream& operator<<(std::ostream& os, ostream_beam_view const& obv) {
 // Put here anything you want back in beam_search_callback().
 struct beam_search_callback_data {
-    llama_context* ctx;
+    llama_context * ctx;
     std::vector<llama_token> response;
 };

-bool is_at_eos(beam_search_callback_data const& callback_data, llama_token const* tokens, size_t const n_tokens) {
+bool is_at_eos(beam_search_callback_data const & callback_data, llama_token const * tokens, size_t const n_tokens) {
     return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
 }

@@ -56,7 +56,7 @@ bool is_at_eos(beam_search_callback_data const& callback_data, llama_token const* tokens, size_t const n_tokens) {
 // * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
 // This is also called when the stop condition is met.
 // Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
-void beam_search_callback(void* callback_data_ptr, llama_beams_state beams_state) {
+void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
     auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
     // Mark beams as EOS as needed.
     for (size_t i=0 ; i<beams_state.n_beams ; ++i) {
@@ -69,7 +69,7 @@ void beam_search_callback(void* callback_data_ptr, llama_beams_state beams_state) {
     if (size_t const n = beams_state.common_prefix_length) {
         callback_data.response.resize(callback_data.response.size() + n);
         assert(0u < beams_state.n_beams);
-        llama_token const* tokens = beams_state.beam_views[0].tokens;
+        llama_token const * tokens = beams_state.beam_views[0].tokens;
         std::copy(tokens, tokens + n, callback_data.response.end() - n);
         printf("%lu", n);
     }

@@ -1209,7 +1209,7 @@ static void log_server_request(const Request &req, const Response &res)
     });
 }

-bool is_at_eos(llama_server_context& server_context, llama_token const* tokens, size_t const n_tokens) {
+bool is_at_eos(llama_server_context & server_context, llama_token const * tokens, size_t const n_tokens) {
     return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
 }

@@ -1219,11 +1219,11 @@ bool is_at_eos(llama_server_context& server_context, llama_token const* tokens, size_t const n_tokens) {
 // * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
 // This is also called when the stop condition is met.
 // Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
-void beam_search_callback(void* callback_data, llama_beams_state beams_state) {
-    auto& llama = *static_cast<llama_server_context*>(callback_data);
+void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
+    auto & llama = *static_cast<llama_server_context*>(callback_data);
     // Mark beams as EOS as needed.
     for (size_t i=0 ; i<beams_state.n_beams ; ++i) {
-        llama_beam_view& beam_view = beams_state.beam_views[i];
+        llama_beam_view & beam_view = beams_state.beam_views[i];
         if (!beam_view.eos && is_at_eos(llama, beam_view.tokens, beam_view.n_tokens)) {
             beam_view.eos = true;
         }
@@ -1232,8 +1232,7 @@ void beam_search_callback(void* callback_data, llama_beams_state beams_state) {
     if (size_t const n = beams_state.common_prefix_length) {
         llama.generated_token_probs.resize(llama.generated_token_probs.size() + n);
         assert(0u < beams_state.n_beams);
-        llama_token const* tokens = beams_state.beam_views[0].tokens;
-        //std::copy(tokens, tokens + n, llama->generated_token_probs.end() - n);
+        llama_token const * tokens = beams_state.beam_views[0].tokens;
         auto const map = [](llama_token tok) { return completion_token_output{{},tok}; };
         std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
         printf("%lu", n);
@@ -1248,20 +1247,20 @@ void beam_search_callback(void* callback_data, llama_beams_state beams_state) {
 }

 struct token_translator {
-    llama_context* ctx;
+    llama_context * ctx;
     std::string operator()(llama_token tok) const { return llama_token_to_str(ctx, tok); }
     std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); }
 };

-void append_to_generated_text_from_generated_token_probs(llama_server_context& llama) {
-    auto& gtps = llama.generated_token_probs;
+void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
+    auto & gtps = llama.generated_token_probs;
     auto translator = token_translator{llama.ctx};
-    auto add_strlen = [=](size_t sum, completion_token_output const& cto) { return sum + translator(cto).size(); };
+    auto add_strlen = [=](size_t sum, completion_token_output const & cto) { return sum + translator(cto).size(); };
     size_t const len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen);
     if (llama.generated_text.capacity() < llama.generated_text.size() + len) {
         llama.generated_text.reserve(llama.generated_text.size() + len);
     }
-    for (completion_token_output const& cto : gtps) {
+    for (completion_token_output const & cto : gtps) {
         llama.generated_text += translator(cto);
     }
 }

llama.cpp (28 changes)
@@ -4335,7 +4335,7 @@ struct llama_beam {
     float p; // Cumulative beam probability (renormalized relative to all beams)
     bool eos; // Initialize end-of-sentence to false. Callback sets this to true.
     // Sort beams by probability. In case of ties, prefer beams at eos.
-    bool operator<(llama_beam const& rhs) const {
+    bool operator<(llama_beam const & rhs) const {
         return std::make_tuple(p, eos) < std::make_tuple(rhs.p, rhs.eos);
     }
     // Shift off first n tokens and discard them.
@@ -4350,7 +4350,7 @@ struct llama_beam {

 // A struct for calculating logit-related info.
 struct logit_info {
-    float const* const logits;
+    float const * const logits;
     int const n_vocab;
     float const max_l;
     float const normalizer;
@@ -4358,7 +4358,7 @@ struct logit_info {
         float max_l;
         float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
     };
-    logit_info(llama_context* ctx)
+    logit_info(llama_context * ctx)
       : logits(llama_get_logits(ctx))
       , n_vocab(llama_n_vocab(ctx))
       , max_l(*std::max_element(logits, logits + n_vocab))
@@ -4376,7 +4376,7 @@ struct logit_info {
         for (llama_token token_id=0 ; token_id<k_min ; ++token_id) {
             min_heap.push_back(get_token_data(token_id));
         }
-        auto comp = [](llama_token_data const& a, llama_token_data const& b) { return a.logit > b.logit; };
+        auto comp = [](llama_token_data const & a, llama_token_data const & b) { return a.logit > b.logit; };
         std::make_heap(min_heap.begin(), min_heap.end(), comp);
         for (llama_token token_id=k_min ; token_id<n_vocab ; ++token_id) {
             if (min_heap.front().logit < logits[token_id]) {
@@ -4432,9 +4432,9 @@ struct beam_search {
     // * Gather elements until the vector is full, then call std::make_heap() on it.
     // * If the heap is full and a new element is found that should be included, pop the
     //   least element to the back(), replace it with the new, then push it into the heap.
-    void fill_next_beams_by_top_probabilities(llama_beam& beam) {
+    void fill_next_beams_by_top_probabilities(llama_beam & beam) {
         // Min-heaps use a greater-than comparator.
-        auto const comp = [](llama_beam const& a, llama_beam const& b) { return a.p > b.p; };
+        auto const comp = [](llama_beam const & a, llama_beam const & b) { return a.p > b.p; };
         if (beam.eos) {
             // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
             if (next_beams.size() < n_beams) {
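
The comments above describe a bounded top-k pattern: fill a vector, heapify it once as a min-heap, and from then on only displace the smallest kept element when a better candidate appears. A standalone sketch of that pattern, not the llama.cpp code itself (the function name and element type below are illustrative):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Return the k largest values of `values` (unordered), using a min-heap of size k.
    std::vector<float> top_k(const std::vector<float> & values, size_t k) {
        std::vector<float> heap;
        heap.reserve(k);
        // Min-heaps use a greater-than comparator: heap.front() is the smallest kept value.
        auto const comp = [](float a, float b) { return a > b; };
        for (float v : values) {
            if (heap.size() < k) {
                heap.push_back(v);                    // gather elements until the vector is full...
                if (heap.size() == k) {
                    std::make_heap(heap.begin(), heap.end(), comp);  // ...then heapify once
                }
            } else if (heap.front() < v) {
                // Pop the least element to the back, replace it with the new one, push it back in.
                std::pop_heap(heap.begin(), heap.end(), comp);
                heap.back() = v;
                std::push_heap(heap.begin(), heap.end(), comp);
            }
        }
        return heap;
    }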

@@ -4516,9 +4516,9 @@ struct beam_search {
     // * any of the beams have not yet reached end-of-sentence, AND
     // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
     //   (since all other beam probabilities can only decrease)
-    void loop(llama_beam_search_callback_fn_t const callback, void* const callback_data) {
+    void loop(llama_beam_search_callback_fn_t const callback, void * const callback_data) {
         beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eos.
-        auto const not_eos = [](llama_beam const& beam) { return !beam.eos; };
+        auto const not_eos = [](llama_beam const & beam) { return !beam.eos; };
         for (int i=0 ; i<n_predict && std::any_of(beams.begin(),beams.end(),not_eos) &&
                        !beams[top_beam_index()].eos ; ++i) {
             callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
@@ -4528,8 +4528,8 @@ struct beam_search {
                 n_past += common_prefix_length;
             }
             // Zero-out next_beam probabilities to place them last in following min-heap.
-            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam& beam) { beam.p = 0.0f; });
-            for (llama_beam& beam : beams) {
+            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
+            for (llama_beam & beam : beams) {
                 beam.shift_tokens(common_prefix_length);
                 fill_next_beams_by_top_probabilities(beam);
             }
@@ -4543,10 +4543,10 @@ struct beam_search {

     // As beams grow, the cumulative probabilities decrease.
     // Renormalize them to avoid floating point underflow.
-    static void renormalize_beam_probabilities(std::vector<llama_beam>& beams) {
-        auto const sum_p = [](float sum, llama_beam& beam) { return sum + beam.p; };
+    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
+        auto const sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
         float const inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
-        std::for_each(beams.begin(), beams.end(), [=](llama_beam& beam) { beam.p *= inv_sum; });
+        std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
     }

     // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
@@ -4564,7 +4564,7 @@ struct beam_search {
 };

 void llama_beam_search(llama_context * ctx,
-                       llama_beam_search_callback_fn_t callback, void* callback_data,
+                       llama_beam_search_callback_fn_t callback, void * callback_data,
                        size_t n_beams, int n_past, int n_predict, int n_threads) {
     assert(ctx);
     const int64_t t_start_sample_us = ggml_time_us();

llama.h (8 changes)
@@ -470,7 +470,7 @@ extern "C" {
     //

     struct llama_beam_view {
-        llama_token const* tokens;
+        const llama_token * tokens;
         size_t n_tokens;
         float p; // Cumulative beam probability (renormalized relative to all beams)
         bool eos; // Callback should set this to true when a beam is at end-of-sentence.
@@ -481,7 +481,7 @@ extern "C" {
     // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
     // These pointers are valid only during the synchronous callback, so should not be saved.
     struct llama_beams_state {
-        llama_beam_view* beam_views;
+        llama_beam_view * beam_views;
         size_t n_beams; // Number of elements in beam_views[].
         size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
         bool last_call; // True iff this is the last callback invocation.
@@ -490,7 +490,7 @@ extern "C" {
     // Type of pointer to the beam_search_callback function.
     // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
     // passed back to beam_search_callback. This avoids having to use global variables in the callback.
-    typedef void (*llama_beam_search_callback_fn_t)(void* callback_data, llama_beams_state);
+    typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, llama_beams_state);

     /// @details Deterministically returns entire sentence constructed by a beam search.
     /// @param ctx Pointer to the llama_context.
@@ -501,7 +501,7 @@ extern "C" {
     /// @param n_past Number of tokens already evaluated.
     /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
     /// @param n_threads Number of threads as passed to llama_eval().
-    LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void* callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+    LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);

     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
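
For reference, a condensed usage sketch of this callback API, adapted from the callback code in the first file of this diff (creation of the llama_context and evaluation of the prompt are omitted; the n_beams, n_predict, and n_threads values are illustrative):

    #include <algorithm>
    #include <cstddef>
    #include <vector>
    #include "llama.h"

    // Custom data handed to llama_beam_search and passed back to the callback.
    struct beam_search_callback_data {
        llama_context * ctx;
        std::vector<llama_token> response;
    };

    // Appends the common prefix shared by all beams to the response each time it grows.
    static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
        auto & callback_data = *static_cast<beam_search_callback_data *>(callback_data_ptr);
        if (size_t const n = beams_state.common_prefix_length) {
            callback_data.response.resize(callback_data.response.size() + n);
            llama_token const * tokens = beams_state.beam_views[0].tokens;
            std::copy(tokens, tokens + n, callback_data.response.end() - n);
        }
    }

    // Given an initialized llama_context * ctx and n_past already-evaluated prompt tokens:
    //     beam_search_callback_data callback_data{ctx, {}};
    //     llama_beam_search(ctx, beam_search_callback, &callback_data, /*n_beams=*/2, n_past, /*n_predict=*/32, /*n_threads=*/4);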