llama: updated comments

commit 0638c44821
parent ee599f901a

1 changed file with 19 additions and 8 deletions
@@ -3502,13 +3502,10 @@ static bool llama_kv_cache_init(
     return true;
 }
 
-// find an empty slot of size "n_tokens" in the cache
-// updates the cache head
-// Note: On success, it's important that cache.head points
-// to the first cell of the slot.
+// a structure holds information about the slot found in llama_kv_cache_find_slot
 struct llama_kv_cache_slot_info {
-    std::pair<uint32_t, uint32_t> boundaries;
-    bool found = false;
+    std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
+    bool found = false;                       // the slot was found
 
     explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
     llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}

@@ -3517,6 +3514,11 @@ struct llama_kv_cache_slot_info {
 };
 static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false};
 
+// find an empty slot of size "n_tokens" in the cache
+// updates the cache head
+// returns a structure holding information about the slot found
+// Note: On success, it's important that cache.head points
+// to the first cell of the slot.
 static struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
            struct llama_kv_cache & cache,
        const struct llama_ubatch & batch) {

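As a reading aid (not part of this commit), a minimal sketch of how a caller might consume the slot structure annotated above; `cache` and `ubatch` are assumed to be prepared elsewhere, and the error handling is illustrative only:

    // hypothetical caller of llama_kv_cache_find_slot
    const struct llama_kv_cache_slot_info slot = llama_kv_cache_find_slot(cache, ubatch);
    if (!slot.found) {
        // no contiguous region of cells could be reserved for this micro-batch
        return 1;
    }
    // on success cache.head points to the first cell of the slot and, for
    // non-recurrent models, boundaries is the half-open cell range [begin, end)
    const uint32_t begin = slot.boundaries.first;
    const uint32_t end   = slot.boundaries.second;
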
@@ -4019,7 +4021,9 @@ struct llama_kv_slot_restorer {
         uint32_t n = 0;
     } old_state;
 
-    std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries; // for non-recurrent models only
+    // for non-recurrent models only
+    // list of slots to restore
+    std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries;
 
     bool do_restore = false;
 

@@ -4028,7 +4032,8 @@ struct llama_kv_slot_restorer {
         old_state.n = cache.n;
     }
 
-    void save(const struct llama_kv_cache_slot_info& slot) {
+    // saves a slot information for future restoration
+    void save(const struct llama_kv_cache_slot_info & slot) {
         if (slot) {
             do_restore = true;
             if (slot.boundaries.first != slot.boundaries.second) {

@@ -4037,6 +4042,8 @@ struct llama_kv_slot_restorer {
         }
     }
 
+    // must be explicitly called to restore the kv_cache state
+    // and rollback changes from all llama_kv_cache_find_slot calls
     void restore(struct llama_kv_cache & cache) {
         if (do_restore) {
             cache.head = old_state.head;

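A short usage sketch of the save/restore pattern these comments describe, assuming a decode loop similar to the one in llama.cpp; names such as kv_self and ubatches are stand-ins, and the restorer's constructor argument is inferred from the old_state initialization visible in the hunk above:

    // hypothetical decode path guarded by the restorer
    llama_kv_slot_restorer kv_slot_restorer(kv_self);

    for (const auto & ubatch : ubatches) {
        const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
        if (!slot) {
            kv_slot_restorer.restore(kv_self); // roll back every slot saved so far
            return 1;
        }
        kv_slot_restorer.save(slot);           // remember this slot for a possible rollback
    }
    // on any later failure (e.g. a graph compute error), call restore() before returning
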
@@ -17236,6 +17243,7 @@ static void llama_output_reorder(struct llama_context * ctx) {
     }
 }
 
+// returns the result of ggml_backend_sched_graph_compute_async execution
 static enum ggml_status llama_graph_compute(
           llama_context & lctx,
             ggml_cgraph * gf,

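The comment added here notes that llama_graph_compute forwards the status of ggml_backend_sched_graph_compute_async. A hedged sketch of branching on that status; the trailing arguments n_threads and threadpool are assumptions, since the hunk only shows the first two parameters:

    // hypothetical caller of llama_graph_compute
    const enum ggml_status status = llama_graph_compute(lctx, gf, n_threads, threadpool);
    switch (status) {
        case GGML_STATUS_SUCCESS:
            break;            // graph executed normally
        case GGML_STATUS_ABORTED:
            return 2;         // aborted through the abort callback
        case GGML_STATUS_ALLOC_FAILED:
            return -2;        // compute buffer allocation failed
        default:
            return -3;        // GGML_STATUS_FAILED or any other error
    }
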
@@ -17262,6 +17270,9 @@ static enum ggml_status llama_graph_compute(
 }
 
 // decode a batch of tokens by evaluating the transformer
+// in case of unsuccessful decoding (error or warning),
+// the kv_cache state will be returned to its original state
+// (for non-recurrent models) or cleaned (for recurrent models)
 //
 //   - lctx:      llama context
 //   - batch:     batch to evaluate

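The rollback behaviour documented above is what a caller of the public llama_decode API can rely on: after a non-zero return the KV cache is not left half-updated, so the request can be retried. A minimal caller-side sketch, assuming the return-code convention described in llama.h (0 = success, 1 = no KV slot found, < 0 = error):

    // hypothetical caller; ctx and batch are assumed to be set up with the usual API
    const int32_t ret = llama_decode(ctx, batch);
    if (ret == 1) {
        // warning: no KV slot was found; the cache was restored, so retrying with
        // a smaller batch or a larger context is safe
    } else if (ret < 0) {
        // error: decoding failed; the cache was likewise restored (or cleared for
        // recurrent models) as described in the comment above
    }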