llama : handle errors from llama_output_reserve at call sites
This commit is contained in:
parent
615a3a4a50
commit
7d8d6b589f
1 changed file with 31 additions and 26 deletions
57
llama.cpp
57
llama.cpp
|
@ -9165,14 +9165,13 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Only alloc when needed
|
// Make sure enough space is available for outputs.
|
||||||
static void llama_output_reserve(llama_context & lctx, int32_t n_outputs) {
|
// Returns max number of outputs for which space was reserved.
|
||||||
GGML_ASSERT(0 <= n_outputs);
|
static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
|
||||||
|
|
||||||
const auto & cparams = lctx.cparams;
|
const auto & cparams = lctx.cparams;
|
||||||
const auto & hparams = lctx.model.hparams;
|
const auto & hparams = lctx.model.hparams;
|
||||||
|
|
||||||
const int32_t n_outputs_max = std::max((uint32_t) n_outputs, cparams.n_seq_max);
|
const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
|
||||||
|
|
||||||
const auto n_batch = cparams.n_batch;
|
const auto n_batch = cparams.n_batch;
|
||||||
const auto n_vocab = hparams.n_vocab;
|
const auto n_vocab = hparams.n_vocab;
|
||||||
|
@ -9209,7 +9208,8 @@ static void llama_output_reserve(llama_context & lctx, int32_t n_outputs) {
|
||||||
|
|
||||||
lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
|
lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
|
||||||
if (lctx.buf_output == nullptr) {
|
if (lctx.buf_output == nullptr) {
|
||||||
throw std::runtime_error(format("failed to allocate output buffer of size %.2f MiB", new_size / (1024.0 * 1024.0)));
|
LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);
|
float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);
|
||||||
|
@ -9226,6 +9226,8 @@ static void llama_output_reserve(llama_context & lctx, int32_t n_outputs) {
|
||||||
ggml_backend_buffer_clear(lctx.buf_output, 0);
|
ggml_backend_buffer_clear(lctx.buf_output, 0);
|
||||||
|
|
||||||
lctx.n_outputs = 0;
|
lctx.n_outputs = 0;
|
||||||
|
|
||||||
|
return n_outputs_max;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -9304,8 +9306,8 @@ static int llama_decode_internal(
|
||||||
const int64_t n_embd = hparams.n_embd;
|
const int64_t n_embd = hparams.n_embd;
|
||||||
const int64_t n_vocab = hparams.n_vocab;
|
const int64_t n_vocab = hparams.n_vocab;
|
||||||
|
|
||||||
int32_t n_outputs = 0;
|
uint32_t n_outputs = 0;
|
||||||
int32_t n_outputs_prev = 0;
|
uint32_t n_outputs_prev = 0;
|
||||||
|
|
||||||
const auto n_ubatch = cparams.n_ubatch;
|
const auto n_ubatch = cparams.n_ubatch;
|
||||||
|
|
||||||
|
@ -9314,29 +9316,34 @@ static int llama_decode_internal(
|
||||||
std::vector<llama_seq_id *> seq_id_arr;
|
std::vector<llama_seq_id *> seq_id_arr;
|
||||||
std::vector<std::vector<llama_seq_id>> seq_id;
|
std::vector<std::vector<llama_seq_id>> seq_id;
|
||||||
|
|
||||||
// reserve output buffer
|
// count outputs
|
||||||
if (batch_all.logits) {
|
if (batch_all.logits) {
|
||||||
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
||||||
n_outputs += batch_all.logits[i] != 0;
|
n_outputs += batch_all.logits[i] != 0;
|
||||||
}
|
}
|
||||||
llama_output_reserve(lctx, n_outputs);
|
} else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
|
||||||
|
n_outputs = n_tokens_all;
|
||||||
|
} else {
|
||||||
|
// keep last output only
|
||||||
|
n_outputs = 1;
|
||||||
|
}
|
||||||
|
// reserve output buffer
|
||||||
|
if (llama_output_reserve(lctx, n_outputs) < n_outputs) {
|
||||||
|
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs);
|
||||||
|
return -2;
|
||||||
|
};
|
||||||
|
// set output mappings
|
||||||
|
if (batch_all.logits) {
|
||||||
int32_t i_logits = 0;
|
int32_t i_logits = 0;
|
||||||
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
||||||
if (batch_all.logits[i]) {
|
if (batch_all.logits[i]) {
|
||||||
lctx.output_ids[i] = i_logits++;
|
lctx.output_ids[i] = i_logits++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
|
} else {
|
||||||
n_outputs = n_tokens_all;
|
for (uint32_t i = 0; i < n_outputs; ++i) {
|
||||||
llama_output_reserve(lctx, n_outputs);
|
|
||||||
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
|
||||||
lctx.output_ids[i] = i;
|
lctx.output_ids[i] = i;
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
// keep last output only
|
|
||||||
n_outputs = 1;
|
|
||||||
llama_output_reserve(lctx, n_outputs);
|
|
||||||
lctx.output_ids[0] = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
|
for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
|
||||||
|
@ -9362,7 +9369,7 @@ static int llama_decode_internal(
|
||||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||||
n_outputs_new += u_batch.logits[i] != 0;
|
n_outputs_new += u_batch.logits[i] != 0;
|
||||||
}
|
}
|
||||||
} else if ((uint32_t) n_outputs == n_tokens_all) {
|
} else if (n_outputs == n_tokens_all) {
|
||||||
n_outputs_new = n_tokens;
|
n_outputs_new = n_tokens;
|
||||||
} else {
|
} else {
|
||||||
// keep last output only
|
// keep last output only
|
||||||
|
@ -13513,11 +13520,9 @@ struct llama_context * llama_new_context_with_model(
|
||||||
|
|
||||||
// graph outputs buffer
|
// graph outputs buffer
|
||||||
{
|
{
|
||||||
// resized during inference when more than n_seq_max logits are requested in a batch
|
// resized during inference when a batch uses more outputs
|
||||||
try {
|
if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
|
||||||
llama_output_reserve(*ctx, 0);
|
LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
|
||||||
} catch (const std::exception & err) {
|
|
||||||
LLAMA_LOG_ERROR("%s: error reserving logits buffer: %s\n", __func__, err.what());
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
@ -14299,7 +14304,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
||||||
|
|
||||||
memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs);
|
memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs);
|
||||||
|
|
||||||
llama_output_reserve(*ctx, n_outputs);
|
GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs));
|
||||||
|
|
||||||
if (n_outputs) {
|
if (n_outputs) {
|
||||||
output_pos.resize(n_outputs);
|
output_pos.resize(n_outputs);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue