update session copy/set to use ggml-backend
ggml-ci
This commit is contained in:
parent
bcd87ca925
commit
24cc321931
3 changed files with 49 additions and 32 deletions
|
@ -792,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
||||||
} else {
|
} else {
|
||||||
ggml_backend_view_init(buffer, t);
|
ggml_backend_view_init(buffer, t);
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
if (t->view_src != NULL) {
|
||||||
|
// view of a pre-allocated tensor
|
||||||
|
ggml_backend_view_init(buffer, t);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1250,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
|
||||||
// utils
|
// utils
|
||||||
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||||
GGML_ASSERT(tensor->buffer == NULL);
|
GGML_ASSERT(tensor->buffer == NULL);
|
||||||
GGML_ASSERT(tensor->data == NULL);
|
//GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set, but still need to be initialized
|
||||||
GGML_ASSERT(tensor->view_src != NULL);
|
GGML_ASSERT(tensor->view_src != NULL);
|
||||||
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
||||||
GGML_ASSERT(tensor->view_src->data != NULL);
|
GGML_ASSERT(tensor->view_src->data != NULL);
|
||||||
|
|
74
llama.cpp
74
llama.cpp
|
@ -3661,7 +3661,7 @@ static void llm_load_tensors(
|
||||||
LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
|
LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
|
||||||
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
||||||
|
|
||||||
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
|
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
|
||||||
|
@ -9830,17 +9830,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
||||||
ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
||||||
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
||||||
|
|
||||||
std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
|
std::vector<struct ggml_tensor *> kout2d(n_layer);
|
||||||
std::vector<std::vector<uint8_t>> vout2d_data(n_layer);
|
std::vector<struct ggml_tensor *> vout2d(n_layer);
|
||||||
|
|
||||||
for (int il = 0; il < (int) n_layer; ++il) {
|
for (int il = 0; il < (int) n_layer; ++il) {
|
||||||
ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
|
kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
|
||||||
kout2d_data[il].resize(ggml_nbytes(kout2d));
|
vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
|
||||||
kout2d->data = kout2d_data[il].data();
|
|
||||||
|
|
||||||
ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
|
|
||||||
vout2d_data[il].resize(ggml_nbytes(vout2d));
|
|
||||||
vout2d->data = vout2d_data[il].data();
|
|
||||||
|
|
||||||
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
|
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
|
||||||
n_embd, kv_head,
|
n_embd, kv_head,
|
||||||
|
@ -9850,21 +9845,28 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
||||||
kv_head, n_embd,
|
kv_head, n_embd,
|
||||||
elt_size*n_ctx, 0);
|
elt_size*n_ctx, 0);
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
|
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
|
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il]));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<uint8_t> work_buffer;
|
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
|
||||||
ggml_graph_compute_helper(work_buffer, gf, ctx->cparams.n_threads);
|
|
||||||
|
ggml_backend_graph_compute(ctx->backend, gf);
|
||||||
|
|
||||||
|
std::vector<uint8_t> tmp_buf;
|
||||||
|
for (int il = 0; il < (int) n_layer; ++il) {
|
||||||
|
tmp_buf.resize(ggml_nbytes(kout2d[il]));
|
||||||
|
ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size());
|
||||||
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
||||||
|
|
||||||
|
tmp_buf.resize(ggml_nbytes(vout2d[il]));
|
||||||
|
ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size());
|
||||||
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
||||||
|
}
|
||||||
|
|
||||||
ggml_free(cpy_ctx);
|
ggml_free(cpy_ctx);
|
||||||
|
|
||||||
// our data is now in the kout2d_data and vout2d_data buffers
|
ggml_backend_buffer_free(buf);
|
||||||
// write them to file
|
|
||||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
|
||||||
data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
|
|
||||||
data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (uint32_t i = 0; i < kv_size; ++i) {
|
for (uint32_t i = 0; i < kv_size; ++i) {
|
||||||
|
@ -9969,14 +9971,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
||||||
ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
||||||
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
std::vector<struct ggml_tensor *> kin2d(n_layer);
|
||||||
ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
|
std::vector<struct ggml_tensor *> vin2d(n_layer);
|
||||||
kin2d->data = (void *) inp;
|
|
||||||
inp += ggml_nbytes(kin2d);
|
|
||||||
|
|
||||||
ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
vin2d->data = (void *) inp;
|
kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
|
||||||
inp += ggml_nbytes(vin2d);
|
vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
|
||||||
|
|
||||||
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
|
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
|
||||||
n_embd, kv_head,
|
n_embd, kv_head,
|
||||||
|
@ -9986,14 +9986,26 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
||||||
kv_head, n_embd,
|
kv_head, n_embd,
|
||||||
elt_size*n_ctx, 0);
|
elt_size*n_ctx, 0);
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
|
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
|
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<uint8_t> work_buffer;
|
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
|
||||||
ggml_graph_compute_helper(work_buffer, gf, ctx->cparams.n_threads);
|
|
||||||
|
// load data into the tensors
|
||||||
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il]));
|
||||||
|
inp += ggml_nbytes(kin2d[il]);
|
||||||
|
|
||||||
|
ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il]));
|
||||||
|
inp += ggml_nbytes(vin2d[il]);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_graph_compute(ctx->backend, gf);
|
||||||
|
|
||||||
ggml_free(cpy_ctx);
|
ggml_free(cpy_ctx);
|
||||||
|
|
||||||
|
ggml_backend_buffer_free(buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx->kv_self.head = kv_head;
|
ctx->kv_self.head = kv_head;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue