sync : ggml (backend v2) (#3912)
* sync : ggml (backend v2) (wip) * sync : migrate examples and llama.cpp to dynamic graphs (wip) * sync : update tests + fix max op params to 64 ggml-ci * sync : ggml-cuda ggml-ci * llama : fix save/load state context size ggml-ci * sync : try to fix build on tvOS * sync : pass custom graph sizes in training examples * sync : update graph copies to new ggml API * sync : update sync-ggml.sh with new files * scripts : fix header in sync script * train : fix context size calculations * llama : increase inference graph size up to 4096 nodes * train : allocate grads for backward graphs * train : allocate grads for gb_tmp
This commit is contained in:
parent
bb50a792ec
commit
4760e7cc0b
22 changed files with 1994 additions and 864 deletions
40
llama.cpp
40
llama.cpp
|
@ -91,6 +91,8 @@
|
|||
#define LLAMA_ATTRIBUTE_FORMAT(...)
|
||||
#endif
|
||||
|
||||
#define LLAMA_MAX_NODES 4096
|
||||
|
||||
//
|
||||
// logging
|
||||
//
|
||||
|
@ -3618,7 +3620,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_llama() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
|
@ -3730,7 +3732,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_baichuan() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
@ -3850,7 +3852,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_falcon() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
@ -3972,7 +3974,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_starcoder() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * pos;
|
||||
|
@ -4071,7 +4073,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_persimmon() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_rot = n_embd_head / 2;
|
||||
|
||||
|
@ -4281,7 +4283,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_refact() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
@ -4372,7 +4374,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_bloom() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
@ -4466,7 +4468,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_mpt() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
@ -8208,7 +8210,7 @@ struct llama_context * llama_new_context_with_model(
|
|||
{
|
||||
static const size_t tensor_alignment = 32;
|
||||
// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
|
||||
ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
|
||||
ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
|
||||
|
||||
// create measure allocator
|
||||
ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
|
||||
|
@ -8597,8 +8599,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|||
if (kv_buf_size) {
|
||||
const size_t elt_size = ggml_element_size(kv_self.k);
|
||||
|
||||
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
||||
ggml_cgraph gf{};
|
||||
ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
||||
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
||||
|
||||
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
|
||||
std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
|
||||
|
@ -8616,9 +8618,9 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|||
kv_head, n_embd, n_layer,
|
||||
elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
|
||||
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
|
||||
ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
|
||||
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
|
||||
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
|
||||
ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
|
||||
|
||||
ggml_free(cpy_ctx);
|
||||
|
||||
|
@ -8725,8 +8727,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|||
|
||||
const size_t elt_size = ggml_element_size(kv_self.k);
|
||||
|
||||
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
||||
ggml_cgraph gf{};
|
||||
ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
||||
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
||||
|
||||
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
|
||||
kin3d->data = (void *) inp;
|
||||
|
@ -8744,9 +8746,9 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|||
kv_head, n_embd, n_layer,
|
||||
elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
|
||||
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
|
||||
ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
|
||||
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
|
||||
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
|
||||
ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
|
||||
|
||||
ggml_free(cpy_ctx);
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue