llama : increase inference graph size up to 4096 nodes
This commit is contained in:
parent
b1592ea054
commit
e50ab5af5b
1 changed file with 11 additions and 9 deletions
20
llama.cpp
20
llama.cpp
|
@@ -91,6 +91,8 @@
|
||||||
#define LLAMA_ATTRIBUTE_FORMAT(...)
|
#define LLAMA_ATTRIBUTE_FORMAT(...)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#define LLAMA_MAX_NODES 4096
|
||||||
|
|
||||||
//
|
//
|
||||||
// logging
|
// logging
|
||||||
//
|
//
|
||||||
|
@@ -3580,7 +3582,7 @@ struct llm_build_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_llama() {
|
struct ggml_cgraph * build_llama() {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||||
|
|
||||||
|
@@ -3692,7 +3694,7 @@ struct llm_build_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_baichuan() {
|
struct ggml_cgraph * build_baichuan() {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
@@ -3812,7 +3814,7 @@ struct llm_build_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_falcon() {
|
struct ggml_cgraph * build_falcon() {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
@@ -3934,7 +3936,7 @@ struct llm_build_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_starcoder() {
|
struct ggml_cgraph * build_starcoder() {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * pos;
|
struct ggml_tensor * pos;
|
||||||
|
@@ -4033,7 +4035,7 @@ struct llm_build_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_persimmon() {
|
struct ggml_cgraph * build_persimmon() {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
const int64_t n_rot = n_embd_head / 2;
|
const int64_t n_rot = n_embd_head / 2;
|
||||||
|
|
||||||
|
@@ -4243,7 +4245,7 @@ struct llm_build_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_refact() {
|
struct ggml_cgraph * build_refact() {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
@@ -4334,7 +4336,7 @@ struct llm_build_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_bloom() {
|
struct ggml_cgraph * build_bloom() {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
@@ -4428,7 +4430,7 @@ struct llm_build_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_mpt() {
|
struct ggml_cgraph * build_mpt() {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
@@ -8169,7 +8171,7 @@ struct llama_context * llama_new_context_with_model(
|
||||||
{
|
{
|
||||||
static const size_t tensor_alignment = 32;
|
static const size_t tensor_alignment = 32;
|
||||||
// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
|
// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
|
||||||
ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
|
ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
|
||||||
|
|
||||||
// create measure allocator
|
// create measure allocator
|
||||||
ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
|
ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue