sync : ggml (backend v2) (#3912)
* sync : ggml (backend v2) (wip) * sync : migrate examples and llama.cpp to dynamic graphs (wip) * sync : update tests + fix max op params to 64 ggml-ci * sync : ggml-cuda ggml-ci * llama : fix save/load state context size ggml-ci * sync : try to fix build on tvOS * sync : pass custom graph sizes in training examples * sync : update graph copies to new ggml API * sync : update sync-ggml.sh with new files * scripts : fix header in sync script * train : fix context size calculations * llama : increase inference graph size up to 4096 nodes * train : allocate grads for backward graphs * train : allocate grads for gb_tmp
This commit is contained in:
parent
bb50a792ec
commit
4760e7cc0b
22 changed files with 1994 additions and 864 deletions
|
@ -171,7 +171,8 @@ int main(int argc, char ** argv) {
|
|||
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
|
||||
|
||||
// printf("Creating compute graph\n");
|
||||
struct ggml_cgraph gf = ggml_build_forward(m11xm2);
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx);
|
||||
ggml_build_forward_expand(gf, m11xm2);
|
||||
|
||||
printf("n_threads=%i\n", benchmark_params.n_threads);
|
||||
|
||||
|
@ -180,9 +181,9 @@ int main(int argc, char ** argv) {
|
|||
|
||||
std::vector<uint8_t> work_buffer;
|
||||
|
||||
ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);
|
||||
ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
|
||||
|
||||
TENSOR_DUMP(gf.nodes[0]);
|
||||
TENSOR_DUMP(gf->nodes[0]);
|
||||
|
||||
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
|
||||
|
||||
|
@ -200,7 +201,8 @@ int main(int argc, char ** argv) {
|
|||
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
|
||||
|
||||
// printf("Creating compute graph\n");
|
||||
struct ggml_cgraph gf31 = ggml_build_forward(q31);
|
||||
struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
|
||||
ggml_build_forward_expand(gf31, q31);
|
||||
|
||||
// Set up a second graph computation to make sure we override the CPU cache lines
|
||||
// printf("Creating new tensor q12 & Running quantize\n");
|
||||
|
@ -211,7 +213,8 @@ int main(int argc, char ** argv) {
|
|||
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
|
||||
|
||||
//printf("Creating compute graph\n");
|
||||
struct ggml_cgraph gf32 = ggml_build_forward(q32);
|
||||
struct ggml_cgraph * gf32 = ggml_new_graph(ctx);
|
||||
ggml_build_forward_expand(gf32, q32);
|
||||
printf("n_threads=%i\n", benchmark_params.n_threads);
|
||||
|
||||
const int dimx = sizex;
|
||||
|
@ -223,7 +226,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
|
||||
// Let's use the F32 result from above as a reference for the quantized multiplication
|
||||
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
|
||||
float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
|
||||
|
||||
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
|
||||
printf("=====================================================================================\n");
|
||||
|
@ -233,7 +236,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
long long int start = ggml_time_us();
|
||||
//printf("Running ggml_graph_compute\n");
|
||||
ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);
|
||||
ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
|
||||
|
||||
long long int stop = ggml_time_us();
|
||||
long long int usec = stop-start;
|
||||
|
@ -251,7 +254,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// Check that the matrix multiplication result is in the right ballpark
|
||||
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
|
||||
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
|
||||
float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
|
||||
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
|
||||
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
|
||||
|
||||
|
@ -266,7 +269,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// Running a different graph computation to make sure we override the CPU cache lines
|
||||
ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
|
||||
ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
|
||||
}
|
||||
printf("\n");
|
||||
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
|
||||
|
|
|
@ -240,7 +240,7 @@ static struct lora_data * load_lora(struct lora_info * info) {
|
|||
}
|
||||
|
||||
struct ggml_init_params params_ggml;
|
||||
params_ggml.mem_size = ggml_tensor_overhead() * GGML_MAX_NODES;
|
||||
params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
|
||||
params_ggml.mem_buffer = NULL;
|
||||
params_ggml.no_alloc = true;
|
||||
result->ctx = ggml_init(params_ggml);
|
||||
|
@ -334,7 +334,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
|
|||
float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
|
||||
params.mem_size = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
|
||||
params.mem_buffer = NULL;
|
||||
params.no_alloc = true;
|
||||
struct ggml_context * ctx = NULL;
|
||||
|
|
|
@ -772,7 +772,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
|||
if (enable_checkpointing) {
|
||||
ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size());
|
||||
} else {
|
||||
*gb = *gf;
|
||||
ggml_graph_cpy(gf, gb);
|
||||
ggml_build_backward_expand(ctx, gf, gb, true);
|
||||
}
|
||||
|
||||
|
@ -1615,6 +1615,7 @@ int main(int argc, char ** argv) {
|
|||
opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
|
||||
opt->params.print_forward_graph = false;
|
||||
opt->params.print_backward_graph = false;
|
||||
opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
|
||||
opt->params.n_threads = params.common.n_threads;
|
||||
opt->params.past = params.common.opt_past;
|
||||
opt->params.delta = params.common.opt_delta;
|
||||
|
@ -1741,11 +1742,9 @@ int main(int argc, char ** argv) {
|
|||
ggml_allocr_free(alloc);
|
||||
|
||||
// context for compute tensors without their data
|
||||
size_t estimated_compute_size_wo_data = (
|
||||
ggml_tensor_overhead()*GGML_MAX_NODES*2
|
||||
+ (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*(
|
||||
params.common.use_checkpointing ? 3 : 2
|
||||
)
|
||||
const size_t estimated_compute_size_wo_data = (
|
||||
2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
|
||||
(params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
|
||||
);
|
||||
struct ggml_init_params ctx_compute_params = {
|
||||
estimated_compute_size_wo_data, // mem_size
|
||||
|
@ -1768,11 +1767,11 @@ int main(int argc, char ** argv) {
|
|||
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
|
||||
ctx_compute = ggml_init(ctx_compute_params);
|
||||
alloc = ggml_allocr_new_measure(tensor_alignment);
|
||||
gf = ggml_new_graph(ctx_compute);
|
||||
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||
gf->order = (enum ggml_cgraph_eval_order) order;
|
||||
gb = ggml_new_graph(ctx_compute);
|
||||
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||
gb_tmp = params.common.use_checkpointing
|
||||
? ggml_new_graph(ctx_compute)
|
||||
? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
|
||||
: NULL;
|
||||
loss = llama_build_lora_finetune_graphs(
|
||||
&model, &lora, alloc, ctx_compute,
|
||||
|
@ -1801,11 +1800,11 @@ int main(int argc, char ** argv) {
|
|||
mem_compute_data.resize(max_compute_size);
|
||||
ctx_compute = ggml_init(ctx_compute_params);
|
||||
alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
|
||||
gf = ggml_new_graph(ctx_compute);
|
||||
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||
gf->order = best_order;
|
||||
gb = ggml_new_graph(ctx_compute);
|
||||
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||
gb_tmp = params.common.use_checkpointing
|
||||
? ggml_new_graph(ctx_compute)
|
||||
? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
|
||||
: NULL;
|
||||
loss = llama_build_lora_finetune_graphs(
|
||||
&model, &lora, alloc, ctx_compute,
|
||||
|
|
|
@ -664,7 +664,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
// measure mem requirement and allocate
|
||||
{
|
||||
static const size_t tensor_alignment = 32;
|
||||
new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
|
||||
new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
|
||||
new_clip->alloc = ggml_allocr_new_measure(tensor_alignment);
|
||||
clip_image_f32_batch batch;
|
||||
batch.size = 1;
|
||||
|
|
|
@ -34,7 +34,7 @@ int main(int argc, char ** argv) {
|
|||
struct ggml_context * ctx_data = NULL;
|
||||
struct ggml_context * ctx_eval = NULL;
|
||||
|
||||
struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
|
||||
struct ggml_cgraph * gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
|
||||
|
||||
// this allocates all Metal resources and memory buffers
|
||||
auto * ctx_metal = ggml_metal_init(1);
|
||||
|
@ -46,13 +46,13 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// main
|
||||
{
|
||||
struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
|
||||
struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd");
|
||||
*(int32_t *) input->data = 1; // BOS
|
||||
|
||||
ggml_metal_set_tensor(ctx_metal, input);
|
||||
|
||||
// warmup
|
||||
ggml_metal_graph_compute(ctx_metal, &gf);
|
||||
ggml_metal_graph_compute(ctx_metal, gf);
|
||||
|
||||
const int n_iter = 16;
|
||||
|
||||
|
@ -60,7 +60,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// the actual inference happens here
|
||||
for (int i = 0; i < n_iter; ++i) {
|
||||
ggml_metal_graph_compute(ctx_metal, &gf);
|
||||
ggml_metal_graph_compute(ctx_metal, gf);
|
||||
}
|
||||
|
||||
const int64_t t1 = ggml_time_us();
|
||||
|
@ -70,7 +70,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// debug output
|
||||
{
|
||||
struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
|
||||
struct ggml_tensor * logits = gf->nodes[gf->n_nodes - 1];
|
||||
ggml_metal_get_tensor(ctx_metal, logits);
|
||||
|
||||
float * ptr = (float *) ggml_get_data(logits);
|
||||
|
|
|
@ -436,7 +436,7 @@ static struct ggml_tensor * llama_build_train_graphs(
|
|||
if (enable_checkpointing) {
|
||||
ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size());
|
||||
} else {
|
||||
*gb = *gf;
|
||||
ggml_graph_cpy(gf, gb);
|
||||
ggml_build_backward_expand(ctx, gf, gb, true);
|
||||
}
|
||||
|
||||
|
@ -1006,6 +1006,7 @@ int main(int argc, char ** argv) {
|
|||
opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
|
||||
opt->params.print_forward_graph = false;
|
||||
opt->params.print_backward_graph = false;
|
||||
opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
|
||||
opt->params.n_threads = params.common.n_threads;
|
||||
opt->params.past = params.common.opt_past;
|
||||
opt->params.delta = params.common.opt_delta;
|
||||
|
@ -1108,11 +1109,9 @@ int main(int argc, char ** argv) {
|
|||
ggml_allocr_free(alloc);
|
||||
|
||||
// context for compute tensors without their data
|
||||
size_t estimated_compute_size_wo_data = (
|
||||
ggml_tensor_overhead()*GGML_MAX_NODES*2
|
||||
+ (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*(
|
||||
params.common.use_checkpointing ? 3 : 2
|
||||
)
|
||||
const size_t estimated_compute_size_wo_data = (
|
||||
2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
|
||||
(params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
|
||||
);
|
||||
struct ggml_init_params ctx_compute_params = {
|
||||
estimated_compute_size_wo_data, // mem_size
|
||||
|
@ -1135,11 +1134,11 @@ int main(int argc, char ** argv) {
|
|||
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
|
||||
ctx_compute = ggml_init(ctx_compute_params);
|
||||
alloc = ggml_allocr_new_measure(tensor_alignment);
|
||||
gf = ggml_new_graph(ctx_compute);
|
||||
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||
gf->order = (enum ggml_cgraph_eval_order) order;
|
||||
gb = ggml_new_graph(ctx_compute);
|
||||
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||
gb_tmp = params.common.use_checkpointing
|
||||
? ggml_new_graph(ctx_compute)
|
||||
? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
|
||||
: NULL;
|
||||
loss = llama_build_train_graphs(
|
||||
&model, alloc, ctx_compute,
|
||||
|
@ -1168,11 +1167,11 @@ int main(int argc, char ** argv) {
|
|||
mem_compute_data.resize(max_compute_size);
|
||||
ctx_compute = ggml_init(ctx_compute_params);
|
||||
alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
|
||||
gf = ggml_new_graph(ctx_compute);
|
||||
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||
gf->order = best_order;
|
||||
gb = ggml_new_graph(ctx_compute);
|
||||
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
|
||||
gb_tmp = params.common.use_checkpointing
|
||||
? ggml_new_graph(ctx_compute)
|
||||
? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
|
||||
: NULL;
|
||||
loss = llama_build_train_graphs(
|
||||
&model, alloc, ctx_compute,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue