ggml : more consistent naming + metal fixes

Georgi Gerganov 2023-07-06 20:23:08 +03:00
parent b1331d7e60
commit 53cfb4b995
12 changed files with 195 additions and 176 deletions


@@ -1569,7 +1569,7 @@ int main(int argc, char ** argv) {
 int n_tokens = model.hparams.n_ctx;
 int n_vocab = model.hparams.n_vocab;
-auto compute_plan_buffer = std::vector<uint8_t>();
+std::vector<uint8_t> work_buffer;
 for (int ex=0; ex<n_examples; ++ex) {
 struct ggml_init_params params = {
@@ -1598,12 +1598,12 @@ int main(int argc, char ** argv) {
 ggml_build_forward_expand(&gf, e);
 {
-struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-if (plan.work_size > 0) {
-compute_plan_buffer.resize(plan.work_size);
-plan.work_data = compute_plan_buffer.data();
+struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+if (pf.work_size > 0) {
+work_buffer.resize(pf.work_size);
+pf.work_data = work_buffer.data();
 }
-ggml_graph_compute(&plan, &gf);
+ggml_graph_compute(&gf, &pf);
 }
 float error_before_opt = ggml_get_f32_1d(e, 0);
@@ -1622,12 +1622,12 @@ int main(int argc, char ** argv) {
 ggml_build_forward_expand(&gf, e);
 {
-struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-if (plan.work_size > 0) {
-compute_plan_buffer.resize(plan.work_size);
-plan.work_data = compute_plan_buffer.data();
+struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+if (pf.work_size > 0) {
+work_buffer.resize(pf.work_size);
+pf.work_data = work_buffer.data();
 }
-ggml_graph_compute(&plan, &gf);
+ggml_graph_compute(&gf, &pf);
 }
 float error_after_opt = ggml_get_f32_1d(e, 0);
@@ -1683,12 +1683,12 @@ int main(int argc, char ** argv) {
 ggml_build_forward_expand(&gf, logits);
 {
-struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-if (plan.work_size > 0) {
-compute_plan_buffer.resize(plan.work_size);
-plan.work_data = compute_plan_buffer.data();
+struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+if (pf.work_size > 0) {
+work_buffer.resize(pf.work_size);
+pf.work_data = work_buffer.data();
 }
-ggml_graph_compute(&plan, &gf);
+ggml_graph_compute(&gf, &pf);
 }
 struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);


@@ -164,15 +164,15 @@ int main(int argc, char ** argv) {
 TENSOR_DUMP(m11);
 TENSOR_DUMP(m2);
-auto compute_plan_buffer = std::vector<uint8_t>();
+std::vector<uint8_t> work_buffer;
 {
-auto plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
-if (plan.work_size > 0) {
-compute_plan_buffer.resize(plan.work_size);
-plan.work_data = compute_plan_buffer.data();
+ggml_cplan pf = ggml_graph_plan(&gf, benchmark_params.n_threads);
+if (pf.work_size > 0) {
+work_buffer.resize(pf.work_size);
+pf.work_data = work_buffer.data();
 }
-ggml_graph_compute(&plan, &gf);
+ggml_graph_compute(&gf, &pf);
 }
 TENSOR_DUMP(gf.nodes[0]);
@@ -228,12 +228,12 @@ int main(int argc, char ** argv) {
 long long int start = ggml_time_us();
 //printf("Running ggml_graph_compute\n");
 {
-auto plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
-if (plan.work_size > 0) {
-compute_plan_buffer.resize(plan.work_size);
-plan.work_data = compute_plan_buffer.data();
+ggml_cplan pf31 = ggml_graph_plan(&gf31, benchmark_params.n_threads);
+if (pf31.work_size > 0) {
+work_buffer.resize(pf31.work_size);
+pf31.work_data = work_buffer.data();
 }
-ggml_graph_compute(&plan, &gf31);
+ggml_graph_compute(&gf31, &pf31);
 }
 long long int stop = ggml_time_us();
@@ -268,12 +268,12 @@ int main(int argc, char ** argv) {
 // Running a different graph computation to make sure we override the CPU cache lines
 {
-auto plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
-if (plan.work_size > 0) {
-compute_plan_buffer.resize(plan.work_size);
-plan.work_data = compute_plan_buffer.data();
+ggml_cplan pf32 = ggml_graph_plan(&gf32, benchmark_params.n_threads);
+if (pf32.work_size > 0) {
+work_buffer.resize(pf32.work_size);
+pf32.work_data = work_buffer.data();
 }
-ggml_graph_compute(&plan, &gf32);
+ggml_graph_compute(&gf32, &pf32);
 }
 }
 printf("\n");


@@ -35,10 +35,9 @@ int main(int argc, char ** argv) {
 struct ggml_context * ctx_eval = NULL;
 struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
-gf.n_threads = 1;
 // this allocates all Metal resources and memory buffers
-auto * ctx_metal = ggml_metal_init();
+auto * ctx_metal = ggml_metal_init(1);
 const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
 const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);


@@ -3160,6 +3160,7 @@ int main(int argc, char ** argv) {
 printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx));
 // ggml_print_tensor_objects(model.ctx);
+// TODO: use std::vector<uint8_t> intead of "new"
 size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
 uint8_t * compute_addr = new uint8_t[compute_size];
@@ -3181,7 +3182,7 @@ int main(int argc, char ** argv) {
 GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
 }
-auto compute_plan_buffer = std::vector<uint8_t>();
+std::vector<uint8_t> work_buffer;
 printf("%s: begin training\n", __func__);
@@ -3246,12 +3247,12 @@ int main(int argc, char ** argv) {
 }
 {
-auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
-if (plan.work_size > 0) {
-compute_plan_buffer.resize(plan.work_size);
-plan.work_data = compute_plan_buffer.data();
+ggml_cplan pf = ggml_graph_plan(gf, params.n_threads);
+if (pf.work_size > 0) {
+work_buffer.resize(pf.work_size);
+pf.work_data = work_buffer.data();
 }
-ggml_graph_compute(&plan, gf);
+ggml_graph_compute(gf, &pf);
 }
 size_t used_mem_before_opt = ggml_used_mem(ctx0);
@@ -3277,12 +3278,12 @@ int main(int argc, char ** argv) {
 model.train_tokens += n_batch * n_tokens;
 {
-auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
-if (plan.work_size > 0) {
-compute_plan_buffer.resize(plan.work_size);
-plan.work_data = compute_plan_buffer.data();
+ggml_cplan pf = ggml_graph_plan(gf, params.n_threads);
+if (pf.work_size > 0) {
+work_buffer.resize(pf.work_size);
+pf.work_data = work_buffer.data();
 }
-ggml_graph_compute(&plan, gf);
+ggml_graph_compute(gf, &pf);
 }
 float error_after_opt = ggml_get_f32_1d(loss, 0);
@@ -3372,12 +3373,12 @@ int main(int argc, char ** argv) {
 ggml_build_forward_expand(&gf, logits);
 {
-auto plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
-if (plan.work_size > 0) {
-compute_plan_buffer.resize(plan.work_size);
-plan.work_data = compute_plan_buffer.data();
+ggml_cplan pf = ggml_graph_plan(&gf, params.n_threads);
+if (pf.work_size > 0) {
+work_buffer.resize(pf.work_size);
+pf.work_data = work_buffer.data();
 }
-ggml_graph_compute(&plan, &gf);
+ggml_graph_compute(&gf, &pf);
 }
 //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
@@ -3404,6 +3405,7 @@ int main(int argc, char ** argv) {
 delete[] compute_addr;
 delete[] compute_buf_0;
 delete[] compute_buf_1;
 llama_free(lctx);
 llama_free_model(lmodel);
 ggml_free(model.ctx);


@@ -34,9 +34,13 @@ extern "C" {
 struct ggml_metal_context;
-struct ggml_metal_context * ggml_metal_init(void);
+// number of command buffers to use
+struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);
+// set the number of command buffers to use
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
 // creates a mapping between a host memory buffer and a device memory buffer
 // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
 // - the mapping is used during computation to determine the arguments of the compute kernels
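With this change the Metal context no longer reads the thread count from the graph: the caller initializes it with a command-buffer count and can adjust it later with ggml_metal_set_n_cb. A minimal usage sketch follows; it is illustrative only and not part of this commit, it assumes the pre-existing ggml_metal_add_buffer mapping call, and data_buf, data_size, max_tensor_size, gf and result are placeholder names for state the caller already owns.

#include "ggml.h"
#include "ggml-metal.h"

#ifdef GGML_USE_METAL
// sketch: init Metal with one command buffer, map the host buffer that backs
// the graph tensors, raise the command-buffer count, then compute the graph
static void compute_on_metal(void * data_buf, size_t data_size, size_t max_tensor_size,
                             struct ggml_cgraph * gf, struct ggml_tensor * result, int n_threads) {
    struct ggml_metal_context * ctx_metal = ggml_metal_init(1);

    // map every host buffer used by the graph before calling ggml_metal_graph_compute
    ggml_metal_add_buffer(ctx_metal, "data", data_buf, data_size, max_tensor_size);

    ggml_metal_set_n_cb     (ctx_metal, n_threads); // use n_threads command buffers for this compute
    ggml_metal_graph_compute(ctx_metal, gf);
    ggml_metal_get_tensor   (ctx_metal, result);    // copy the output back to host memory

    ggml_metal_free(ctx_metal);
}
#endif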


@@ -25,6 +25,8 @@ struct ggml_metal_buffer {
 };
 struct ggml_metal_context {
+int n_cb;
 float * logits;
 id<MTLDevice> device;
@@ -86,11 +88,12 @@ static NSString * const msl_library_source = @"see metal.metal";
 @implementation GGMLMetalClass
 @end
-struct ggml_metal_context * ggml_metal_init(void) {
+struct ggml_metal_context * ggml_metal_init(int n_cb) {
 fprintf(stderr, "%s: allocating\n", __func__);
 struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+ctx->n_cb = n_cb;
 ctx->device = MTLCreateSystemDefaultDevice();
 ctx->queue = [ctx->device newCommandQueue];
 ctx->n_buffers = 0;
@@ -208,6 +211,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
 free(ctx);
 }
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
+ctx->n_cb = n_cb;
+}
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@@ -354,7 +361,7 @@ void ggml_metal_graph_compute(
 // create multiple command buffers and enqueue them
 // then, we encode the graph into the command buffers in parallel
-const int n_cb = gf->n_threads;
+const int n_cb = ctx->n_cb;
 NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];

ggml.c

@@ -15942,7 +15942,7 @@ void clear_numa_thread_affinity(void) {}
 struct ggml_compute_state_shared {
 const struct ggml_cgraph * cgraph;
-const struct ggml_graph_compute_plan * plan;
+const struct ggml_cplan * cplan;
 int64_t perf_node_start_cycles;
 int64_t perf_node_start_time_us;
@@ -15971,12 +15971,13 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
 static thread_ret_t ggml_graph_compute_thread(void * data) {
 struct ggml_compute_state * state = (struct ggml_compute_state *) data;
 const struct ggml_cgraph * cgraph = state->shared->cgraph;
-const struct ggml_graph_compute_plan * plan = state->shared->plan;
-const int * n_tasks_arr = plan->n_tasks;
+const struct ggml_cplan * cplan = state->shared->cplan;
+const int * n_tasks_arr = cplan->n_tasks;
 const int n_threads = state->shared->n_threads;
 set_numa_thread_affinity(state->ith, n_threads);
 int node_n = -1;
@@ -15989,8 +15990,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 /*.type =*/ GGML_TASK_FINALIZE,
 /*.ith =*/ 0,
 /*.nth =*/ 0,
-/*.wsize =*/ plan->work_size,
-/*.wdata =*/ plan->work_data,
+/*.wsize =*/ cplan->work_size,
+/*.wdata =*/ cplan->work_data,
 };
 if (node_n != -1) {
@@ -16059,8 +16060,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 /*.type =*/ GGML_TASK_COMPUTE,
 /*.ith =*/ state->ith,
 /*.nth =*/ n_tasks,
-/*.wsize =*/ plan->work_size,
-/*.wdata =*/ plan->work_data,
+/*.wsize =*/ cplan->work_size,
+/*.wdata =*/ cplan->work_data,
 };
 if (state->ith < n_tasks) {
@@ -16072,14 +16073,16 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 }
-// Prepare for graph computing.
-struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, int n_threads) {
+struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 if (n_threads <= 0) {
 n_threads = GGML_DEFAULT_N_THREADS;
 }
-struct ggml_graph_compute_plan plan;
-memset(&plan, 0, sizeof(struct ggml_graph_compute_plan));
-int * n_tasks = plan.n_tasks;
+struct ggml_cplan cplan;
+memset(&cplan, 0, sizeof(struct ggml_cplan));
+int * n_tasks = cplan.n_tasks;
 size_t work_size = 0;
 // initialize tasks + work buffer
@@ -16403,34 +16406,34 @@ struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph *
 work_size += CACHE_LINE_SIZE*(n_threads - 1);
 }
-plan.n_threads = n_threads;
-plan.work_size = work_size;
-plan.work_data = NULL;
+cplan.n_threads = n_threads;
+cplan.work_size = work_size;
+cplan.work_data = NULL;
-return plan;
+return cplan;
 }
-void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph) {
+void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
 {
-GGML_ASSERT(plan);
-GGML_ASSERT(plan->n_threads > 0);
-if (plan->work_size > 0) {
-GGML_ASSERT(plan->work_data);
+GGML_ASSERT(cplan);
+GGML_ASSERT(cplan->n_threads > 0);
+if (cplan->work_size > 0) {
+GGML_ASSERT(cplan->work_data);
 }
 for (int i = 0; i < cgraph->n_nodes; ++i) {
 if (cgraph->nodes[i]->op != GGML_OP_NONE) {
-GGML_ASSERT(plan->n_tasks[i] > 0);
+GGML_ASSERT(cplan->n_tasks[i] > 0);
 }
 }
 }
-const int n_threads = plan->n_threads;
+const int n_threads = cplan->n_threads;
 struct ggml_compute_state_shared state_shared = {
 /*.cgraph =*/ cgraph,
-/*.cgraph_plan =*/ plan,
+/*.cgraph_plan =*/ cplan,
 /*.perf_node_start_cycles =*/ 0,
 /*.perf_node_start_time_us =*/ 0,
 /*.n_threads =*/ n_threads,
@@ -16491,17 +16494,19 @@ void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgrap
 }
 // TODO: avoid allocating memory frequently.
-static void ggml_graph_compute_sugar(struct ggml_cgraph * cgraph, int n_threads) {
-struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads);
-if (plan.work_size > 0) {
-plan.work_data = malloc(plan.work_size);
-GGML_ASSERT(plan.work_data);
+// TODO: make part of public API - use different name and put warning that it makes allocations
+static void ggml_graph_compute_helper(struct ggml_cgraph * cgraph, int n_threads) {
+struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
+if (cplan.work_size > 0) {
+cplan.work_data = malloc(cplan.work_size);
+GGML_ASSERT(cplan.work_data);
 }
-ggml_graph_compute(&plan, cgraph);
+ggml_graph_compute(cgraph, &cplan);
-if (plan.work_data) {
-free(plan.work_data);
+if (cplan.work_data) {
+free(cplan.work_data);
 }
 }
@@ -17341,7 +17346,7 @@ static enum ggml_opt_result ggml_opt_adam(
 ggml_graph_reset (gf);
 ggml_set_f32 (f->grad, 1.0f);
-ggml_graph_compute_sugar(gb, params.n_threads);
+ggml_graph_compute_helper(gb, params.n_threads);
 opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
 opt->adam.fx_best = opt->adam.fx_prev;
@@ -17422,7 +17427,7 @@ static enum ggml_opt_result ggml_opt_adam(
 ggml_graph_reset (gf);
 ggml_set_f32 (f->grad, 1.0f);
-ggml_graph_compute_sugar(gb, params.n_threads);
+ggml_graph_compute_helper(gb, params.n_threads);
 const float fx = ggml_get_f32_1d(f, 0);
@@ -17544,7 +17549,7 @@ static enum ggml_opt_result linesearch_backtracking(
 ggml_graph_reset (gf);
 ggml_set_f32 (f->grad, 1.0f);
-ggml_graph_compute_sugar(gb, params->n_threads);
+ggml_graph_compute_helper(gb, params->n_threads);
 ggml_opt_get_grad(np, ps, g);
@@ -17664,7 +17669,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
 ggml_graph_reset (gf);
 ggml_set_f32 (f->grad, 1.0f);
-ggml_graph_compute_sugar(gb, params.n_threads);
+ggml_graph_compute_helper(gb, params.n_threads);
 ggml_opt_get_grad(np, ps, g);

ggml.h

@@ -443,17 +443,15 @@ extern "C" {
 static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
-// The default graph compute plan that needs to be prepared for ggml_graph_compute().
-// Since https://github.com/ggerganov/ggml/issues/287
-struct ggml_graph_compute_plan {
-// Size of work buffer, calculated by `ggml_graph_compute_make_plan()`.
-size_t work_size;
-// Work buffer, to be allocated by caller before calling to `ggml_graph_compute()`.
-uint8_t * work_data;
+// the compute plan that needs to be prepared for ggml_graph_compute()
+// since https://github.com/ggerganov/ggml/issues/287
+struct ggml_cplan {
+size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
 int n_threads;
-// The `n_tasks` of nodes, 1:1 mapping to cgraph nodes.
+// the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
 int n_tasks[GGML_MAX_NODES];
 };
@@ -1313,10 +1311,10 @@ extern "C" {
 GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
 GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
-// ggml_graph_compute_make_plan() needs to be called before ggml_graph_compute().
-// Returns a plan object. When plan.work_size > 0, caller must allocate memory for plan.work_data.
-GGML_API struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, const int n_threads/*=GGML_DEFAULT_N_THREADS*/);
-GGML_API void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph);
+// ggml_graph_plan() has to be called before ggml_graph_compute()
+// when plan.work_size > 0, caller must allocate memory for plan.work_data
+GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
 GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
 GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
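The header comments spell out the new contract: ggml_graph_plan() computes work_size, the caller provides work_data, and ggml_graph_compute() now takes the graph first and the plan second. A minimal sketch of a call site under that contract, mirroring the pattern used throughout this commit (run_graph and out are illustrative names, not part of the API):

#include <vector>
#include "ggml.h"

// build a forward graph for `out` and run it through the new plan/compute API:
// ggml_graph_plan() sizes the work buffer, the caller allocates it,
// then ggml_graph_compute() executes the graph
static void run_graph(struct ggml_tensor * out, int n_threads) {
    struct ggml_cgraph gf = ggml_build_forward(out);
    struct ggml_cplan  pf = ggml_graph_plan(&gf, n_threads);

    std::vector<uint8_t> work_buffer;
    if (pf.work_size > 0) {
        work_buffer.resize(pf.work_size);
        pf.work_data = work_buffer.data();
    }

    ggml_graph_compute(&gf, &pf);
}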


@@ -321,9 +321,8 @@ struct llama_context {
 // input embedding (1-dimensional array: [n_embd])
 std::vector<float> embedding;
-// reusable buffer for `struct ggml_graph_compute_plan.work_data`
-// std::vector guarantees the elements are stored contiguously.
-std::vector<uint8_t> compute_plan_buffer;
+// reusable buffer for `struct ggml_graph_plan.work_data`
+std::vector<uint8_t> work_buffer;
 // memory buffers used to evaluate the model
 // TODO: move in llama_state
@@ -1599,6 +1598,7 @@ static bool llama_eval_internal(
 #ifdef GGML_USE_METAL
 if (lctx.ctx_metal && N == 1) {
+ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
 ggml_metal_graph_compute(lctx.ctx_metal, &gf);
 ggml_metal_get_tensor (lctx.ctx_metal, cur);
 call_ggml_graph_compute = false;
@@ -1622,12 +1622,12 @@ static bool llama_eval_internal(
 #endif
 if (call_ggml_graph_compute) {
-auto plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
-if (plan.work_size > 0) {
-lctx.compute_plan_buffer.resize(plan.work_size);
-plan.work_data = lctx.compute_plan_buffer.data();
+ggml_cplan pf = ggml_graph_plan(&gf, actual_n_threads);
+if (pf.work_size > 0) {
+lctx.work_buffer.resize(pf.work_size);
+pf.work_data = lctx.work_buffer.data();
 }
-ggml_graph_compute(&plan, &gf);
+ggml_graph_compute(&gf, &pf);
 }
 if (cgraph_fname) {
@@ -2657,7 +2657,7 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_METAL
 if (params.n_gpu_layers > 0) {
 // this allocates all Metal resources and memory buffers
-ctx->ctx_metal = ggml_metal_init();
+ctx->ctx_metal = ggml_metal_init(1);
 void * data_ptr = NULL;
 size_t data_size = 0;
@@ -2815,7 +2815,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 bool warned = false;
 int n_tensors = 0;
-auto compute_plan_buffer = std::vector<uint8_t>();
+std::vector<uint8_t> work_buffer;
 while (true) {
 int32_t n_dims;
@@ -2983,12 +2983,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 struct ggml_cgraph gf = ggml_build_forward(r);
 {
-auto plan = ggml_graph_compute_make_plan(&gf, n_threads);
-if (plan.work_size > 0) {
-compute_plan_buffer.resize(plan.work_size);
-plan.work_data = compute_plan_buffer.data();
+ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
+if (pf.work_size > 0) {
+work_buffer.resize(pf.work_size);
+pf.work_data = work_buffer.data();
 }
-ggml_graph_compute(&plan, &gf);
+ggml_graph_compute(&gf, &pf);
 }
 // we won't need these tensors again, reset the context to save memory
@@ -3163,12 +3163,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
 {
-auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-if (plan.work_size > 0) {
-ctx->compute_plan_buffer.resize(plan.work_size);
-plan.work_data = ctx->compute_plan_buffer.data();
+ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+if (pf.work_size > 0) {
+ctx->work_buffer.resize(pf.work_size);
+pf.work_data = ctx->work_buffer.data();
 }
-ggml_graph_compute(&plan, &gf);
+ggml_graph_compute(&gf, &pf);
 }
 ggml_free(cpy_ctx);
@@ -3276,12 +3276,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
 {
-auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-if (plan.work_size > 0) {
-ctx->compute_plan_buffer.resize(plan.work_size);
-plan.work_data = ctx->compute_plan_buffer.data();
+ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+if (pf.work_size > 0) {
+ctx->work_buffer.resize(pf.work_size);
+pf.work_data = ctx->work_buffer.data();
 }
-ggml_graph_compute(&plan, &gf);
+ggml_graph_compute(&gf, &pf);
 }
 ggml_free(cpy_ctx);
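Every call site above applies the same resize-and-reuse pattern, and llama_context::work_buffer persists across evaluations so the scratch memory is not reallocated on every call, unlike the malloc/free done by ggml_graph_compute_helper in ggml.c. A small hypothetical wrapper (not part of this commit) showing that pattern:

#include <vector>
#include "ggml.h"

// hypothetical helper: compute a graph while reusing a caller-owned work buffer
// between invocations, so the buffer only grows when a larger graph needs more scratch space
static void graph_compute_reuse(std::vector<uint8_t> & work_buffer, struct ggml_cgraph * gf, int n_threads) {
    struct ggml_cplan pf = ggml_graph_plan(gf, n_threads);

    if (pf.work_size > 0) {
        work_buffer.resize(pf.work_size);
        pf.work_data = work_buffer.data();
    }

    ggml_graph_compute(gf, &pf);
}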


@@ -10,5 +10,5 @@ llama_add_test(test-quantize-fns.cpp)
 llama_add_test(test-quantize-perf.cpp)
 llama_add_test(test-sampling.cpp)
 llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
-# llama_add_test(test-grad0.c) # SLOW
-# llama_add_test(test-opt.c) # SLOW
+llama_add_test(test-grad0.c) # SLOW
+llama_add_test(test-opt.c) # SLOW


@@ -10,6 +10,8 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
 #define MAX_NARGS 3
 #undef MIN
@@ -49,7 +51,7 @@ float frand(void) {
 int irand(int n) {
 if (n == 0) return 0;
-else return rand()%n;
+return rand()%n;
 }
 void get_random_dims(int64_t * dims, int ndims) {
@@ -159,12 +161,14 @@ struct ggml_tensor * get_random_tensor_int(
 float get_element(const struct ggml_tensor * t, int idx) {
 if (t->type == GGML_TYPE_F32) {
 return ((float *)t->data)[idx];
-} else if (t->type == GGML_TYPE_I32) {
+}
+if (t->type == GGML_TYPE_I32) {
 return ((int32_t *)t->data)[idx];
-} else {
+}
 assert(false);
 return INFINITY;
-}
 }
 void set_element(struct ggml_tensor * t, int idx, float value) {
@@ -191,12 +195,12 @@ void print_elements(const char* label, const struct ggml_tensor * t) {
 }
-struct compute_plan_buffer {
+struct work_buffer {
 size_t size;
 uint8_t * data;
 };
-static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) {
+static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) {
 if (size == 0) {
 return NULL;
 }
@@ -241,20 +245,19 @@ bool check_gradient(
 }
 struct ggml_cgraph gf = ggml_build_forward (f);
 struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
-struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL };
+struct work_buffer buf = { /*.size = */ 0, /*.data =*/ NULL };
 {
-struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
-if (plan.work_size > 0) {
-plan.work_data = malloc(plan.work_size);
-GGML_ASSERT(plan.work_data);
+struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
+if (pf.work_size > 0) {
+pf.work_data = malloc(pf.work_size);
+GGML_ASSERT(pf.work_data);
 }
-ggml_graph_compute(&plan, &gf);
+ggml_graph_compute(&gf, &pf);
-if (plan.work_data) {
-free(plan.work_data);
+if (pf.work_data) {
+free(pf.work_data);
 }
 }
@@ -262,9 +265,9 @@ bool check_gradient(
 ggml_set_f32 (f->grad, 1.0f);
 {
-struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads);
-plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-ggml_graph_compute(&plan, &gb);
+struct ggml_cplan pf = ggml_graph_plan(&gb, n_threads);
+pf.work_data = work_buffer_resize(&buf, pf.work_size);
+ggml_graph_compute(&gf, &pf);
 }
 // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
@@ -280,9 +283,9 @@ bool check_gradient(
 set_element(x[i], k, xp);
 {
-struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
-plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-ggml_graph_compute(&plan, &gf);
+struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
+pf.work_data = work_buffer_resize(&buf, pf.work_size);
+ggml_graph_compute(&gf, &pf);
 }
 const float f0 = ggml_get_f32_1d(f, 0);
@@ -290,9 +293,9 @@ bool check_gradient(
 set_element(x[i], k, xm);
 {
-struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
-plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-ggml_graph_compute(&plan, &gf);
+struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
+pf.work_data = work_buffer_resize(&buf, pf.work_size);
+ggml_graph_compute(&gf, &pf);
 }
 const float f1 = ggml_get_f32_1d(f, 0);
@@ -306,15 +309,15 @@ bool check_gradient(
 ggml_set_f32 (f->grad, 1.0f);
 {
-struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads);
-plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-ggml_graph_compute(&plan, &gb);
+struct ggml_cplan pf = ggml_graph_plan(&gb, n_threads);
+pf.work_data = work_buffer_resize(&buf, pf.work_size);
+ggml_graph_compute(&gf, &pf);
 }
 const float g1 = get_element(x[i]->grad, k);
 const float error_abs = fabsf(g0 - g1);
-const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabs(g0) : 0;
+const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
 if (error_abs > max_error_abs || error_rel > max_error_rel) {
 printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
@@ -325,8 +328,8 @@ bool check_gradient(
 }
 }
-if (plan_buf.data) {
-free(plan_buf.data);
+if (buf.data) {
+free(buf.data);
 }
 return true;


@@ -7,6 +7,7 @@
 #define MAX_NARGS 2
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
 //
 // logging
@@ -33,7 +34,7 @@
 #define GGML_PRINT(...) printf(__VA_ARGS__)
-float frand() {
+float frand(void) {
 return (float)rand()/(float)RAND_MAX;
 }
@@ -115,12 +116,12 @@ void set_element(struct ggml_tensor * t, int idx, float value) {
 }
-struct compute_plan_buffer {
+struct work_buffer {
 size_t size;
 uint8_t * data;
 };
-static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) {
+static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) {
 if (size == 0) {
 return NULL;
 }
@@ -139,7 +140,7 @@ static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t s
 return buf->data;
 }
-int main(int argc, const char ** argv) {
+int main(void) {
 struct ggml_init_params params = {
 .mem_size = 1024*1024*1024,
 .mem_buffer = NULL,
@@ -166,11 +167,11 @@ int main(int argc, const char ** argv) {
 struct ggml_cgraph ge = ggml_build_forward(e);
 ggml_graph_reset (&ge);
-struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL };
+struct work_buffer buf = { /*.size = */ 0, /*.data =*/ NULL };
 {
-struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1);
-plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-ggml_graph_compute(&plan, &ge);
+struct ggml_cplan pe = ggml_graph_plan(&ge, /*n_threads*/ 1);
+pe.work_data = work_buffer_resize(&buf, pe.work_size);
+ggml_graph_compute(&ge, &pe);
 }
 const float fe = ggml_get_f32_1d(e, 0);
@@ -183,13 +184,13 @@ int main(int argc, const char ** argv) {
 ggml_graph_reset (&ge);
 {
-struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1);
-plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-ggml_graph_compute(&plan, &ge);
+struct ggml_cplan pe = ggml_graph_plan(&ge, /*n_threads*/ 1);
+pe.work_data = work_buffer_resize(&buf, pe.work_size);
+ggml_graph_compute(&ge, &pe);
 }
-if (plan_buf.data) {
-free(plan_buf.data);
+if (buf.data) {
+free(buf.data);
 }
 const float fe_opt = ggml_get_f32_1d(e, 0);