ggml : more consistent naming + metal fixes

commit 53cfb4b995 (parent: b1331d7e60)
12 changed files with 195 additions and 176 deletions
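The heart of the change is the graph-compute planning API introduced in ggerganov/ggml#287: `struct ggml_graph_compute_plan` becomes `struct ggml_cplan`, `ggml_graph_compute_make_plan()` becomes `ggml_graph_plan()`, and `ggml_graph_compute()` now takes the graph first and the plan second. On the Metal side, the number of command buffers moves out of `ggml_cgraph.n_threads` and into the `ggml_metal_context`. In prototype form, copied from the ggml.h hunks below:

    // before
    struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, const int n_threads);
    void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph);

    // after
    struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
    void              ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);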
@@ -1569,7 +1569,7 @@ int main(int argc, char ** argv) {
     int n_tokens = model.hparams.n_ctx;
     int n_vocab  = model.hparams.n_vocab;
 
-    auto compute_plan_buffer = std::vector<uint8_t>();
+    std::vector<uint8_t> work_buffer;
 
     for (int ex=0; ex<n_examples; ++ex) {
         struct ggml_init_params params = {
@@ -1598,12 +1598,12 @@ int main(int argc, char ** argv) {
         ggml_build_forward_expand(&gf, e);
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+            if (pf.work_size > 0) {
+                work_buffer.resize(pf.work_size);
+                pf.work_data = work_buffer.data();
             }
-            ggml_graph_compute(&plan, &gf);
+            ggml_graph_compute(&gf, &pf);
         }
 
         float error_before_opt = ggml_get_f32_1d(e, 0);
@@ -1622,12 +1622,12 @@ int main(int argc, char ** argv) {
         ggml_build_forward_expand(&gf, e);
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+            if (pf.work_size > 0) {
+                work_buffer.resize(pf.work_size);
+                pf.work_data = work_buffer.data();
             }
-            ggml_graph_compute(&plan, &gf);
+            ggml_graph_compute(&gf, &pf);
         }
 
         float error_after_opt = ggml_get_f32_1d(e, 0);
@@ -1683,12 +1683,12 @@ int main(int argc, char ** argv) {
     ggml_build_forward_expand(&gf, logits);
 
     {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-        if (plan.work_size > 0) {
-            compute_plan_buffer.resize(plan.work_size);
-            plan.work_data = compute_plan_buffer.data();
+        struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+        if (pf.work_size > 0) {
+            work_buffer.resize(pf.work_size);
+            pf.work_data = work_buffer.data();
         }
-        ggml_graph_compute(&plan, &gf);
+        ggml_graph_compute(&gf, &pf);
     }
 
     struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
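Every CPU call site in this commit follows the same three-step idiom: build a plan, size a caller-owned work buffer, then compute. A condensed sketch of the pattern above (the graph `gf` and the thread count are assumed to come from the surrounding program):

    std::vector<uint8_t> work_buffer; // reused across evaluations, grows as needed

    struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
    if (pf.work_size > 0) {
        work_buffer.resize(pf.work_size);
        pf.work_data = work_buffer.data();
    }
    ggml_graph_compute(&gf, &pf);

Because `std::vector` stores its elements contiguously, the same vector can be handed to every subsequent plan, only growing when a larger `work_size` comes along.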
@@ -164,15 +164,15 @@ int main(int argc, char ** argv) {
     TENSOR_DUMP(m11);
     TENSOR_DUMP(m2);
 
-    auto compute_plan_buffer = std::vector<uint8_t>();
+    std::vector<uint8_t> work_buffer;
 
     {
-        auto plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
-        if (plan.work_size > 0) {
-            compute_plan_buffer.resize(plan.work_size);
-            plan.work_data = compute_plan_buffer.data();
+        ggml_cplan pf = ggml_graph_plan(&gf, benchmark_params.n_threads);
+        if (pf.work_size > 0) {
+            work_buffer.resize(pf.work_size);
+            pf.work_data = work_buffer.data();
         }
-        ggml_graph_compute(&plan, &gf);
+        ggml_graph_compute(&gf, &pf);
     }
 
     TENSOR_DUMP(gf.nodes[0]);
@@ -228,12 +228,12 @@ int main(int argc, char ** argv) {
        long long int start = ggml_time_us();
        //printf("Running ggml_graph_compute\n");
        {
-            auto plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            ggml_cplan pf31 = ggml_graph_plan(&gf31, benchmark_params.n_threads);
+            if (pf31.work_size > 0) {
+                work_buffer.resize(pf31.work_size);
+                pf31.work_data = work_buffer.data();
             }
-            ggml_graph_compute(&plan, &gf31);
+            ggml_graph_compute(&gf31, &pf31);
        }
 
        long long int stop = ggml_time_us();
@@ -268,12 +268,12 @@ int main(int argc, char ** argv) {
 
        // Running a different graph computation to make sure we override the CPU cache lines
        {
-            auto plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            ggml_cplan pf32 = ggml_graph_plan(&gf32, benchmark_params.n_threads);
+            if (pf32.work_size > 0) {
+                work_buffer.resize(pf32.work_size);
+                pf32.work_data = work_buffer.data();
             }
-            ggml_graph_compute(&plan, &gf32);
+            ggml_graph_compute(&gf32, &pf32);
        }
    }
    printf("\n");
@@ -35,10 +35,9 @@ int main(int argc, char ** argv) {
     struct ggml_context * ctx_eval = NULL;
 
     struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
-    gf.n_threads = 1;
 
     // this allocates all Metal resources and memory buffers
-    auto * ctx_metal = ggml_metal_init();
+    auto * ctx_metal = ggml_metal_init(1);
 
     const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
     const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
@@ -3160,6 +3160,7 @@ int main(int argc, char ** argv) {
     printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx));
     // ggml_print_tensor_objects(model.ctx);
 
+    // TODO: use std::vector<uint8_t> intead of "new"
     size_t    compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
     uint8_t * compute_addr = new uint8_t[compute_size];
 
@@ -3181,7 +3182,7 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
     }
 
-    auto compute_plan_buffer = std::vector<uint8_t>();
+    std::vector<uint8_t> work_buffer;
 
     printf("%s: begin training\n", __func__);
 
@@ -3246,12 +3247,12 @@ int main(int argc, char ** argv) {
        }
 
        {
-            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            ggml_cplan pf = ggml_graph_plan(gf, params.n_threads);
+            if (pf.work_size > 0) {
+                work_buffer.resize(pf.work_size);
+                pf.work_data = work_buffer.data();
             }
-            ggml_graph_compute(&plan, gf);
+            ggml_graph_compute(gf, &pf);
        }
 
        size_t used_mem_before_opt = ggml_used_mem(ctx0);
@@ -3277,12 +3278,12 @@ int main(int argc, char ** argv) {
        model.train_tokens += n_batch * n_tokens;
 
        {
-            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            ggml_cplan pf = ggml_graph_plan(gf, params.n_threads);
+            if (pf.work_size > 0) {
+                work_buffer.resize(pf.work_size);
+                pf.work_data = work_buffer.data();
             }
-            ggml_graph_compute(&plan, gf);
+            ggml_graph_compute(gf, &pf);
        }
 
        float error_after_opt = ggml_get_f32_1d(loss, 0);
@@ -3372,12 +3373,12 @@ int main(int argc, char ** argv) {
     ggml_build_forward_expand(&gf, logits);
 
     {
-        auto plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
-        if (plan.work_size > 0) {
-            compute_plan_buffer.resize(plan.work_size);
-            plan.work_data = compute_plan_buffer.data();
+        ggml_cplan pf = ggml_graph_plan(&gf, params.n_threads);
+        if (pf.work_size > 0) {
+            work_buffer.resize(pf.work_size);
+            pf.work_data = work_buffer.data();
         }
-        ggml_graph_compute(&plan, &gf);
+        ggml_graph_compute(&gf, &pf);
     }
 
     //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
@@ -3404,6 +3405,7 @@ int main(int argc, char ** argv) {
     delete[] compute_addr;
     delete[] compute_buf_0;
     delete[] compute_buf_1;
+
     llama_free(lctx);
     llama_free_model(lmodel);
     ggml_free(model.ctx);
@@ -34,9 +34,13 @@ extern "C" {
 
 struct ggml_metal_context;
 
-struct ggml_metal_context * ggml_metal_init(void);
+// number of command buffers to use
+struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);
 
+// set the number of command buffers to use
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+
 // creates a mapping between a host memory buffer and a device memory buffer
 // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
 // - the mapping is used during computation to determine the arguments of the compute kernels
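A minimal sketch of the new Metal lifecycle, condensed from the metal example and llama.cpp changes elsewhere in this commit (the graph `gf` and `n_threads` are assumed to exist in the caller):

    // allocate all Metal resources, starting with a single command buffer
    struct ggml_metal_context * ctx_metal = ggml_metal_init(1);

    // later, pick how many command buffers to encode the graph into
    ggml_metal_set_n_cb     (ctx_metal, n_threads);
    ggml_metal_graph_compute(ctx_metal, &gf);

    ggml_metal_free(ctx_metal);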
ggml-metal.m (11 changes)

@@ -25,6 +25,8 @@ struct ggml_metal_buffer {
 };
 
 struct ggml_metal_context {
+    int n_cb;
+
     float * logits;
 
     id<MTLDevice> device;
@@ -86,11 +88,12 @@ static NSString * const msl_library_source = @"see metal.metal";
 @implementation GGMLMetalClass
 @end
 
-struct ggml_metal_context * ggml_metal_init(void) {
+struct ggml_metal_context * ggml_metal_init(int n_cb) {
     fprintf(stderr, "%s: allocating\n", __func__);
 
     struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
 
+    ctx->n_cb      = n_cb;
     ctx->device    = MTLCreateSystemDefaultDevice();
     ctx->queue     = [ctx->device newCommandQueue];
     ctx->n_buffers = 0;
@@ -208,6 +211,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     free(ctx);
 }
 
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
+    ctx->n_cb = n_cb;
+}
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@@ -354,7 +361,7 @@ void ggml_metal_graph_compute(
     // create multiple command buffers and enqueue them
     // then, we encode the graph into the command buffers in parallel
 
-    const int n_cb = gf->n_threads;
+    const int n_cb = ctx->n_cb;
 
     NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
 
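With `n_cb` stored in the context, `ggml_metal_graph_compute()` no longer reaches into `gf->n_threads` to decide how many command buffers to encode in parallel. Illustrative only (this is not the committed encoder, which also handles kernel arguments and synchronization): an even split of the graph across `n_cb` command buffers looks roughly like this:

    const int n_cb = ctx->n_cb;
    const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb; // ceiling division

    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
        const int i0 = cb_idx * n_nodes_per_cb;
        const int i1 = i0 + n_nodes_per_cb < gf->n_nodes ? i0 + n_nodes_per_cb : gf->n_nodes;
        // encode nodes [i0, i1) into command_buffers[cb_idx], then commit
    }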
ggml.c (77 changes)

@@ -15942,7 +15942,7 @@ void clear_numa_thread_affinity(void) {}
 
 struct ggml_compute_state_shared {
     const struct ggml_cgraph * cgraph;
-    const struct ggml_graph_compute_plan * plan;
+    const struct ggml_cplan  * cplan;
 
     int64_t perf_node_start_cycles;
     int64_t perf_node_start_time_us;
@@ -15971,12 +15971,13 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
 
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
 
     const struct ggml_cgraph * cgraph = state->shared->cgraph;
-    const struct ggml_graph_compute_plan * plan = state->shared->plan;
-    const int * n_tasks_arr = plan->n_tasks;
+    const struct ggml_cplan  * cplan  = state->shared->cplan;
+
+    const int * n_tasks_arr = cplan->n_tasks;
 
     const int n_threads = state->shared->n_threads;
     set_numa_thread_affinity(state->ith, n_threads);
 
     int node_n = -1;
@@ -15989,8 +15990,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                /*.type  =*/ GGML_TASK_FINALIZE,
                /*.ith   =*/ 0,
                /*.nth   =*/ 0,
-                /*.wsize =*/ plan->work_size,
-                /*.wdata =*/ plan->work_data,
+                /*.wsize =*/ cplan->work_size,
+                /*.wdata =*/ cplan->work_data,
            };
 
            if (node_n != -1) {
@@ -16059,8 +16060,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                /*.type  =*/ GGML_TASK_COMPUTE,
                /*.ith   =*/ state->ith,
                /*.nth   =*/ n_tasks,
-                /*.wsize =*/ plan->work_size,
-                /*.wdata =*/ plan->work_data,
+                /*.wsize =*/ cplan->work_size,
+                /*.wdata =*/ cplan->work_data,
            };
 
            if (state->ith < n_tasks) {
@@ -16072,14 +16073,16 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     }
 
 // Prepare for graph computing.
-struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, int n_threads) {
+struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
     if (n_threads <= 0) {
         n_threads = GGML_DEFAULT_N_THREADS;
     }
 
-    struct ggml_graph_compute_plan plan;
-    memset(&plan, 0, sizeof(struct ggml_graph_compute_plan));
-    int * n_tasks = plan.n_tasks;
+    struct ggml_cplan cplan;
+    memset(&cplan, 0, sizeof(struct ggml_cplan));
+
+    int * n_tasks = cplan.n_tasks;
+
     size_t work_size = 0;
 
     // initialize tasks + work buffer
@@ -16403,34 +16406,34 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
        work_size += CACHE_LINE_SIZE*(n_threads - 1);
    }
 
-    plan.n_threads = n_threads;
-    plan.work_size = work_size;
-    plan.work_data = NULL;
+    cplan.n_threads = n_threads;
+    cplan.work_size = work_size;
+    cplan.work_data = NULL;
 
-    return plan;
+    return cplan;
 }
 
-void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph) {
+void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     {
-        GGML_ASSERT(plan);
-        GGML_ASSERT(plan->n_threads > 0);
+        GGML_ASSERT(cplan);
+        GGML_ASSERT(cplan->n_threads > 0);
 
-        if (plan->work_size > 0) {
-            GGML_ASSERT(plan->work_data);
+        if (cplan->work_size > 0) {
+            GGML_ASSERT(cplan->work_data);
        }
 
        for (int i = 0; i < cgraph->n_nodes; ++i) {
            if (cgraph->nodes[i]->op != GGML_OP_NONE) {
-                GGML_ASSERT(plan->n_tasks[i] > 0);
+                GGML_ASSERT(cplan->n_tasks[i] > 0);
            }
        }
    }
 
-    const int n_threads = plan->n_threads;
+    const int n_threads = cplan->n_threads;
 
    struct ggml_compute_state_shared state_shared = {
        /*.cgraph                  =*/ cgraph,
-        /*.cgraph_plan             =*/ plan,
+        /*.cgraph_plan             =*/ cplan,
        /*.perf_node_start_cycles  =*/ 0,
        /*.perf_node_start_time_us =*/ 0,
        /*.n_threads               =*/ n_threads,
@@ -16491,17 +16494,19 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan)
 }
 
 // TODO: avoid allocating memory frequently.
-static void ggml_graph_compute_sugar(struct ggml_cgraph * cgraph, int n_threads) {
-    struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads);
-    if (plan.work_size > 0) {
-        plan.work_data = malloc(plan.work_size);
-        GGML_ASSERT(plan.work_data);
+// TODO: make part of public API - use different name and put warning that it makes allocations
+static void ggml_graph_compute_helper(struct ggml_cgraph * cgraph, int n_threads) {
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
+
+    if (cplan.work_size > 0) {
+        cplan.work_data = malloc(cplan.work_size);
+        GGML_ASSERT(cplan.work_data);
     }
 
-    ggml_graph_compute(&plan, cgraph);
+    ggml_graph_compute(cgraph, &cplan);
 
-    if (plan.work_data) {
-        free(plan.work_data);
+    if (cplan.work_data) {
+        free(cplan.work_data);
     }
 }
 
@@ -17341,7 +17346,7 @@ static enum ggml_opt_result ggml_opt_adam(
        ggml_graph_reset  (gf);
        ggml_set_f32      (f->grad, 1.0f);
 
-        ggml_graph_compute_sugar(gb, params.n_threads);
+        ggml_graph_compute_helper(gb, params.n_threads);
 
        opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
        opt->adam.fx_best = opt->adam.fx_prev;
@@ -17422,7 +17427,7 @@ static enum ggml_opt_result ggml_opt_adam(
        ggml_graph_reset  (gf);
        ggml_set_f32      (f->grad, 1.0f);
 
-        ggml_graph_compute_sugar(gb, params.n_threads);
+        ggml_graph_compute_helper(gb, params.n_threads);
 
        const float fx = ggml_get_f32_1d(f, 0);
 
@@ -17544,7 +17549,7 @@ static enum ggml_opt_result linesearch_backtracking(
        ggml_graph_reset  (gf);
        ggml_set_f32      (f->grad, 1.0f);
 
-        ggml_graph_compute_sugar(gb, params->n_threads);
+        ggml_graph_compute_helper(gb, params->n_threads);
 
        ggml_opt_get_grad(np, ps, g);
 
@@ -17664,7 +17669,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
    ggml_graph_reset  (gf);
    ggml_set_f32      (f->grad, 1.0f);
 
-    ggml_graph_compute_sugar(gb, params.n_threads);
+    ggml_graph_compute_helper(gb, params.n_threads);
 
    ggml_opt_get_grad(np, ps, g);
 
ggml.h (22 changes)

@@ -443,17 +443,15 @@ extern "C" {
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
-    // The default graph compute plan that needs to be prepared for ggml_graph_compute().
-    // Since https://github.com/ggerganov/ggml/issues/287
-    struct ggml_graph_compute_plan {
-        // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`.
-        size_t work_size;
-        // Work buffer, to be allocated by caller before calling to `ggml_graph_compute()`.
-        uint8_t * work_data;
+    // the compute plan that needs to be prepared for ggml_graph_compute()
+    // since https://github.com/ggerganov/ggml/issues/287
+    struct ggml_cplan {
+        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
 
         int n_threads;
 
-        // The `n_tasks` of nodes, 1:1 mapping to cgraph nodes.
+        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
         int n_tasks[GGML_MAX_NODES];
     };
 
@@ -1313,10 +1311,10 @@ extern "C" {
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
-    // ggml_graph_compute_make_plan() needs to be called before ggml_graph_compute().
-    // Returns a plan object. When plan.work_size > 0, caller must allocate memory for plan.work_data.
-    GGML_API struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, const int n_threads/*=GGML_DEFAULT_N_THREADS*/);
-    GGML_API void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph);
+    // ggml_graph_plan() has to be called before ggml_graph_compute()
+    // when plan.work_size > 0, caller must allocate memory for plan.work_data
+    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API void              ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
     GGML_API void              ggml_graph_reset  (struct ggml_cgraph * cgraph);
 
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
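For a one-off computation there is no vector to reuse; the documented contract can also be met with a plain heap allocation, mirroring the static `ggml_graph_compute_helper()` that ggml.c adds above (a sketch of the same pattern, not a public API):

    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);

    if (cplan.work_size > 0) {
        cplan.work_data = (uint8_t *) malloc(cplan.work_size); // caller owns this memory
        GGML_ASSERT(cplan.work_data);
    }

    ggml_graph_compute(cgraph, &cplan);

    free(cplan.work_data); // free(NULL) is a no-op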
llama.cpp (50 changes)

@@ -321,9 +321,8 @@ struct llama_context {
     // input embedding (1-dimensional array: [n_embd])
     std::vector<float> embedding;
 
-    // reusable buffer for `struct ggml_graph_compute_plan.work_data`
-    // std::vector guarantees the elements are stored contiguously.
-    std::vector<uint8_t> compute_plan_buffer;
+    // reusable buffer for `struct ggml_graph_plan.work_data`
+    std::vector<uint8_t> work_buffer;
 
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
@@ -1599,6 +1598,7 @@ static bool llama_eval_internal(
 
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
+        ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
         ggml_metal_get_tensor   (lctx.ctx_metal, cur);
         call_ggml_graph_compute = false;
@@ -1622,12 +1622,12 @@ static bool llama_eval_internal(
 #endif
 
     if (call_ggml_graph_compute) {
-        auto plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
-        if (plan.work_size > 0) {
-            lctx.compute_plan_buffer.resize(plan.work_size);
-            plan.work_data = lctx.compute_plan_buffer.data();
+        ggml_cplan pf = ggml_graph_plan(&gf, actual_n_threads);
+        if (pf.work_size > 0) {
+            lctx.work_buffer.resize(pf.work_size);
+            pf.work_data = lctx.work_buffer.data();
         }
-        ggml_graph_compute(&plan, &gf);
+        ggml_graph_compute(&gf, &pf);
     }
 
     if (cgraph_fname) {
@@ -2657,7 +2657,7 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init();
+        ctx->ctx_metal = ggml_metal_init(1);
 
         void * data_ptr  = NULL;
         size_t data_size = 0;
@@ -2815,7 +2815,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     bool warned = false;
     int n_tensors = 0;
 
-    auto compute_plan_buffer = std::vector<uint8_t>();
+    std::vector<uint8_t> work_buffer;
 
     while (true) {
         int32_t n_dims;
@@ -2983,12 +2983,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
            struct ggml_cgraph gf = ggml_build_forward(r);
 
            {
-                auto plan = ggml_graph_compute_make_plan(&gf, n_threads);
-                if (plan.work_size > 0) {
-                    compute_plan_buffer.resize(plan.work_size);
-                    plan.work_data = compute_plan_buffer.data();
+                ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
+                if (pf.work_size > 0) {
+                    work_buffer.resize(pf.work_size);
+                    pf.work_data = work_buffer.data();
                }
-                ggml_graph_compute(&plan, &gf);
+                ggml_graph_compute(&gf, &pf);
            }
 
            // we won't need these tensors again, reset the context to save memory
@@ -3163,12 +3163,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
        ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
 
        {
-            auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-            if (plan.work_size > 0) {
-                ctx->compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = ctx->compute_plan_buffer.data();
+            ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+            if (pf.work_size > 0) {
+                ctx->work_buffer.resize(pf.work_size);
+                pf.work_data = ctx->work_buffer.data();
            }
-            ggml_graph_compute(&plan, &gf);
+            ggml_graph_compute(&gf, &pf);
        }
 
        ggml_free(cpy_ctx);
@@ -3276,12 +3276,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
        ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
 
        {
-            auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-            if (plan.work_size > 0) {
-                ctx->compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = ctx->compute_plan_buffer.data();
+            ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+            if (pf.work_size > 0) {
+                ctx->work_buffer.resize(pf.work_size);
+                pf.work_data = ctx->work_buffer.data();
            }
-            ggml_graph_compute(&plan, &gf);
+            ggml_graph_compute(&gf, &pf);
        }
 
        ggml_free(cpy_ctx);
@@ -10,5 +10,5 @@ llama_add_test(test-quantize-fns.cpp)
 llama_add_test(test-quantize-perf.cpp)
 llama_add_test(test-sampling.cpp)
 llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
-# llama_add_test(test-grad0.c) # SLOW
-# llama_add_test(test-opt.c) # SLOW
+llama_add_test(test-grad0.c) # SLOW
+llama_add_test(test-opt.c) # SLOW
@@ -10,6 +10,8 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+
 #define MAX_NARGS 3
 
 #undef MIN
@@ -49,7 +51,7 @@ float frand(void) {
 
 int irand(int n) {
     if (n == 0) return 0;
-    else return rand()%n;
+    return rand()%n;
 }
 
 void get_random_dims(int64_t * dims, int ndims) {
@@ -159,12 +161,14 @@ struct ggml_tensor * get_random_tensor_int(
 float get_element(const struct ggml_tensor * t, int idx) {
     if (t->type == GGML_TYPE_F32) {
         return ((float *)t->data)[idx];
-    } else if (t->type == GGML_TYPE_I32) {
+    }
+
+    if (t->type == GGML_TYPE_I32) {
         return ((int32_t *)t->data)[idx];
-    } else {
-        assert(false);
-        return INFINITY;
     }
+
+    assert(false);
+    return INFINITY;
 }
 
 void set_element(struct ggml_tensor * t, int idx, float value) {
@@ -191,12 +195,12 @@ void print_elements(const char* label, const struct ggml_tensor * t) {
 
 }
 
-struct compute_plan_buffer {
+struct work_buffer {
     size_t    size;
     uint8_t * data;
 };
 
-static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) {
+static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) {
     if (size == 0) {
         return NULL;
     }
@@ -241,20 +245,19 @@ bool check_gradient(
     }
 
     struct ggml_cgraph gf = ggml_build_forward (f);
     struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
 
-    struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL };
+    struct work_buffer buf = { /*.size = */ 0, /*.data =*/ NULL };
 
     {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
-        if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
+        struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
+        if (pf.work_size > 0) {
+            pf.work_data = malloc(pf.work_size);
+            GGML_ASSERT(pf.work_data);
         }
-        ggml_graph_compute(&plan, &gf);
-        if (plan.work_data) {
-            free(plan.work_data);
+        ggml_graph_compute(&gf, &pf);
+        if (pf.work_data) {
+            free(pf.work_data);
         }
     }
 
@@ -262,9 +265,9 @@ bool check_gradient(
     ggml_set_f32      (f->grad, 1.0f);
 
     {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads);
-        plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-        ggml_graph_compute(&plan, &gb);
+        struct ggml_cplan pf = ggml_graph_plan(&gb, n_threads);
+        pf.work_data = work_buffer_resize(&buf, pf.work_size);
+        ggml_graph_compute(&gf, &pf);
    }
 
    // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
@@ -280,9 +283,9 @@ bool check_gradient(
            set_element(x[i], k, xp);
 
            {
-                struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
-                plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-                ggml_graph_compute(&plan, &gf);
+                struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
+                pf.work_data = work_buffer_resize(&buf, pf.work_size);
+                ggml_graph_compute(&gf, &pf);
            }
 
            const float f0 = ggml_get_f32_1d(f, 0);
@@ -290,9 +293,9 @@ bool check_gradient(
            set_element(x[i], k, xm);
 
            {
-                struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
-                plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-                ggml_graph_compute(&plan, &gf);
+                struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
+                pf.work_data = work_buffer_resize(&buf, pf.work_size);
+                ggml_graph_compute(&gf, &pf);
            }
 
            const float f1 = ggml_get_f32_1d(f, 0);
@@ -306,15 +309,15 @@ bool check_gradient(
            ggml_set_f32      (f->grad, 1.0f);
 
            {
-                struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads);
-                plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-                ggml_graph_compute(&plan, &gb);
+                struct ggml_cplan pf = ggml_graph_plan(&gb, n_threads);
+                pf.work_data = work_buffer_resize(&buf, pf.work_size);
+                ggml_graph_compute(&gf, &pf);
            }
 
            const float g1 = get_element(x[i]->grad, k);
 
            const float error_abs = fabsf(g0 - g1);
-            const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabs(g0) : 0;
+            const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
 
            if (error_abs > max_error_abs || error_rel > max_error_rel) {
                printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
@@ -325,8 +328,8 @@ bool check_gradient(
        }
    }
 
-    if (plan_buf.data) {
-        free(plan_buf.data);
+    if (buf.data) {
+        free(buf.data);
    }
 
    return true;
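Both test files keep their scratch memory in a heap-backed `work_buffer` so repeated graph evaluations do not reallocate. The hunks show only the head and tail of `work_buffer_resize()`; a plausible grow-only body consistent with how the tests use it (a hypothetical reconstruction, not the committed code) would be:

    static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) {
        if (size == 0) {
            return NULL;
        }

        // grow-only: keep the current allocation when it is already big enough
        if (buf->size < size) {
            free(buf->data);
            buf->data = (uint8_t *) malloc(size);
            GGML_ASSERT(buf->data);
            buf->size = size;
        }

        return buf->data;
    }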
@@ -7,6 +7,7 @@
 
 #define MAX_NARGS 2
 
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
 
 //
 // logging
@@ -33,7 +34,7 @@
 #define GGML_PRINT(...) printf(__VA_ARGS__)
 
 
-float frand() {
+float frand(void) {
     return (float)rand()/(float)RAND_MAX;
 }
 
@@ -115,12 +116,12 @@ void set_element(struct ggml_tensor * t, int idx, float value) {
 }
 
 
-struct compute_plan_buffer {
+struct work_buffer {
     size_t    size;
     uint8_t * data;
 };
 
-static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) {
+static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) {
     if (size == 0) {
         return NULL;
     }
@@ -139,7 +140,7 @@ static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) {
     return buf->data;
 }
 
-int main(int argc, const char ** argv) {
+int main(void) {
     struct ggml_init_params params = {
         .mem_size   = 1024*1024*1024,
         .mem_buffer = NULL,
@@ -166,11 +167,11 @@ int main(void) {
     struct ggml_cgraph ge = ggml_build_forward(e);
     ggml_graph_reset  (&ge);
 
-    struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL };
+    struct work_buffer buf = { /*.size = */ 0, /*.data =*/ NULL };
     {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1);
-        plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-        ggml_graph_compute(&plan, &ge);
+        struct ggml_cplan pe = ggml_graph_plan(&ge, /*n_threads*/ 1);
+        pe.work_data = work_buffer_resize(&buf, pe.work_size);
+        ggml_graph_compute(&ge, &pe);
     }
 
     const float fe = ggml_get_f32_1d(e, 0);
@@ -183,13 +184,13 @@ int main(void) {
     ggml_graph_reset  (&ge);
 
     {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1);
-        plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-        ggml_graph_compute(&plan, &ge);
+        struct ggml_cplan pe = ggml_graph_plan(&ge, /*n_threads*/ 1);
+        pe.work_data = work_buffer_resize(&buf, pe.work_size);
+        ggml_graph_compute(&ge, &pe);
     }
 
-    if (plan_buf.data) {
-        free(plan_buf.data);
+    if (buf.data) {
+        free(buf.data);
     }
 
     const float fe_opt = ggml_get_f32_1d(e, 0);