Compare commits: master ... gg/ggml-re

5 commits: f9968f661d, 119e0bc9ae, ee154457dd, 92a96865cd, c8a3f291fe

23 changed files with 413 additions and 299 deletions
@@ -17,17 +17,6 @@ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 constexpr float rms_norm_eps = 5e-6f;
 #endif
 
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
 static struct ggml_tensor * randomize_tensor(
         struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
 ) {

@@ -1514,8 +1503,6 @@ int main(int argc, char ** argv) {
     int n_tokens = model.hparams.n_ctx;
     int n_vocab  = model.hparams.n_vocab;
 
-    std::vector<uint8_t> work_buffer;
-
     for (int ex=0; ex<n_examples; ++ex) {
         struct ggml_init_params params = {
             /*.mem_size   =*/ compute_size,

@@ -1542,7 +1529,10 @@ int main(int argc, char ** argv) {
         struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
 
         ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_graph_prepare(gf, 1, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);
 
         float error_before_opt = ggml_get_f32_1d(e, 0);
 

@@ -1553,7 +1543,10 @@ int main(int argc, char ** argv) {
         ggml_opt(ctx0, opt_params_lbfgs, e);
         //
         ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_graph_prepare(gf, 1, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);
 
         float error_after_opt = ggml_get_f32_1d(e, 0);
 

@@ -1607,7 +1600,10 @@ int main(int argc, char ** argv) {
         struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);
 
         ggml_build_forward_expand(gf, logits);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_graph_prepare(gf, 1, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);
 
         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
         struct ggml_tensor * probs        = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
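Note: the hunks above replace the per-file ggml_graph_compute_helper() (which carried its own std::vector work buffer) with the new four-step API, where the work buffer is owned by the graph itself. A minimal sketch of an equivalent helper on top of the new API, assuming only the functions introduced in this branch (the wrapper name compute_graph is illustrative):

    // sketch: equivalent of the removed helper, built on the new API
    static enum ggml_status compute_graph(struct ggml_cgraph * gf, int n_threads) {
        // determine the work size and thread count for this graph
        enum ggml_status status = ggml_graph_prepare(gf, n_threads, /*threadpool =*/ NULL);
        if (status != GGML_STATUS_SUCCESS) {
            return status;
        }

        // NULL ctx: the work buffer is heap-allocated and owned by the graph
        status = ggml_graph_work_init(gf, /*ctx =*/ NULL);
        if (status != GGML_STATUS_SUCCESS) {
            return status;
        }

        status = ggml_graph_compute(gf);

        ggml_graph_work_free(gf); // release the graph-owned buffer

        return status;
    }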
@@ -20,17 +20,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
 static float tensor_sum_elements(const ggml_tensor * tensor) {
     double sum = 0;
     if (tensor->type == GGML_TYPE_F32) {

@@ -179,11 +168,10 @@ int main(int argc, char ** argv) {
     TENSOR_DUMP(m11);
     TENSOR_DUMP(m2);
 
-    std::vector<uint8_t> work_buffer;
+    ggml_graph_prepare(gf, benchmark_params.n_threads, nullptr);
+    ggml_graph_work_init(gf, nullptr);
 
-    ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
-
-    TENSOR_DUMP(gf->nodes[0]);
+    TENSOR_DUMP(ggml_graph_node(gf, 0));
 
     printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
 

@@ -224,7 +212,7 @@ int main(int argc, char ** argv) {
 
 
     // Let's use the F32 result from above as a reference for the quantized multiplication
-    float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
+    float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0));
 
     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
     printf("=====================================================================================\n");

@@ -234,7 +222,7 @@ int main(int argc, char ** argv) {
 
         long long int start = ggml_time_us();
         //printf("Running ggml_graph_compute\n");
-        ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
+        ggml_graph_compute(gf31);
 
         long long int stop = ggml_time_us();
         long long int usec = stop-start;

@@ -252,7 +240,7 @@ int main(int argc, char ** argv) {
 
         // Check that the matrix multiplication result is in the right ballpark
         // We cannot use the exact value from the F32 multiplication because the quantization will be slightly different
-        float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
+        float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0));
         float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
         float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
 

@@ -267,8 +255,11 @@ int main(int argc, char ** argv) {
         }
 
         // Running a different graph computation to make sure we override the CPU cache lines
-        ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
+        ggml_graph_compute(gf32);
     }
 
+    ggml_graph_work_free(gf);
+
     printf("\n");
     printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
     printf("=====================================================================================\n");
@@ -226,8 +226,8 @@ static ggml_status compute_piter(
         result.eigenvectors.resize(params.n_batch);
         result.distances.resize(params.n_batch);
         // get output nodes
-        for (int i = 0; i < gf->n_nodes; ++i) {
-            auto node = gf->nodes[i];
+        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+            auto node = ggml_graph_node(gf, i);
             int iter = -1;
             // find b_tensor (without copying data from device)
             if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
@@ -370,7 +370,7 @@ struct lora_merge_ctx {
 
         // write data to output file
         {
-            auto result = gf->nodes[gf->n_nodes - 1];
+            auto * result = ggml_graph_node(gf, -1);
             size_t len = ggml_nbytes(result);
             if (read_buf.size() < len) {
                 read_buf.resize(len);
@@ -2449,7 +2449,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     ggml_backend_graph_compute(ctx->backend, gf);
 
     // the last node is the embedding tensor
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
 
     // copy the embeddings to the location passed by the user
     ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
@@ -183,8 +183,10 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
     ggml_build_forward_expand(gf, flatten);
-    ggml_graph_compute_with_ctx(model.ctx, gf, 1);
-    struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
+    ggml_graph_prepare(gf, 1, nullptr);
+    ggml_graph_work_init(gf, model.ctx);
+    ggml_graph_compute(gf);
+    struct ggml_tensor* result = ggml_graph_node(gf, -1);
 
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
@@ -358,6 +358,7 @@ extern "C" {
 
     struct ggml_object;
     struct ggml_context;
+    struct ggml_cgraph;
 
     // NOTE: always add types at the end of the enum to keep backward compatibility
    enum ggml_type {

@@ -575,20 +576,6 @@ extern "C" {
         GGML_TENSOR_FLAG_PARAM = 4,
     };
 
-    // ggml object
-    struct ggml_object {
-        size_t offs;
-        size_t size;
-
-        struct ggml_object * next;
-
-        enum ggml_object_type type;
-
-        char padding[4];
-    };
-
-    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
     // n-dimensional tensor
     struct ggml_tensor {
         enum ggml_type type;

@@ -657,49 +644,6 @@ extern "C" {
 
     typedef struct ggml_threadpool * ggml_threadpool_t;
 
-    // the compute plan that needs to be prepared for ggml_graph_compute()
-    // since https://github.com/ggerganov/ggml/issues/287
-    struct ggml_cplan {
-        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
-        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
-
-        int n_threads;
-        struct ggml_threadpool * threadpool;
-
-        // abort ggml_graph_compute when true
-        ggml_abort_callback abort_callback;
-        void *              abort_callback_data;
-    };
-
-    enum ggml_cgraph_eval_order {
-        GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
-        GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
-        GGML_CGRAPH_EVAL_ORDER_COUNT
-    };
-
-    typedef uint32_t ggml_bitset_t;
-
-    struct ggml_hash_set {
-        size_t size;
-        ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
-        struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
-    };
-
-    // computation graph
-    struct ggml_cgraph {
-        int size;
-        int n_nodes;
-        int n_leafs;
-
-        struct ggml_tensor ** nodes;
-        struct ggml_tensor ** grads;
-        struct ggml_tensor ** leafs;
-
-        struct ggml_hash_set visited_hash_set;
-
-        enum ggml_cgraph_eval_order order;
-    };
-
     // scratch buffer
     struct ggml_scratch {
         size_t offs;

@@ -2017,8 +1961,6 @@ extern "C" {
     typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
     typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
 
-    #define GGML_N_TASKS_MAX -1
-
     GGML_API struct ggml_tensor * ggml_map_custom1(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,

@@ -2088,22 +2030,28 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * tensor);
 
 
     GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
     GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
 
-    // graph allocation in a context
     GGML_API struct ggml_cgraph * ggml_new_graph       (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
     GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
     GGML_API struct ggml_cgraph * ggml_graph_dup       (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API struct ggml_cgraph   ggml_graph_view      (struct ggml_cgraph * cgraph, int i0, int i1);
     GGML_API void                 ggml_graph_cpy       (struct ggml_cgraph * src, struct ggml_cgraph * dst);
     GGML_API void                 ggml_graph_reset     (struct ggml_cgraph * cgraph); // zero grads
     GGML_API void                 ggml_graph_clear     (struct ggml_cgraph * cgraph);
 
+    GGML_API int                   ggml_graph_size   (struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_tensor *  ggml_graph_node   (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
+    GGML_API struct ggml_tensor ** ggml_graph_nodes  (struct ggml_cgraph * cgraph);
+    GGML_API int                   ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
+
+    GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+
     GGML_API size_t ggml_graph_overhead(void);
     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
 
+    // TODO: move these declarations above before the ggml_graph API and reorder the implementation order in ggml.c
+    //       (unless the code has been moved to a separate source file)
     GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
     GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
     GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
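Note: with struct ggml_cgraph moving out of the public header (its definition reappears in ggml-impl.h further down), direct accesses like gf->nodes[gf->n_nodes - 1] no longer compile against the public API; the accessors declared above are the replacement. A short usage sketch (illustrative only, not part of the diff):

    #include <stdio.h>
    #include "ggml.h"

    // print all nodes of an already-built graph via the new accessors
    static void print_graph_nodes(struct ggml_cgraph * gf) {
        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
            printf("node %3d: %s\n", i, ggml_graph_node(gf, i)->name);
        }

        // negative indices count from the end: ggml_graph_node(gf, -1)
        // is the former gf->nodes[gf->n_nodes - 1]
        printf("output node: %s\n", ggml_graph_node(gf, -1)->name);
    }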
@@ -2113,17 +2061,61 @@ extern "C" {
     GGML_API void ggml_threadpool_pause  (struct ggml_threadpool * threadpool);
     GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
 
-    // ggml_graph_plan() has to be called before ggml_graph_compute()
-    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan(
-            const struct ggml_cgraph * cgraph,
+    // =================================================================================================
+    // CPU-only API for ggml_cgraph
+    //
+    // TODO: move to the CPU backend
+    // NOTE: avoid using, will be removed
+    //
+
+    // loops through the graph and determines:
+    //
+    //   - work size needed for CPU computation
+    //   - number of threads to start
+    //
+    GGML_API enum ggml_status ggml_graph_prepare(
+            struct ggml_cgraph * cgraph,
             int   n_threads, /* = GGML_DEFAULT_N_THREADS */
             struct ggml_threadpool * threadpool /* = NULL */ );
-    GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
 
-    // same as ggml_graph_compute() but the work data is allocated as a part of the context
-    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+    // get the estimated work size for the graph from ggml_graph_prepare()
+    GGML_API size_t ggml_graph_work_size(const struct ggml_cgraph * cgraph);
+
+    // if ctx is NULL, the work buffer will be dynamically allocated. in this case, call ggml_graph_work_free() to free the buffer
+    // otherwise, the work buffer will be allocated in the context. no need to free it
+    GGML_API enum ggml_status ggml_graph_work_init(struct ggml_cgraph * cgraph, struct ggml_context * ctx);
+    GGML_API void             ggml_graph_work_free(struct ggml_cgraph * cgraph);
+
+    // note: call ggml_graph_prepare() and ggml_graph_work_init() first
+    //
+    // sample usages:
+    //
+    //   - no dynamic allocations:
+    //
+    //     ... prepare ggml_context ctx ...
+    //
+    //     ggml_graph_prepare  (cgraph, n_threads, threadpool);
+    //     ggml_graph_work_init(cgraph, ctx);
+    //
+    //     ggml_graph_compute  (cgraph); // can call many times
+    //
+    //     // no need to call ggml_graph_work_free() because it is allocated in ctx
+    //
+    //   - dynamic allocations:
+    //
+    //     ggml_graph_prepare  (cgraph, n_threads, threadpool);
+    //     ggml_graph_work_init(cgraph, NULL); // will allocate memory
+    //
+    //     ggml_graph_compute  (cgraph); // can call many times
+    //
+    //     ggml_graph_work_free(cgraph);
+    //
+    GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph);
+
+    // end of CPU-only API
+    // =================================================================================================
+
+    GGML_API void ggml_graph_set_abort_callback(struct ggml_cgraph * cgraph, ggml_abort_callback abort_callback, void * abort_data);
 
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
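Note: the "no dynamic allocations" sample usage documented in the comments above expands into compilable form roughly as follows (a sketch; error handling abbreviated, the wrapper name is illustrative):

    #include "ggml.h"

    // the work buffer is carved out of ctx, so ctx must have at least
    // ggml_graph_work_size(cgraph) bytes available after preparation
    static enum ggml_status run_graph_in_ctx(struct ggml_cgraph * cgraph,
                                             struct ggml_context * ctx,
                                             int n_threads) {
        enum ggml_status status = ggml_graph_prepare(cgraph, n_threads, /*threadpool =*/ NULL);
        if (status != GGML_STATUS_SUCCESS) {
            return status;
        }

        status = ggml_graph_work_init(cgraph, ctx);
        if (status != GGML_STATUS_SUCCESS) {
            return status;
        }

        // can be called many times with the same work buffer;
        // no ggml_graph_work_free() needed, the buffer lives in ctx
        return ggml_graph_compute(cgraph);
    }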
@@ -2146,6 +2138,7 @@ extern "C" {
             struct ggml_cgraph  * gb_tmp,
             struct ggml_tensor  * * checkpoints,
             int n_checkpoints);
+
     //
     // optimization
     //
@@ -752,7 +752,8 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_
 }
 
 struct ggml_backend_plan_cpu {
-    struct ggml_cplan cplan;
+    // TODO: move member from ggml_cgraph here when the public CPU-only API is removed
+
     struct ggml_cgraph cgraph;
 };
 

@@ -761,19 +762,19 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
 
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
 
-    cpu_plan->cplan  = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
     cpu_plan->cgraph = *cgraph; // FIXME: deep copy
+    ggml_graph_prepare(&cpu_plan->cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
-    if (cpu_plan->cplan.work_size > 0) {
-        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
-        if (cpu_plan->cplan.work_data == NULL) {
+    if (cpu_plan->cgraph.work_size > 0) {
+        cpu_plan->cgraph.work_data = malloc(cpu_plan->cgraph.work_size);
+        if (cpu_plan->cgraph.work_data == NULL) {
             free(cpu_plan);
             return NULL;
         }
     }
 
-    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
-    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+    cpu_plan->cgraph.abort_callback      = cpu_ctx->abort_callback;
+    cpu_plan->cgraph.abort_callback_data = cpu_ctx->abort_callback_data;
 
     return cpu_plan;
 }

@@ -781,7 +782,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
 GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
-    free(cpu_plan->cplan.work_data);
+    free(cpu_plan->cgraph.work_data);
     free(cpu_plan);
 
     GGML_UNUSED(backend);

@@ -790,7 +791,7 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
 GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
-    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+    return ggml_graph_compute(&cpu_plan->cgraph);
 
     GGML_UNUSED(backend);
 }

@@ -798,23 +799,24 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
 GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
+    ggml_graph_prepare(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
-    if (cpu_ctx->work_size < cplan.work_size) {
+    if (cpu_ctx->work_size < cgraph->work_size) {
         free(cpu_ctx->work_data);
-        cpu_ctx->work_data = malloc(cplan.work_size);
+        cpu_ctx->work_data = malloc(cgraph->work_size);
         if (cpu_ctx->work_data == NULL) {
             cpu_ctx->work_size = 0;
             return GGML_STATUS_ALLOC_FAILED;
         }
-        cpu_ctx->work_size = cplan.work_size;
+        cpu_ctx->work_size = cgraph->work_size;
     }
-    cplan.work_data = cpu_ctx->work_data;
+    cgraph->work_data = cpu_ctx->work_data;
+    cgraph->work_own  = false; // always freed by ggml_backend_cpu_graph_plan_free
 
-    cplan.abort_callback      = cpu_ctx->abort_callback;
-    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+    cgraph->abort_callback      = cpu_ctx->abort_callback;
+    cgraph->abort_callback_data = cpu_ctx->abort_callback_data;
 
-    return ggml_graph_compute(cgraph, &cplan);
+    return ggml_graph_compute(cgraph);
 }
 
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -1,3 +1,4 @@
+#include "ggml-impl.h"
 #include "ggml-blas.h"
 #include "ggml-backend-impl.h"
 
@@ -30,6 +30,7 @@
 #include <cstring>
 #include <mutex>
 
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 #include "ggml-cann/aclnn_ops.h"
 #include "ggml-cann/common.h"
@@ -1,5 +1,5 @@
 #include "ggml-cuda.h"
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
 #include "ggml-cuda/common.cuh"
@@ -629,8 +629,16 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #endif
 
+enum ggml_cgraph_eval_order {
+    GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+    GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+    GGML_CGRAPH_EVAL_ORDER_COUNT
+};
+
 // bitset
 
+typedef uint32_t ggml_bitset_t;
+
 static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
 #define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
 #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)

@@ -656,6 +664,12 @@ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
 #define GGML_HASHSET_FULL ((size_t)-1)
 #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
 
+struct ggml_hash_set {
+    size_t size;
+    ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
+    struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
+};
+
 struct ggml_hash_set ggml_hash_set_new(size_t size);
 void                 ggml_hash_set_free(struct ggml_hash_set * hash_set);
 

@@ -745,6 +759,37 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
     GGML_ABORT("fatal error");
 }
 
+// computation graph
+
+struct ggml_cgraph {
+    int size;
+    int n_nodes;
+    int n_leafs;
+
+    struct ggml_tensor ** nodes;
+    struct ggml_tensor ** grads;
+    struct ggml_tensor ** leafs;
+
+    struct ggml_hash_set visited_hash_set;
+
+    enum ggml_cgraph_eval_order order;
+
+    // TODO: after the CPU-only API is removed, we can move the members below to ggml_backend_plan_cpu
+
+    bool      work_own;
+    size_t    work_size; // size of work buffer, calculated by `ggml_graph_prepare()`
+    uint8_t * work_data; // work buffer, allocated by `ggml_graph_work_init()`
+
+    int n_threads;
+    struct ggml_threadpool * threadpool;
+
+    // abort ggml_graph_compute when true
+    ggml_abort_callback abort_callback;
+    void *              abort_callback_data;
+};
+
+struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
+
 #ifdef __cplusplus
 }
 #endif
@@ -1,4 +1,4 @@
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend.h"
 #include "ggml-backend-impl.h"
 #include "ggml-kompute.h"
@@ -1,7 +1,7 @@
 #import "ggml-metal.h"
 
+#import "ggml-impl.h"
 #import "ggml-backend-impl.h"
-#import "ggml.h"
 
 #import <Foundation/Foundation.h>
 
@@ -1,5 +1,5 @@
 #include "ggml-rpc.h"
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
 #include <cinttypes>
@@ -33,7 +33,7 @@
 #include <sycl/half_type.hpp>
 
 #include "ggml-sycl.h"
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
 #include "ggml-sycl/backend.hpp"
@@ -21,7 +21,7 @@
 #include <memory>
 #include <mutex>
 
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
 #include "ggml-vulkan-shaders.hpp"
ggml/src/ggml.c (199 changed lines)
@@ -287,6 +287,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16
+#define GGML_N_TASKS_MAX (-1)
 
 #define GGML_SOFT_MAX_UNROLL 4
 #define GGML_VEC_DOT_UNROLL  2

@@ -1124,17 +1125,17 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 {                                                 \
     int offset = GGML_F32_ARR >> 1;               \
     for (int i = 0; i < offset; ++i) {            \
-        x[i] = vaddq_f32(x[i], x[offset+i]);      \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
     }                                             \
     offset >>= 1;                                 \
     for (int i = 0; i < offset; ++i) {            \
-        x[i] = vaddq_f32(x[i], x[offset+i]);      \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
     }                                             \
     offset >>= 1;                                 \
     for (int i = 0; i < offset; ++i) {            \
-        x[i] = vaddq_f32(x[i], x[offset+i]);      \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
     }                                             \
-    res = GGML_F32x4_REDUCE_ONE(x[0]);            \
+    (res) = GGML_F32x4_REDUCE_ONE((x)[0]);        \
 }
 
 #define GGML_F32_VEC GGML_F32x4

@@ -1165,26 +1166,26 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 do {                                                          \
     int offset = GGML_F16_ARR >> 1;                           \
     for (int i = 0; i < offset; ++i) {                        \
-        x[i] = vaddq_f16(x[i], x[offset+i]);                  \
+        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);            \
     }                                                         \
     offset >>= 1;                                             \
     for (int i = 0; i < offset; ++i) {                        \
-        x[i] = vaddq_f16(x[i], x[offset+i]);                  \
+        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);            \
     }                                                         \
     offset >>= 1;                                             \
     for (int i = 0; i < offset; ++i) {                        \
-        x[i] = vaddq_f16(x[i], x[offset+i]);                  \
+        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);            \
     }                                                         \
-    const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
-    const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
-    res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
+    const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
+    const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
+    (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));       \
 } while (0)
 
 #define GGML_F16_VEC                GGML_F16x8
 #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
 #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
 #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i])
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
 #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
 #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
 #define GGML_F16_VEC_MUL            GGML_F16x8_MUL

@@ -1893,6 +1894,23 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
 #endif
 
+//
+// ggml object
+//
+
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    enum ggml_object_type type;
+
+    char padding[4];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
 //
 // ggml context
 //

@@ -1983,7 +2001,6 @@ struct ggml_threadpool {
     ggml_cond_t cond; // cond.var for waiting for new work
 
     struct ggml_cgraph * cgraph;
-    struct ggml_cplan  * cplan;
 
     // synchronization primitives
     atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
@@ -19077,8 +19094,15 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
         /*.nodes            =*/ nodes_ptr,
         /*.grads            =*/ grads_ptr,
         /*.leafs            =*/ leafs_ptr,
-        /*.hash_table       =*/ { hash_size, hash_used, hash_keys_ptr },
+        /*.visited_hash_set =*/ { hash_size, hash_used, hash_keys_ptr },
         /*.order            =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
+        /*.work_own         =*/ false,
+        /*.work_size        =*/ 0,
+        /*.work_data        =*/ NULL,
+        /*.n_threads        =*/ GGML_DEFAULT_N_THREADS,
+        /*.threadpool       =*/ NULL,
+        /*.abort_callback      =*/ NULL,
+        /*.abort_callback_data =*/ NULL,
     };
 
     ggml_hash_set_reset(&cgraph->visited_hash_set);

@@ -19100,6 +19124,13 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
         /*.leafs      =*/ NULL,
         /*.hash_table =*/ { 0, NULL, NULL },
         /*.order      =*/ cgraph0->order,
+        /*.work_own   =*/ false,
+        /*.work_size  =*/ 0,
+        /*.work_data  =*/ NULL,
+        /*.n_threads  =*/ GGML_DEFAULT_N_THREADS,
+        /*.threadpool =*/ NULL,
+        /*.abort_callback      =*/ NULL,
+        /*.abort_callback_data =*/ NULL,
     };
 
     return cgraph;

@@ -19161,6 +19192,33 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
     ggml_hash_set_reset(&cgraph->visited_hash_set);
 }
 
+int ggml_graph_size(struct ggml_cgraph * cgraph) {
+    return cgraph->size;
+}
+
+struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
+    if (i < 0) {
+        GGML_ASSERT(cgraph->n_nodes + i >= 0);
+        return cgraph->nodes[cgraph->n_nodes + i];
+    }
+
+    GGML_ASSERT(i < cgraph->n_nodes);
+    return cgraph->nodes[i];
+}
+
+struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) {
+    return cgraph->nodes;
+}
+
+int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
+    return cgraph->n_nodes;
+}
+
+void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
+    cgraph->nodes[cgraph->n_nodes] = tensor;
+    cgraph->n_nodes++;
+}
+
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__gnu_linux__)
 static void set_numa_thread_affinity(int thread_n) {
@@ -19708,11 +19766,10 @@ void ggml_threadpool_resume(struct ggml_threadpool * threadpool) {
 #endif
 }
 
-struct ggml_cplan ggml_graph_plan(
-    const struct ggml_cgraph * cgraph,
+enum ggml_status ggml_graph_prepare(
+    struct ggml_cgraph * cgraph,
     int n_threads,
     struct ggml_threadpool * threadpool) {
 
     if (threadpool == NULL) {
         GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
     }

@@ -19722,9 +19779,6 @@ struct ggml_cplan ggml_graph_plan(
 
     size_t work_size = 0;
 
-    struct ggml_cplan cplan;
-    memset(&cplan, 0, sizeof(struct ggml_cplan));
-
     int max_tasks = 1;
 
     // thread scheduling for the different operations + work buffer size estimation

@@ -19876,27 +19930,62 @@ struct ggml_cplan ggml_graph_plan(
         work_size += CACHE_LINE_SIZE*(n_threads);
     }
 
-    cplan.threadpool = threadpool;
-    cplan.n_threads  = MIN(max_tasks, n_threads);
-    cplan.work_size  = work_size;
-    cplan.work_data  = NULL;
+    cgraph->threadpool = threadpool;
+    cgraph->n_threads  = MIN(max_tasks, n_threads);
+    cgraph->work_size  = work_size;
 
-    return cplan;
+    ggml_graph_work_free(cgraph);
+
+    return GGML_STATUS_SUCCESS;
+}
+
+size_t ggml_graph_work_size(const struct ggml_cgraph * cgraph) {
+    return cgraph->work_size;
+}
+
+enum ggml_status ggml_graph_work_init(struct ggml_cgraph * cgraph, struct ggml_context * ctx) {
+    GGML_ASSERT(cgraph->n_threads > 0 && "call ggml_graph_prepare first");
+
+    ggml_graph_work_free(cgraph);
+
+    if (cgraph->work_size > 0) {
+        if (ctx == NULL) {
+            cgraph->work_data = GGML_ALIGNED_MALLOC(cgraph->work_size);
+            if (cgraph->work_data == NULL) {
+                return GGML_STATUS_ALLOC_FAILED;
+            }
+
+            cgraph->work_own = true;
+        } else {
+            struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cgraph->work_size);
+
+            cgraph->work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
+            cgraph->work_own  = false;
+        }
+    }
+
+    return GGML_STATUS_SUCCESS;
+}
+
+void ggml_graph_work_free(struct ggml_cgraph * cgraph) {
+    if (cgraph->work_data && cgraph->work_own) {
+        GGML_ALIGNED_FREE(cgraph->work_data);
+        cgraph->work_data = NULL;
+    }
 }
 
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
 
     const struct ggml_cgraph * cgraph = state->threadpool->cgraph;
-    const struct ggml_cplan  * cplan  = state->threadpool->cplan;
 
     set_numa_thread_affinity(state->ith);
 
     struct ggml_compute_params params = {
         /*.ith        =*/ state->ith,
         /*.nth        =*/ state->threadpool->n_threads_cur,
-        /*.wsize      =*/ cplan->work_size,
-        /*.wdata      =*/ cplan->work_data,
+        /*.wsize      =*/ cgraph->work_size,
+        /*.wdata      =*/ cgraph->work_data,
         /*.threadpool =*/ state->threadpool,
     };
 
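Note: the implementation above gives the graph two work-buffer ownership modes, tracked by the new work_own flag. A sketch of both, assuming gf is an already-built graph (illustrative, not part of the diff):

    #include "ggml.h"

    static void demo_work_buffer_ownership(struct ggml_cgraph * gf, struct ggml_context * ctx) {
        ggml_graph_prepare(gf, /*n_threads =*/ 4, /*threadpool =*/ NULL);

        // graph-owned buffer (work_own == true): allocated with
        // GGML_ALIGNED_MALLOC, released by ggml_graph_work_free()
        ggml_graph_work_init(gf, NULL);
        ggml_graph_compute(gf);
        ggml_graph_work_free(gf);

        // context-owned buffer (work_own == false): lives inside ctx,
        // so ggml_graph_work_free() leaves the data alone
        ggml_graph_work_init(gf, ctx);
        ggml_graph_compute(gf);
    }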
@@ -19905,7 +19994,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         ggml_compute_forward(&params, node);
 
-        if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
+        if (state->ith == 0 && cgraph->abort_callback && cgraph->abort_callback(cgraph->abort_callback_data)) {
             state->threadpool->ec = GGML_STATUS_ABORTED;
         }
 

@@ -20059,14 +20148,12 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons
 
 static struct ggml_threadpool * ggml_threadpool_new_impl(
     struct ggml_threadpool_params * tpp,
-    struct ggml_cgraph * cgraph,
-    struct ggml_cplan  * cplan) {
+    struct ggml_cgraph * cgraph) {
 
     struct ggml_threadpool * threadpool =
         GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
     {
         threadpool->cgraph           = cgraph;
-        threadpool->cplan            = cplan;
         threadpool->n_graph          = 0;
         threadpool->n_barrier        = 0;
         threadpool->n_barrier_passed = 0;

@@ -20124,16 +20211,15 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
 }
 
 struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) {
-    return ggml_threadpool_new_impl(tpp, NULL, NULL);
+    return ggml_threadpool_new_impl(tpp, NULL);
 }
 
-enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
-    GGML_ASSERT(cplan);
-    GGML_ASSERT(cplan->n_threads > 0);
-    GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
+enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph) {
+    GGML_ASSERT((cgraph->n_threads > 0) && "call ggml_graph_prepare first");
+    GGML_ASSERT((cgraph->work_size == 0 || cgraph->work_data != NULL) && "call ggml_graph_work_init first");
 
-    int n_threads                       = cplan->n_threads;
-    struct ggml_threadpool * threadpool = cplan->threadpool;
+    int n_threads                       = cgraph->n_threads;
+    struct ggml_threadpool * threadpool = cgraph->threadpool;
 
     bool disposable_threadpool = false;
 

@@ -20142,19 +20228,18 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         disposable_threadpool = true;
 
         struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads);
-        threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan);
+        threadpool = ggml_threadpool_new_impl(&ttp, cgraph);
     } else {
         // Reset some of the parameters that need resetting
         // No worker threads should be accessing the parameters below at this stage
         threadpool->cgraph        = cgraph;
-        threadpool->cplan         = cplan;
         threadpool->n_threads_cur = n_threads;
         threadpool->current_chunk = 0;
         threadpool->ec            = GGML_STATUS_SUCCESS;
     }
 
     if (n_threads > threadpool->n_threads_max) {
-        GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n");
+        GGML_PRINT("WARNING: cgraph is requesting more threads than the threadpool contains. Expect a bad time!\n");
     }
 
 #ifdef GGML_USE_OPENMP

@@ -20193,14 +20278,9 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     return ret;
 }
 
-enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);
-
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
-
-    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
-
-    return ggml_graph_compute(cgraph, &cplan);
+void ggml_graph_set_abort_callback(struct ggml_cgraph * cgraph, ggml_abort_callback abort_callback, void * abort_data) {
+    cgraph->abort_callback      = abort_callback;
+    cgraph->abort_callback_data = abort_data;
 }
 
 struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
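Note: ggml_graph_compute_with_ctx() is gone; its context-allocated work buffer is covered by ggml_graph_work_init(cgraph, ctx), and the abort callback that previously traveled in ggml_cplan is now installed with the setter above. A sketch of aborting a long computation after a deadline (the callback and wrapper names are illustrative):

    #include <stdbool.h>
    #include <stdint.h>
    #include "ggml.h"

    // returning true makes ggml_graph_compute() stop with GGML_STATUS_ABORTED
    static bool abort_after_deadline(void * data) {
        const int64_t deadline_us = *(const int64_t *) data;
        return ggml_time_us() > deadline_us;
    }

    static enum ggml_status compute_with_deadline(struct ggml_cgraph * gf, int64_t deadline_us) {
        // gf must already be prepared and have its work buffer initialized
        ggml_graph_set_abort_callback(gf, abort_after_deadline, &deadline_us);
        return ggml_graph_compute(gf); // may return GGML_STATUS_ABORTED
    }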
@ -21010,9 +21090,8 @@ static enum ggml_opt_result ggml_opt_adam(
|
||||||
|
|
||||||
float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
|
float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
|
||||||
|
|
||||||
struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL);
|
ggml_graph_prepare (gb, params.n_threads, NULL);
|
||||||
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
|
ggml_graph_work_init(gb, ctx);
|
||||||
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
|
|
||||||
|
|
||||||
bool cancel = false;
|
bool cancel = false;
|
||||||
|
|
||||||
|
@ -21028,7 +21107,7 @@ static enum ggml_opt_result ggml_opt_adam(
|
||||||
}
|
}
|
||||||
// ggml_graph_reset (gf);
|
// ggml_graph_reset (gf);
|
||||||
ggml_set_f32 (f->grad, 1.0f);
|
ggml_set_f32 (f->grad, 1.0f);
|
||||||
ggml_graph_compute(gb, &cplan);
|
ggml_graph_compute(gb);
|
||||||
ggml_opt_acc_grad(np, ps, g, accum_norm);
|
ggml_opt_acc_grad(np, ps, g, accum_norm);
|
||||||
fx += ggml_get_f32_1d(f, 0);
|
fx += ggml_get_f32_1d(f, 0);
|
||||||
}
|
}
|
||||||
|
@ -21119,7 +21198,7 @@ static enum ggml_opt_result ggml_opt_adam(
|
||||||
}
|
}
|
||||||
// ggml_graph_reset (gf);
|
// ggml_graph_reset (gf);
|
||||||
ggml_set_f32 (f->grad, 1.0f);
|
ggml_set_f32 (f->grad, 1.0f);
|
||||||
ggml_graph_compute(gb, &cplan);
|
ggml_graph_compute(gb);
|
||||||
ggml_opt_acc_grad(np, ps, g, accum_norm);
|
ggml_opt_acc_grad(np, ps, g, accum_norm);
|
||||||
fx += ggml_get_f32_1d(f, 0);
|
fx += ggml_get_f32_1d(f, 0);
|
||||||
}
|
}
|
||||||
|
@ -21204,7 +21283,6 @@ static enum ggml_opt_result linesearch_backtracking(
|
||||||
const float * xp,
|
const float * xp,
|
||||||
struct ggml_tensor * f,
|
struct ggml_tensor * f,
|
||||||
struct ggml_cgraph * gb,
|
struct ggml_cgraph * gb,
|
||||||
struct ggml_cplan * cplan,
|
|
||||||
const int np,
|
const int np,
|
||||||
struct ggml_tensor * ps[],
|
struct ggml_tensor * ps[],
|
||||||
bool * cancel,
|
bool * cancel,
|
||||||
|
@ -21261,7 +21339,7 @@ static enum ggml_opt_result linesearch_backtracking(
|
||||||
}
|
}
|
||||||
// ggml_graph_reset (gf);
|
// ggml_graph_reset (gf);
|
||||||
ggml_set_f32 (f->grad, 1.0f);
|
ggml_set_f32 (f->grad, 1.0f);
|
||||||
ggml_graph_compute(gb, cplan);
|
ggml_graph_compute(gb);
|
||||||
ggml_opt_acc_grad(np, ps, g, accum_norm);
|
ggml_opt_acc_grad(np, ps, g, accum_norm);
|
||||||
*fx += ggml_get_f32_1d(f, 0);
|
*fx += ggml_get_f32_1d(f, 0);
|
||||||
}
|
}
|
||||||
|
@ -21357,9 +21435,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
||||||
opt->iter = iter;
|
opt->iter = iter;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL);
|
ggml_graph_prepare (gb, params.n_threads, NULL);
|
||||||
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
|
ggml_graph_work_init(gb, ctx);
|
||||||
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
|
|
||||||
|
|
||||||
float * x = opt->lbfgs.x->data; // current parameters
|
float * x = opt->lbfgs.x->data; // current parameters
|
||||||
float * xp = opt->lbfgs.xp->data; // previous parameters
|
float * xp = opt->lbfgs.xp->data; // previous parameters
|
@@ -21404,7 +21481,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
             }
             // ggml_graph_reset (gf);
             ggml_set_f32 (f->grad, 1.0f);
-            ggml_graph_compute(gb, &cplan);
+            ggml_graph_compute(gb);
             ggml_opt_acc_grad(np, ps, g, accum_norm);
             fx += ggml_get_f32_1d(f, 0);
         }
@@ -21470,7 +21547,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         // to determine if the optimization should be cancelled
         // this is a simple change, but not doing this atm, since I don't have a nice
         // way to test and don't want to break something with so many changes lined up
-        ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
+        ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, np, ps, &cancel, callback, callback_data);
         if (cancel) {
             return GGML_OPT_RESULT_CANCEL;
         }
@@ -9877,8 +9877,8 @@ struct llm_build_context {
     struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
         // find result_norm tensor for input
         struct ggml_tensor * inp = nullptr;
-        for (int i = gf->n_nodes - 1; i >= 0; --i) {
-            inp = gf->nodes[i];
+        for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+            inp = ggml_graph_node(gf, i);
             if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
                 break;
             } else {
@@ -16205,8 +16205,8 @@ static int llama_decode_internal(
         ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);

         // the output is always the last tensor in the graph
-        struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-        struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
+        struct ggml_tensor * res = ggml_graph_node(gf, -1);
+        struct ggml_tensor * embd = ggml_graph_node(gf, -2);

         if (lctx.n_outputs == 0) {
             // no output
@@ -16215,9 +16215,9 @@ static int llama_decode_internal(
         } else if (cparams.embeddings) {
             res = nullptr; // do not extract logits for embedding case
             embd = nullptr;
-            for (int i = gf->n_nodes - 1; i >= 0; --i) {
-                if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
-                    embd = gf->nodes[i];
+            for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+                if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
+                    embd = ggml_graph_node(gf, i);
                     break;
                 }
             }
@@ -16432,15 +16432,15 @@ static int llama_encode_internal(
         // there are two cases here
         if (llama_model_has_decoder(&lctx.model)) {
             // first case is an encoder-decoder T5 model where embeddings are passed to decoder
-            embd = gf->nodes[gf->n_nodes - 1];
+            embd = ggml_graph_node(gf, -1);
             GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
         } else {
             // second case is an encoder-only T5 model
             if (cparams.embeddings) {
                 // only output embeddings if required
-                embd = gf->nodes[gf->n_nodes - 1];
+                embd = ggml_graph_node(gf, -1);
                 if (strcmp(embd->name, "result_embd_pooled") != 0) {
-                    embd = gf->nodes[gf->n_nodes - 2];
+                    embd = ggml_graph_node(gf, -2);
                 }
                 GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
             }
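The llama.cpp hunks stop reaching into gf->nodes / gf->n_nodes and go through accessors instead; judging by the res/embd lines above, ggml_graph_node also accepts negative indices counted from the end of the graph. A hypothetical helper (not part of the diff) showing the same idiom as the result_embd_pooled search:

    #include <cstring>
    #include "ggml.h"

    // hypothetical helper: scan the graph back to front for a node by name,
    // mirroring the result_embd_pooled loop in llama_decode_internal
    static struct ggml_tensor * graph_find_node(struct ggml_cgraph * gf, const char * name) {
        for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
            struct ggml_tensor * t = ggml_graph_node(gf, i);
            if (strcmp(t->name, name) == 0) {
                return t;
            }
        }
        return nullptr; // not found
    }

    // usage in the spirit of the hunks above:
    //   struct ggml_tensor * res  = ggml_graph_node(gf, -1);  // last node
    //   struct ggml_tensor * embd = graph_find_node(gf, "result_embd_pooled");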
@@ -18488,7 +18488,7 @@ struct llama_context * llama_new_context_with_model(

             // note: the number of splits during measure is higher than during inference due to the kv shift
             int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-            LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
+            LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, ggml_graph_n_nodes(gf));
             LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
         }
     }
@@ -519,7 +519,7 @@ struct test_case {

         // add sentinels as graph nodes so that they are checked in the callback
         for (ggml_tensor * sentinel : sentinels) {
-            gf->nodes[gf->n_nodes++] = sentinel;
+            ggml_graph_add_node(gf, sentinel);
         }

         // randomize tensors
@@ -679,9 +679,9 @@ struct test_case {

         // duplicate the op
         size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
-        int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
+        int n_runs = std::min((size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
         for (int i = 1; i < n_runs; i++) {
-            gf->nodes[gf->n_nodes++] = out;
+            ggml_graph_add_node(gf, out);
         }

         // calculate memory
|
@ -696,11 +696,11 @@ struct test_case {
|
||||||
}
|
}
|
||||||
return size;
|
return size;
|
||||||
};
|
};
|
||||||
for (int i = 0; i < gf->n_nodes; i++) {
|
for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
|
||||||
if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
|
if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
mem += tensor_op_size(gf->nodes[i]);
|
mem += tensor_op_size(ggml_graph_node(gf, i));
|
||||||
}
|
}
|
||||||
|
|
||||||
// run
|
// run
|
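test-backend-ops used to append by writing gf->nodes[gf->n_nodes++] directly; ggml_graph_add_node hides that, and ggml_graph_size exposes the node capacity used in the headroom computation above. A self-contained sketch of the duplication idiom under those assumptions, with op_bytes standing in for the test's op_size(out):

    #include <algorithm>
    #include <cstddef>
    #include "ggml.h"

    // sketch: append `out` repeatedly without overflowing the graph's capacity;
    // `op_bytes` is a stand-in for the test's op_size(out) and is assumed > 0
    static void duplicate_op(struct ggml_cgraph * gf, struct ggml_tensor * out, size_t op_bytes) {
        const size_t target_size = 1ULL << 33; // ~8 GB of traffic, as in the CPU case above
        const size_t headroom    = (size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf); // free node slots
        const int    n_runs      = (int) (std::min(headroom, target_size / op_bytes) + 1);

        for (int i = 1; i < n_runs; i++) {
            ggml_graph_add_node(gf, out); // replaces gf->nodes[gf->n_nodes++] = out
        }
    }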
@@ -804,7 +804,7 @@ struct test_case {
         ggml_graph_cpy(gf, gb);
         ggml_build_backward_expand(ctx, gf, gb, false);
         if (expect.size() != 1 || expect[0] != 0.0f) {
-            GGML_ASSERT(gb->n_nodes > gf->n_nodes);
+            GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
             for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
                 GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE);
             }
@@ -242,12 +242,16 @@ static bool check_gradient(
     ggml_graph_cpy(gf, gb);
     ggml_build_backward_expand(ctx0, gf, gb, false);

-    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
+    ggml_graph_prepare(gf, n_threads, nullptr);
+    ggml_graph_work_init(gf, ctx0);
+    ggml_graph_compute(gf);

     ggml_graph_reset (gf);
     ggml_set_f32 (f->grad, 1.0f);

-    ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
+    ggml_graph_prepare(gb, n_threads, nullptr);
+    ggml_graph_work_init(gb, ctx0);
+    ggml_graph_compute(gb);

     // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot");
     // ggml_graph_dump_dot(gb, gf, "test-grad0-backward.dot");
@@ -262,13 +266,17 @@ static bool check_gradient(
             const float xp = x0 + eps;
             ggml_set_f32_1d(x[i], k, xp);

-            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
+            ggml_graph_prepare(gf, n_threads, nullptr);
+            ggml_graph_work_init(gf, ctx0);
+            ggml_graph_compute(gf);

             const double f0 = ggml_get_f32_1d(f, 0);

             ggml_set_f32_1d(x[i], k, xm);

-            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
+            ggml_graph_prepare(gf, n_threads, nullptr);
+            ggml_graph_work_init(gf, ctx0);
+            ggml_graph_compute(gf);

             const double f1 = ggml_get_f32_1d(f, 0);
             const double g0 = (f0 - f1)/(2.0*(double) eps);
@@ -301,7 +309,9 @@ static bool check_gradient(
             ggml_graph_reset (gf);
             ggml_set_f32 (f->grad, 1.0f);

-            ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
+            ggml_graph_prepare(gb, n_threads, nullptr);
+            ggml_graph_work_init(gb, ctx0);
+            ggml_graph_compute(gb);

             const double g1 = ggml_get_f32_1d(x[i]->grad, k);
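Put together, check_gradient now issues the prepare/work_init/compute triple for every forward and backward evaluation, with the work data taken from ctx0 (so it is reclaimed with the context rather than via ggml_graph_work_free). A compact sketch of one central-difference probe under those assumptions; the function and its name are illustrative, not from the diff:

    #include "ggml.h"

    // sketch: estimate df/dx[k] by central differences, recomputing the forward
    // graph gf with the new API each time; mirrors the test-grad0 hunks above
    static double probe_gradient(struct ggml_context * ctx0, struct ggml_cgraph * gf,
                                 struct ggml_tensor * x, struct ggml_tensor * f,
                                 int k, float eps, int n_threads) {
        const float x0 = ggml_get_f32_1d(x, k);

        ggml_set_f32_1d(x, k, x0 + eps);
        ggml_graph_prepare(gf, n_threads, nullptr);
        ggml_graph_work_init(gf, ctx0);   // work data lives in ctx0
        ggml_graph_compute(gf);
        const double f0 = ggml_get_f32_1d(f, 0);

        ggml_set_f32_1d(x, k, x0 - eps);
        ggml_graph_prepare(gf, n_threads, nullptr);
        ggml_graph_work_init(gf, ctx0);
        ggml_graph_compute(gf);
        const double f1 = ggml_get_f32_1d(f, 0);

        ggml_set_f32_1d(x, k, x0);        // restore the parameter
        return (f0 - f1) / (2.0 * (double) eps);
    }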
@@ -113,7 +113,10 @@ int main(void) {
     ggml_build_forward_expand(ge, e);
     ggml_graph_reset(ge);

-    ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1);
+    ggml_graph_prepare(ge, 1, nullptr);
+    ggml_graph_work_init(ge, nullptr);
+    ggml_graph_compute(ge);
+    ggml_graph_work_free(ge);

     const float fe = ggml_get_f32_1d(e, 0);
     printf("%s: e = %.4f\n", __func__, fe);
@@ -124,7 +127,10 @@ int main(void) {

     ggml_graph_reset(ge);

-    ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1);
+    ggml_graph_prepare(ge, 1, nullptr);
+    ggml_graph_work_init(ge, nullptr);
+    ggml_graph_compute(ge);
+    ggml_graph_work_free(ge);

     const float fe_opt = ggml_get_f32_1d(e, 0);
     printf("%s: original e = %.4f\n", __func__, fe);
@@ -112,17 +112,6 @@ static struct ggml_tensor * get_random_tensor_f32(
     return result;
 }

-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
 int main(int /*argc*/, const char ** /*argv*/) {
     struct ggml_init_params params = {
         /* .mem_size = */ 128*1024*1024,
@@ -130,8 +119,6 @@ int main(int /*argc*/, const char ** /*argv*/) {
         /* .no_alloc = */ false,
     };

-    std::vector<uint8_t> work_buffer;
-
     struct ggml_context * ctx0 = ggml_init(params);

     struct ggml_tensor * x;
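With the graph owning its plan and work data, the deleted std::vector<uint8_t> helper has a four-line drop-in replacement; a sketch under the same API assumptions (the wrapper name is illustrative):

    #include "ggml.h"

    // drop-in for the removed ggml_graph_compute_helper(): no external scratch
    // buffer needed, the graph manages its own work data when ctx is nullptr
    static void graph_compute(struct ggml_cgraph * graph, int n_threads) {
        ggml_graph_prepare(graph, n_threads, nullptr);  // plan
        ggml_graph_work_init(graph, nullptr);           // graph-owned work buffer
        ggml_graph_compute(graph);                      // run
        ggml_graph_work_free(graph);                    // release the buffer
    }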
@@ -175,7 +162,10 @@ int main(int /*argc*/, const char ** /*argv*/) {
     ggml_build_forward_expand(gf, r1);
     ggml_build_forward_expand(gf, r2);

-    ggml_graph_compute_helper(work_buffer, gf, 4);
+    ggml_graph_prepare(gf, 4, nullptr);
+    ggml_graph_work_init(gf, nullptr);
+    ggml_graph_compute(gf);
+    ggml_graph_work_free(gf);

     // check that r1 and r2 are the same
     {