ggml : hide ggml_object, ggml_cgraph, ggml_hash_set (#9408)
* ggml : hide ggml_object, ggml_cgraph, ggml_hash_set
  ggml-ci
* ggml : add ggml-impl.h to backends
* ggml : fix compiler warnings
  ggml-ci
* ggml : add assert upon adding nodes
This commit is contained in:
		
							parent
							
								
									c9c8575a1a
								
							
						
					
					
						commit
						d6a04f872d
					
				
					 18 changed files with 170 additions and 129 deletions
				
			
		|  | @ -183,7 +183,7 @@ int main(int argc, char ** argv)  { | ||||||
| 
 | 
 | ||||||
|     ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads); |     ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads); | ||||||
| 
 | 
 | ||||||
|     TENSOR_DUMP(gf->nodes[0]); |     TENSOR_DUMP(ggml_graph_node(gf, 0)); | ||||||
| 
 | 
 | ||||||
|     printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype)); |     printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype)); | ||||||
| 
 | 
 | ||||||
|  | @ -224,7 +224,7 @@ int main(int argc, char ** argv)  { | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|     // Let's use the F32 result from above as a reference for the quantized multiplication
 |     // Let's use the F32 result from above as a reference for the quantized multiplication
 | ||||||
|     float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]); |     float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0)); | ||||||
| 
 | 
 | ||||||
|     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n"); |     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n"); | ||||||
|     printf("=====================================================================================\n"); |     printf("=====================================================================================\n"); | ||||||
|  | @ -252,7 +252,7 @@ int main(int argc, char ** argv)  { | ||||||
| 
 | 
 | ||||||
|         // Check that the matrix multiplication result is in the right ballpark
 |         // Check that the matrix multiplication result is in the right ballpark
 | ||||||
|         // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
 |         // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
 | ||||||
|         float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]); |         float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0)); | ||||||
|         float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference); |         float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference); | ||||||
|         float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; //  Let's accept an epsilon of 10^-6
 |         float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; //  Let's accept an epsilon of 10^-6
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -226,8 +226,8 @@ static ggml_status compute_piter( | ||||||
|         result.eigenvectors.resize(params.n_batch); |         result.eigenvectors.resize(params.n_batch); | ||||||
|         result.distances.resize(params.n_batch); |         result.distances.resize(params.n_batch); | ||||||
|         // get output nodes
 |         // get output nodes
 | ||||||
|         for (int i = 0; i < gf->n_nodes; ++i) { |         for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) { | ||||||
|             auto node = gf->nodes[i]; |             auto node = ggml_graph_node(gf, i); | ||||||
|             int iter = -1; |             int iter = -1; | ||||||
|             // find b_tensor (without copying data from device)
 |             // find b_tensor (without copying data from device)
 | ||||||
|             if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) { |             if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) { | ||||||
|  |  | ||||||
|  | @ -370,7 +370,7 @@ struct lora_merge_ctx { | ||||||
| 
 | 
 | ||||||
|         // write data to output file
 |         // write data to output file
 | ||||||
|         { |         { | ||||||
|             auto result = gf->nodes[gf->n_nodes - 1]; |             auto * result = ggml_graph_node(gf, -1); | ||||||
|             size_t len = ggml_nbytes(result); |             size_t len = ggml_nbytes(result); | ||||||
|             if (read_buf.size() < len) { |             if (read_buf.size() < len) { | ||||||
|                 read_buf.resize(len); |                 read_buf.resize(len); | ||||||
|  |  | ||||||
|  | @ -2449,7 +2449,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima | ||||||
|     ggml_backend_graph_compute(ctx->backend, gf); |     ggml_backend_graph_compute(ctx->backend, gf); | ||||||
| 
 | 
 | ||||||
|     // the last node is the embedding tensor
 |     // the last node is the embedding tensor
 | ||||||
|     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; |     struct ggml_tensor * embeddings = ggml_graph_node(gf, -1); | ||||||
| 
 | 
 | ||||||
|     // copy the embeddings to the location passed by the user
 |     // copy the embeddings to the location passed by the user
 | ||||||
|     ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); |     ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); | ||||||
|  |  | ||||||
|  | @ -184,7 +184,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> | ||||||
|     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
 |     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
 | ||||||
|     ggml_build_forward_expand(gf, flatten); |     ggml_build_forward_expand(gf, flatten); | ||||||
|     ggml_graph_compute_with_ctx(model.ctx, gf, 1); |     ggml_graph_compute_with_ctx(model.ctx, gf, 1); | ||||||
|     struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1]; |     struct ggml_tensor* result = ggml_graph_node(gf, -1); | ||||||
| 
 | 
 | ||||||
|     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
 |     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
 | ||||||
|     // append without newline tokens (default behavior in llava_arch when not using unpad ):
 |     // append without newline tokens (default behavior in llava_arch when not using unpad ):
 | ||||||
|  |  | ||||||
|  | @ -358,6 +358,7 @@ extern "C" { | ||||||
| 
 | 
 | ||||||
|     struct ggml_object; |     struct ggml_object; | ||||||
|     struct ggml_context; |     struct ggml_context; | ||||||
|  |     struct ggml_cgraph; | ||||||
| 
 | 
 | ||||||
|     // NOTE: always add types at the end of the enum to keep backward compatibility
 |     // NOTE: always add types at the end of the enum to keep backward compatibility
 | ||||||
|     enum ggml_type { |     enum ggml_type { | ||||||
|  | @ -575,23 +576,9 @@ extern "C" { | ||||||
|         GGML_TENSOR_FLAG_PARAM  = 4, |         GGML_TENSOR_FLAG_PARAM  = 4, | ||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
|     // ggml object
 |  | ||||||
|     struct ggml_object { |  | ||||||
|         size_t offs; |  | ||||||
|         size_t size; |  | ||||||
| 
 |  | ||||||
|         struct ggml_object * next; |  | ||||||
| 
 |  | ||||||
|         enum ggml_object_type type; |  | ||||||
| 
 |  | ||||||
|         char padding[4]; |  | ||||||
|     }; |  | ||||||
| 
 |  | ||||||
|     static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); |  | ||||||
| 
 |  | ||||||
|     // n-dimensional tensor
 |     // n-dimensional tensor
 | ||||||
|     struct ggml_tensor { |     struct ggml_tensor { | ||||||
|         enum ggml_type         type; |         enum ggml_type type; | ||||||
| 
 | 
 | ||||||
|         GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor"); |         GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor"); | ||||||
| 
 | 
 | ||||||
|  | @ -655,7 +642,7 @@ extern "C" { | ||||||
| 
 | 
 | ||||||
|     struct ggml_threadpool;     // forward declaration, see ggml.c
 |     struct ggml_threadpool;     // forward declaration, see ggml.c
 | ||||||
| 
 | 
 | ||||||
|     typedef struct  ggml_threadpool * ggml_threadpool_t; |     typedef struct ggml_threadpool * ggml_threadpool_t; | ||||||
| 
 | 
 | ||||||
|     // the compute plan that needs to be prepared for ggml_graph_compute()
 |     // the compute plan that needs to be prepared for ggml_graph_compute()
 | ||||||
|     // since https://github.com/ggerganov/ggml/issues/287
 |     // since https://github.com/ggerganov/ggml/issues/287
 | ||||||
|  | @ -671,35 +658,6 @@ extern "C" { | ||||||
|         void *              abort_callback_data; |         void *              abort_callback_data; | ||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
|     enum ggml_cgraph_eval_order { |  | ||||||
|         GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, |  | ||||||
|         GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, |  | ||||||
|         GGML_CGRAPH_EVAL_ORDER_COUNT |  | ||||||
|     }; |  | ||||||
| 
 |  | ||||||
|     typedef uint32_t ggml_bitset_t; |  | ||||||
| 
 |  | ||||||
|     struct ggml_hash_set { |  | ||||||
|         size_t size; |  | ||||||
|         ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
 |  | ||||||
|         struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
 |  | ||||||
|     }; |  | ||||||
| 
 |  | ||||||
|     // computation graph
 |  | ||||||
|     struct ggml_cgraph { |  | ||||||
|         int size; |  | ||||||
|         int n_nodes; |  | ||||||
|         int n_leafs; |  | ||||||
| 
 |  | ||||||
|         struct ggml_tensor ** nodes; |  | ||||||
|         struct ggml_tensor ** grads; |  | ||||||
|         struct ggml_tensor ** leafs; |  | ||||||
| 
 |  | ||||||
|         struct ggml_hash_set visited_hash_set; |  | ||||||
| 
 |  | ||||||
|         enum ggml_cgraph_eval_order order; |  | ||||||
|     }; |  | ||||||
| 
 |  | ||||||
|     // scratch buffer
 |     // scratch buffer
 | ||||||
|     struct ggml_scratch { |     struct ggml_scratch { | ||||||
|         size_t offs; |         size_t offs; | ||||||
|  | @ -2017,8 +1975,6 @@ extern "C" { | ||||||
|     typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); |     typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); | ||||||
|     typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); |     typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); | ||||||
| 
 | 
 | ||||||
|     #define GGML_N_TASKS_MAX -1 |  | ||||||
| 
 |  | ||||||
|     GGML_API struct ggml_tensor * ggml_map_custom1( |     GGML_API struct ggml_tensor * ggml_map_custom1( | ||||||
|             struct ggml_context   * ctx, |             struct ggml_context   * ctx, | ||||||
|             struct ggml_tensor    * a, |             struct ggml_tensor    * a, | ||||||
|  | @ -2088,30 +2044,35 @@ extern "C" { | ||||||
|             struct ggml_context * ctx, |             struct ggml_context * ctx, | ||||||
|             struct ggml_tensor  * tensor); |             struct ggml_tensor  * tensor); | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
|     GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); |     GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); | ||||||
|     GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); |     GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); | ||||||
| 
 | 
 | ||||||
|     // graph allocation in a context
 |     // graph allocation in a context
 | ||||||
|     GGML_API struct ggml_cgraph * ggml_new_graph         (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
 |     GGML_API struct ggml_cgraph * ggml_new_graph       (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
 | ||||||
|     GGML_API struct ggml_cgraph * ggml_new_graph_custom  (struct ggml_context * ctx, size_t size, bool grads); |     GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads); | ||||||
|     GGML_API struct ggml_cgraph * ggml_graph_dup         (struct ggml_context * ctx, struct ggml_cgraph * cgraph); |     GGML_API struct ggml_cgraph * ggml_graph_dup       (struct ggml_context * ctx, struct ggml_cgraph * cgraph); | ||||||
|     GGML_API struct ggml_cgraph   ggml_graph_view        (struct ggml_cgraph * cgraph, int i0, int i1); |     GGML_API void                 ggml_graph_cpy       (struct ggml_cgraph * src, struct ggml_cgraph * dst); | ||||||
|     GGML_API void                 ggml_graph_cpy         (struct ggml_cgraph * src, struct ggml_cgraph * dst); |     GGML_API void                 ggml_graph_reset     (struct ggml_cgraph * cgraph);  // zero grads
 | ||||||
|     GGML_API void                 ggml_graph_reset       (struct ggml_cgraph * cgraph);  // zero grads
 |     GGML_API void                 ggml_graph_clear     (struct ggml_cgraph * cgraph); | ||||||
|     GGML_API void                 ggml_graph_clear       (struct ggml_cgraph * cgraph); | 
 | ||||||
|  |     GGML_API int                   ggml_graph_size   (struct ggml_cgraph * cgraph); | ||||||
|  |     GGML_API struct ggml_tensor *  ggml_graph_node   (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
 | ||||||
|  |     GGML_API struct ggml_tensor ** ggml_graph_nodes  (struct ggml_cgraph * cgraph); | ||||||
|  |     GGML_API int                   ggml_graph_n_nodes(struct ggml_cgraph * cgraph); | ||||||
|  | 
 | ||||||
|  |     GGML_API void   ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); | ||||||
| 
 | 
 | ||||||
|     GGML_API size_t ggml_graph_overhead(void); |     GGML_API size_t ggml_graph_overhead(void); | ||||||
|     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); |     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); | ||||||
| 
 | 
 | ||||||
|     GGML_API struct ggml_threadpool_params   ggml_threadpool_params_default(int n_threads); |     GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); | ||||||
|     GGML_API void                            ggml_threadpool_params_init  (struct ggml_threadpool_params *p, int n_threads); |     GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads); | ||||||
|     GGML_API bool                            ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1); |     GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); | ||||||
|     GGML_API struct ggml_threadpool*         ggml_threadpool_new          (struct ggml_threadpool_params  * params); |     GGML_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params  * params); | ||||||
|     GGML_API void                            ggml_threadpool_free         (struct ggml_threadpool * threadpool); |     GGML_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool); | ||||||
|     GGML_API int                             ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); |     GGML_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); | ||||||
|     GGML_API void                            ggml_threadpool_pause        (struct ggml_threadpool * threadpool); |     GGML_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool); | ||||||
|     GGML_API void                            ggml_threadpool_resume       (struct ggml_threadpool * threadpool); |     GGML_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool); | ||||||
| 
 | 
 | ||||||
|     // ggml_graph_plan() has to be called before ggml_graph_compute()
 |     // ggml_graph_plan() has to be called before ggml_graph_compute()
 | ||||||
|     // when plan.work_size > 0, caller must allocate memory for plan.work_data
 |     // when plan.work_size > 0, caller must allocate memory for plan.work_data
 | ||||||
|  |  | ||||||
|  | @ -1,3 +1,4 @@ | ||||||
|  | #include "ggml-impl.h" | ||||||
| #include "ggml-blas.h" | #include "ggml-blas.h" | ||||||
| #include "ggml-backend-impl.h" | #include "ggml-backend-impl.h" | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -30,6 +30,7 @@ | ||||||
| #include <cstring> | #include <cstring> | ||||||
| #include <mutex> | #include <mutex> | ||||||
| 
 | 
 | ||||||
|  | #include "ggml-impl.h" | ||||||
| #include "ggml-backend-impl.h" | #include "ggml-backend-impl.h" | ||||||
| #include "ggml-cann/aclnn_ops.h" | #include "ggml-cann/aclnn_ops.h" | ||||||
| #include "ggml-cann/common.h" | #include "ggml-cann/common.h" | ||||||
|  |  | ||||||
|  | @ -1,5 +1,5 @@ | ||||||
| #include "ggml-cuda.h" | #include "ggml-cuda.h" | ||||||
| #include "ggml.h" | #include "ggml-impl.h" | ||||||
| #include "ggml-backend-impl.h" | #include "ggml-backend-impl.h" | ||||||
| 
 | 
 | ||||||
| #include "ggml-cuda/common.cuh" | #include "ggml-cuda/common.cuh" | ||||||
|  |  | ||||||
|  | @ -629,8 +629,16 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { | ||||||
| #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) | #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  | enum ggml_cgraph_eval_order { | ||||||
|  |     GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, | ||||||
|  |     GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, | ||||||
|  |     GGML_CGRAPH_EVAL_ORDER_COUNT | ||||||
|  | }; | ||||||
|  | 
 | ||||||
| // bitset
 | // bitset
 | ||||||
| 
 | 
 | ||||||
|  | typedef uint32_t ggml_bitset_t; | ||||||
|  | 
 | ||||||
| static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated"); | static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated"); | ||||||
| #define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
 | #define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
 | ||||||
| #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1) | #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1) | ||||||
|  | @ -656,6 +664,12 @@ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) { | ||||||
| #define GGML_HASHSET_FULL ((size_t)-1) | #define GGML_HASHSET_FULL ((size_t)-1) | ||||||
| #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2) | #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2) | ||||||
| 
 | 
 | ||||||
|  | struct ggml_hash_set { | ||||||
|  |     size_t size; | ||||||
|  |     ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
 | ||||||
|  |     struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
 | ||||||
|  | }; | ||||||
|  | 
 | ||||||
| struct ggml_hash_set ggml_hash_set_new(size_t size); | struct ggml_hash_set ggml_hash_set_new(size_t size); | ||||||
| void                 ggml_hash_set_free(struct ggml_hash_set * hash_set); | void                 ggml_hash_set_free(struct ggml_hash_set * hash_set); | ||||||
| 
 | 
 | ||||||
|  | @ -745,6 +759,24 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g | ||||||
|     GGML_ABORT("fatal error"); |     GGML_ABORT("fatal error"); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | // computation graph
 | ||||||
|  | 
 | ||||||
|  | struct ggml_cgraph { | ||||||
|  |     int size; | ||||||
|  |     int n_nodes; | ||||||
|  |     int n_leafs; | ||||||
|  | 
 | ||||||
|  |     struct ggml_tensor ** nodes; | ||||||
|  |     struct ggml_tensor ** grads; | ||||||
|  |     struct ggml_tensor ** leafs; | ||||||
|  | 
 | ||||||
|  |     struct ggml_hash_set visited_hash_set; | ||||||
|  | 
 | ||||||
|  |     enum ggml_cgraph_eval_order order; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); | ||||||
|  | 
 | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | @ -1,4 +1,4 @@ | ||||||
| #include "ggml.h" | #include "ggml-impl.h" | ||||||
| #include "ggml-backend.h" | #include "ggml-backend.h" | ||||||
| #include "ggml-backend-impl.h" | #include "ggml-backend-impl.h" | ||||||
| #include "ggml-kompute.h" | #include "ggml-kompute.h" | ||||||
|  |  | ||||||
|  | @ -1,7 +1,7 @@ | ||||||
| #import "ggml-metal.h" | #import "ggml-metal.h" | ||||||
| 
 | 
 | ||||||
|  | #import "ggml-impl.h" | ||||||
| #import "ggml-backend-impl.h" | #import "ggml-backend-impl.h" | ||||||
| #import "ggml.h" |  | ||||||
| 
 | 
 | ||||||
| #import <Foundation/Foundation.h> | #import <Foundation/Foundation.h> | ||||||
| 
 | 
 | ||||||
|  | @ -882,7 +882,7 @@ static enum ggml_status ggml_metal_graph_compute( | ||||||
|     // create multiple command buffers and enqueue them |     // create multiple command buffers and enqueue them | ||||||
|     // then, we encode the graph into the command buffers in parallel |     // then, we encode the graph into the command buffers in parallel | ||||||
| 
 | 
 | ||||||
|     const int n_nodes  = gf->n_nodes; |     const int n_nodes = gf->n_nodes; | ||||||
|     const int n_cb = ctx->n_cb; |     const int n_cb = ctx->n_cb; | ||||||
|     const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; |     const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1,5 +1,5 @@ | ||||||
| #include "ggml-rpc.h" | #include "ggml-rpc.h" | ||||||
| #include "ggml.h" | #include "ggml-impl.h" | ||||||
| #include "ggml-backend-impl.h" | #include "ggml-backend-impl.h" | ||||||
| 
 | 
 | ||||||
| #include <cinttypes> | #include <cinttypes> | ||||||
|  |  | ||||||
|  | @ -33,7 +33,7 @@ | ||||||
| #include <sycl/half_type.hpp> | #include <sycl/half_type.hpp> | ||||||
| 
 | 
 | ||||||
| #include "ggml-sycl.h" | #include "ggml-sycl.h" | ||||||
| #include "ggml.h" | #include "ggml-impl.h" | ||||||
| #include "ggml-backend-impl.h" | #include "ggml-backend-impl.h" | ||||||
| 
 | 
 | ||||||
| #include "ggml-sycl/backend.hpp" | #include "ggml-sycl/backend.hpp" | ||||||
|  |  | ||||||
|  | @ -21,7 +21,7 @@ | ||||||
| #include <memory> | #include <memory> | ||||||
| #include <mutex> | #include <mutex> | ||||||
| 
 | 
 | ||||||
| #include "ggml.h" | #include "ggml-impl.h" | ||||||
| #include "ggml-backend-impl.h" | #include "ggml-backend-impl.h" | ||||||
| 
 | 
 | ||||||
| #include "ggml-vulkan-shaders.hpp" | #include "ggml-vulkan-shaders.hpp" | ||||||
|  |  | ||||||
							
								
								
									
										112
									
								
								ggml/src/ggml.c
									
										
									
									
									
								
							
							
						
						
									
										112
									
								
								ggml/src/ggml.c
									
										
									
									
									
								
							|  | @ -287,6 +287,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) { | ||||||
| #define GGML_DEBUG 0 | #define GGML_DEBUG 0 | ||||||
| #define GGML_GELU_FP16 | #define GGML_GELU_FP16 | ||||||
| #define GGML_GELU_QUICK_FP16 | #define GGML_GELU_QUICK_FP16 | ||||||
|  | #define GGML_N_TASKS_MAX (-1) | ||||||
| 
 | 
 | ||||||
| #define GGML_SOFT_MAX_UNROLL 4 | #define GGML_SOFT_MAX_UNROLL 4 | ||||||
| #define GGML_VEC_DOT_UNROLL  2 | #define GGML_VEC_DOT_UNROLL  2 | ||||||
|  | @ -1120,21 +1121,21 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { | ||||||
| #define GGML_F32x4_ADD          vaddq_f32 | #define GGML_F32x4_ADD          vaddq_f32 | ||||||
| #define GGML_F32x4_MUL          vmulq_f32 | #define GGML_F32x4_MUL          vmulq_f32 | ||||||
| #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) | #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) | ||||||
| #define GGML_F32x4_REDUCE(res, x)              \ | #define GGML_F32x4_REDUCE(res, x)                  \ | ||||||
| {                                              \ | {                                                  \ | ||||||
|     int offset = GGML_F32_ARR >> 1;            \ |     int offset = GGML_F32_ARR >> 1;                \ | ||||||
|     for (int i = 0; i < offset; ++i) {         \ |     for (int i = 0; i < offset; ++i) {             \ | ||||||
|         x[i] = vaddq_f32(x[i], x[offset+i]);   \ |         (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ | ||||||
|     }                                          \ |     }                                              \ | ||||||
|     offset >>= 1;                              \ |     offset >>= 1;                                  \ | ||||||
|     for (int i = 0; i < offset; ++i) {         \ |     for (int i = 0; i < offset; ++i) {             \ | ||||||
|         x[i] = vaddq_f32(x[i], x[offset+i]);   \ |         (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ | ||||||
|     }                                          \ |     }                                              \ | ||||||
|     offset >>= 1;                              \ |     offset >>= 1;                                  \ | ||||||
|     for (int i = 0; i < offset; ++i) {         \ |     for (int i = 0; i < offset; ++i) {             \ | ||||||
|         x[i] = vaddq_f32(x[i], x[offset+i]);   \ |         (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ | ||||||
|     }                                          \ |     }                                              \ | ||||||
|     res = GGML_F32x4_REDUCE_ONE(x[0]);         \ |     (res) = GGML_F32x4_REDUCE_ONE((x)[0]);         \ | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #define GGML_F32_VEC        GGML_F32x4 | #define GGML_F32_VEC        GGML_F32x4 | ||||||
|  | @ -1161,30 +1162,30 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { | ||||||
|     #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) |     #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) | ||||||
|     #define GGML_F16x8_ADD          vaddq_f16 |     #define GGML_F16x8_ADD          vaddq_f16 | ||||||
|     #define GGML_F16x8_MUL          vmulq_f16 |     #define GGML_F16x8_MUL          vmulq_f16 | ||||||
|     #define GGML_F16x8_REDUCE(res, x)                             \ |     #define GGML_F16x8_REDUCE(res, x)                               \ | ||||||
|     do {                                                          \ |     do {                                                            \ | ||||||
|         int offset = GGML_F16_ARR >> 1;                           \ |         int offset = GGML_F16_ARR >> 1;                             \ | ||||||
|         for (int i = 0; i < offset; ++i) {                        \ |         for (int i = 0; i < offset; ++i) {                          \ | ||||||
|             x[i] = vaddq_f16(x[i], x[offset+i]);                  \ |             (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \ | ||||||
|         }                                                         \ |         }                                                           \ | ||||||
|         offset >>= 1;                                             \ |         offset >>= 1;                                               \ | ||||||
|         for (int i = 0; i < offset; ++i) {                        \ |         for (int i = 0; i < offset; ++i) {                          \ | ||||||
|             x[i] = vaddq_f16(x[i], x[offset+i]);                  \ |             (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \ | ||||||
|         }                                                         \ |         }                                                           \ | ||||||
|         offset >>= 1;                                             \ |         offset >>= 1;                                               \ | ||||||
|         for (int i = 0; i < offset; ++i) {                        \ |         for (int i = 0; i < offset; ++i) {                          \ | ||||||
|             x[i] = vaddq_f16(x[i], x[offset+i]);                  \ |             (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \ | ||||||
|         }                                                         \ |         }                                                           \ | ||||||
|         const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ |         const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \ | ||||||
|         const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ |         const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \ | ||||||
|         res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \ |         (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \ | ||||||
|     } while (0) |     } while (0) | ||||||
| 
 | 
 | ||||||
|     #define GGML_F16_VEC                GGML_F16x8 |     #define GGML_F16_VEC                GGML_F16x8 | ||||||
|     #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO |     #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO | ||||||
|     #define GGML_F16_VEC_SET1           GGML_F16x8_SET1 |     #define GGML_F16_VEC_SET1           GGML_F16x8_SET1 | ||||||
|     #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p) |     #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p) | ||||||
|     #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i]) |     #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i]) | ||||||
|     #define GGML_F16_VEC_FMA            GGML_F16x8_FMA |     #define GGML_F16_VEC_FMA            GGML_F16x8_FMA | ||||||
|     #define GGML_F16_VEC_ADD            GGML_F16x8_ADD |     #define GGML_F16_VEC_ADD            GGML_F16x8_ADD | ||||||
|     #define GGML_F16_VEC_MUL            GGML_F16x8_MUL |     #define GGML_F16_VEC_MUL            GGML_F16x8_MUL | ||||||
|  | @ -1893,6 +1894,23 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { | ||||||
| #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) | #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  | //
 | ||||||
|  | // ggml object
 | ||||||
|  | //
 | ||||||
|  | 
 | ||||||
|  | struct ggml_object { | ||||||
|  |     size_t offs; | ||||||
|  |     size_t size; | ||||||
|  | 
 | ||||||
|  |     struct ggml_object * next; | ||||||
|  | 
 | ||||||
|  |     enum ggml_object_type type; | ||||||
|  | 
 | ||||||
|  |     char padding[4]; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); | ||||||
|  | 
 | ||||||
| //
 | //
 | ||||||
| // ggml context
 | // ggml context
 | ||||||
| //
 | //
 | ||||||
|  | @ -19161,6 +19179,34 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) { | ||||||
|     ggml_hash_set_reset(&cgraph->visited_hash_set); |     ggml_hash_set_reset(&cgraph->visited_hash_set); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | int ggml_graph_size(struct ggml_cgraph * cgraph) { | ||||||
|  |     return cgraph->size; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) { | ||||||
|  |     if (i < 0) { | ||||||
|  |         GGML_ASSERT(cgraph->n_nodes + i >= 0); | ||||||
|  |         return cgraph->nodes[cgraph->n_nodes + i]; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     GGML_ASSERT(i < cgraph->n_nodes); | ||||||
|  |     return cgraph->nodes[i]; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) { | ||||||
|  |     return cgraph->nodes; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) { | ||||||
|  |     return cgraph->n_nodes; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) { | ||||||
|  |     GGML_ASSERT(cgraph->size > cgraph->n_nodes); | ||||||
|  |     cgraph->nodes[cgraph->n_nodes] = tensor; | ||||||
|  |     cgraph->n_nodes++; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| // Android's libc implementation "bionic" does not support setting affinity
 | // Android's libc implementation "bionic" does not support setting affinity
 | ||||||
| #if defined(__gnu_linux__) | #if defined(__gnu_linux__) | ||||||
| static void set_numa_thread_affinity(int thread_n) { | static void set_numa_thread_affinity(int thread_n) { | ||||||
|  |  | ||||||
|  | @ -9877,8 +9877,8 @@ struct llm_build_context { | ||||||
|     struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { |     struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { | ||||||
|         // find result_norm tensor for input
 |         // find result_norm tensor for input
 | ||||||
|         struct ggml_tensor * inp = nullptr; |         struct ggml_tensor * inp = nullptr; | ||||||
|         for (int i = gf->n_nodes - 1; i >= 0; --i) { |         for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { | ||||||
|             inp = gf->nodes[i]; |             inp = ggml_graph_node(gf, i); | ||||||
|             if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { |             if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { | ||||||
|                 break; |                 break; | ||||||
|             } else { |             } else { | ||||||
|  | @ -16207,8 +16207,8 @@ static int llama_decode_internal( | ||||||
|         ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); |         ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); | ||||||
| 
 | 
 | ||||||
|         // the output is always the last tensor in the graph
 |         // the output is always the last tensor in the graph
 | ||||||
|         struct ggml_tensor * res  = gf->nodes[gf->n_nodes - 1]; |         struct ggml_tensor * res  = ggml_graph_node(gf, -1); | ||||||
|         struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2]; |         struct ggml_tensor * embd = ggml_graph_node(gf, -2); | ||||||
| 
 | 
 | ||||||
|         if (lctx.n_outputs == 0) { |         if (lctx.n_outputs == 0) { | ||||||
|             // no output
 |             // no output
 | ||||||
|  | @ -16217,9 +16217,9 @@ static int llama_decode_internal( | ||||||
|         } else if (cparams.embeddings) { |         } else if (cparams.embeddings) { | ||||||
|             res  = nullptr; // do not extract logits for embedding case
 |             res  = nullptr; // do not extract logits for embedding case
 | ||||||
|             embd = nullptr; |             embd = nullptr; | ||||||
|             for (int i = gf->n_nodes - 1; i >= 0; --i) { |             for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { | ||||||
|                 if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) { |                 if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { | ||||||
|                     embd = gf->nodes[i]; |                     embd = ggml_graph_node(gf, i); | ||||||
|                     break; |                     break; | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|  | @ -16436,15 +16436,15 @@ static int llama_encode_internal( | ||||||
|     // there are two cases here
 |     // there are two cases here
 | ||||||
|     if (llama_model_has_decoder(&lctx.model)) { |     if (llama_model_has_decoder(&lctx.model)) { | ||||||
|         // first case is an encoder-decoder T5 model where embeddings are passed to decoder
 |         // first case is an encoder-decoder T5 model where embeddings are passed to decoder
 | ||||||
|         embd = gf->nodes[gf->n_nodes - 1]; |         embd = ggml_graph_node(gf, -1); | ||||||
|         GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor"); |         GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor"); | ||||||
|     } else { |     } else { | ||||||
|         // second case is an encoder-only T5 model
 |         // second case is an encoder-only T5 model
 | ||||||
|         if (cparams.embeddings) { |         if (cparams.embeddings) { | ||||||
|             // only output embeddings if required
 |             // only output embeddings if required
 | ||||||
|             embd = gf->nodes[gf->n_nodes - 1]; |             embd = ggml_graph_node(gf, -1); | ||||||
|             if (strcmp(embd->name, "result_embd_pooled") != 0) { |             if (strcmp(embd->name, "result_embd_pooled") != 0) { | ||||||
|                 embd = gf->nodes[gf->n_nodes - 2]; |                 embd = ggml_graph_node(gf, -2); | ||||||
|             } |             } | ||||||
|             GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); |             GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); | ||||||
|         } |         } | ||||||
|  | @ -18492,7 +18492,7 @@ struct llama_context * llama_new_context_with_model( | ||||||
| 
 | 
 | ||||||
|             // note: the number of splits during measure is higher than during inference due to the kv shift
 |             // note: the number of splits during measure is higher than during inference due to the kv shift
 | ||||||
|             int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); |             int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); | ||||||
|             LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, gf->n_nodes); |             LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, ggml_graph_n_nodes(gf)); | ||||||
|             LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits); |             LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  | @ -519,7 +519,7 @@ struct test_case { | ||||||
| 
 | 
 | ||||||
|         // add sentinels as graph nodes so that they are checked in the callback
 |         // add sentinels as graph nodes so that they are checked in the callback
 | ||||||
|         for (ggml_tensor * sentinel : sentinels) { |         for (ggml_tensor * sentinel : sentinels) { | ||||||
|             gf->nodes[gf->n_nodes++] = sentinel; |             ggml_graph_add_node(gf, sentinel); | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         // randomize tensors
 |         // randomize tensors
 | ||||||
|  | @ -679,9 +679,9 @@ struct test_case { | ||||||
| 
 | 
 | ||||||
|         // duplicate the op
 |         // duplicate the op
 | ||||||
|         size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
 |         size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
 | ||||||
|         int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1; |         int n_runs = std::min((size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1; | ||||||
|         for (int i = 1; i < n_runs; i++) { |         for (int i = 1; i < n_runs; i++) { | ||||||
|             gf->nodes[gf->n_nodes++] = out; |             ggml_graph_add_node(gf, out); | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         // calculate memory
 |         // calculate memory
 | ||||||
|  | @ -696,11 +696,11 @@ struct test_case { | ||||||
|             } |             } | ||||||
|             return size; |             return size; | ||||||
|         }; |         }; | ||||||
|         for (int i = 0; i < gf->n_nodes; i++) { |         for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) { | ||||||
|             if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) { |             if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) { | ||||||
|                 continue; |                 continue; | ||||||
|             } |             } | ||||||
|             mem += tensor_op_size(gf->nodes[i]); |             mem += tensor_op_size(ggml_graph_node(gf, i)); | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         // run
 |         // run
 | ||||||
|  | @ -804,7 +804,7 @@ struct test_case { | ||||||
|         ggml_graph_cpy(gf, gb); |         ggml_graph_cpy(gf, gb); | ||||||
|         ggml_build_backward_expand(ctx, gf, gb, false); |         ggml_build_backward_expand(ctx, gf, gb, false); | ||||||
|         if (expect.size() != 1 || expect[0] != 0.0f) { |         if (expect.size() != 1 || expect[0] != 0.0f) { | ||||||
|             GGML_ASSERT(gb->n_nodes > gf->n_nodes); |             GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf)); | ||||||
|             for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { |             for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { | ||||||
|                 GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE); |                 GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE); | ||||||
|             } |             } | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue