backend : add eval callback (#4935)
* backend : add eval callback ggml-ci
* backend : group nodes in a single compute when the user doesn't need them
* backend : clean-up the implementation ggml-ci
* simple : do not perform tensor data copy if not needed
* simple : fix
* simple : no need for ggml_is_contiguous + fix bool parse
* llama : fix callback placement in llama_context_params
* backend : avoid double-ask callback calls
* simple : restore examples, imatrix will serve as a demo
This commit is contained in:
		
							parent
							
								
									c918fe8dca
								
							
						
					
					
						commit
						44a1a4a41a
					
				
					 4 changed files with 64 additions and 2 deletions
				
			
		
							
								
								
									
										4
									
								
								llama.h
									
										
									
									
									
								
							
							
						
						
									
										4
									
								
								llama.h
									
										
									
									
									
								
							|  | @ -2,6 +2,7 @@ | |||
| #define LLAMA_H | ||||
| 
 | ||||
| #include "ggml.h" | ||||
| #include "ggml-backend.h" | ||||
| #ifdef GGML_USE_CUBLAS | ||||
| #include "ggml-cuda.h" | ||||
| #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES | ||||
|  | @ -231,6 +232,9 @@ extern "C" { | |||
|         float    yarn_beta_slow;   // YaRN high correction dim
 | ||||
|         uint32_t yarn_orig_ctx;    // YaRN original context size
 | ||||
| 
 | ||||
|         ggml_backend_sched_eval_callback cb_eval; | ||||
|         void * cb_eval_user_data; | ||||
| 
 | ||||
|         enum ggml_type type_k; // data type for K cache
 | ||||
|         enum ggml_type type_v; // data type for V cache
 | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue