Compare commits

15 commits

Author | SHA1 | Message | Date
Georgi Gerganov | 15267192c0 | llama : refactor tensor offloading as callback | 2023-10-29 13:04:36 +02:00
Georgi Gerganov | da936188d8 | llama : move refact in correct place + optimize graph input | 2023-10-29 11:48:58 +02:00
Georgi Gerganov | 739b85c985 | llama : try to fix build | 2023-10-29 11:25:32 +02:00
Georgi Gerganov | 25cfbf6776 | llama : fix non-CUDA build | 2023-10-29 11:12:03 +02:00
Georgi Gerganov | b4ad03b3a7 | llama : try to optimize offloading code | 2023-10-29 10:33:11 +02:00
Georgi Gerganov | 79617902ea | llama : fix res_norm offloading | 2023-10-29 09:20:35 +02:00
Georgi Gerganov | e14aa46151 | llama : do tensor offload only with CUDA | 2023-10-29 08:03:46 +02:00
Georgi Gerganov | 0dc05b8433 | llama : factor graph input into a function | 2023-10-29 07:52:43 +02:00
Georgi Gerganov | 4e98897ede | llama : support offloading result_norm + comments | 2023-10-29 07:36:07 +02:00
Georgi Gerganov | 51c4f9ee9f | llama : comments | 2023-10-28 22:50:08 +03:00
Georgi Gerganov | 3af8771389 | llama : update offload log messages to print node index | 2023-10-28 22:36:44 +03:00
Georgi Gerganov | 83d2c43791 | llama : offload rest of the models (ggml-ci) | 2023-10-28 22:30:54 +03:00
Georgi Gerganov | 38aca9e1ab | llama : factor out tensor offloading outside the build call (wip) (ggml-ci) | 2023-10-28 21:22:31 +03:00
Georgi Gerganov | 5946d98fc8 | metal : disable kernel load log | 2023-10-28 21:22:01 +03:00
Georgi Gerganov | 8b2420d249 | llama : factor out ggml-alloc from graph build functions (ggml-ci) | 2023-10-28 19:54:28 +03:00
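
Taken together, these commits move llama.cpp's tensor offloading, previously CUDA-only calls scattered through the graph-build code, behind a single hook that is applied to each node after the graph is constructed. A minimal sketch of that shape follows; llm_offload_fn, llm_offload_graph, and offload_nop are hypothetical stand-ins, not the actual llama.cpp internals, and only GGML_USE_CUBLAS is a real build flag of that era:

#include <stdio.h>

struct ggml_tensor;  // opaque here; declared in ggml.h

// hypothetical callback type: decide backend placement for one graph node
typedef void (*llm_offload_fn)(struct ggml_tensor * node, int node_idx, void * ud);

// hypothetical example policy: offload nothing unless built with CUDA
static void offload_nop(struct ggml_tensor * node, int node_idx, void * ud) {
    (void) node; (void) ud;
#ifdef GGML_USE_CUBLAS
    printf("node %4d: offload decision would go here\n", node_idx);
#else
    (void) node_idx;
#endif
}

// hypothetical driver: the graph builder stays offload-agnostic, and the
// placement policy runs in one place over the finished node list
static void llm_offload_graph(struct ggml_tensor ** nodes, int n_nodes,
                              llm_offload_fn fn, void * ud) {
    for (int i = 0; i < n_nodes; ++i) {
        fn(nodes[i], i, ud);
    }
}

int main(void) {
    llm_offload_graph(NULL, 0, offload_nop, NULL);  // no nodes: placeholder run
    return 0;
}

Funneling every node through one hook is also what makes it cheap to print the node index in the offload log (commit 3af8771389) and to gate offloading on CUDA availability (commit e14aa46151) in a single place.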
3 changed files with 1187 additions and 1346 deletions

ggml-metal.m

@@ -238,12 +238,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
     // load kernels
     {
         NSError * error = nil;
-#define GGML_METAL_ADD_KERNEL(name) \
-        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
-        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
+
+        /*
         GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
                 (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
                 (int) ctx->pipeline_##name.threadExecutionWidth); \
+        */
+#define GGML_METAL_ADD_KERNEL(name) \
+        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
+        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
         if (error) { \
             GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
             return NULL; \
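
This hunk (commit 5946d98fc8, "metal : disable kernel load log") moves the per-kernel GGML_METAL_LOG_INFO call out of the GGML_METAL_ADD_KERNEL macro into a /* */ block, so loading the pipelines no longer prints one line per kernel. The macro itself relies on standard preprocessor tricks: ##name pastes the kernel name into the ctx field names, and "kernel_"#name stringizes it into the Metal function name. Below is a compilable plain-C analogue of that pattern with the Metal calls stubbed out; ctx_t, ADD_KERNEL, and main are hypothetical, for illustration only:

#include <stdio.h>

// stand-in for the Metal context: one function/pipeline slot per kernel
struct ctx_t {
    const char * function_add;
    const char * pipeline_add;
};

// analogue of GGML_METAL_ADD_KERNEL: ##name pastes the field suffix,
// "kernel_" #name stringizes the argument into the lookup string
#define ADD_KERNEL(name) \
    ctx.function_##name = "kernel_" #name; \
    ctx.pipeline_##name = ctx.function_##name; \
    if (!ctx.pipeline_##name) { \
        fprintf(stderr, "load pipeline error: %s\n", "kernel_" #name); \
        return 1; \
    }

int main(void) {
    struct ctx_t ctx;
    ADD_KERNEL(add); // expands to assignments on ctx.function_add / ctx.pipeline_add
    printf("loaded %s\n", ctx.pipeline_add);
    return 0;
}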

ggml.h (2 changed lines)

@@ -709,7 +709,7 @@ extern "C" {
     // Context tensor enumeration and lookup
     GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
     GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+    GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name);

     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
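
The only change in this hunk is whitespace: ggml_get_tensor gains a space so its declaration column-aligns with ggml_get_next_tensor above it. For context, these three declarations are the public enumeration/lookup API for tensors in a ggml_context; a short usage sketch follows, where dump_tensors and the "result_norm" name are illustrative, while the ggml calls and the tensor's name field are the real ggml.h API:

#include "ggml.h"
#include <stdio.h>

// walk every tensor in a context, then look one up by name
static void dump_tensors(struct ggml_context * ctx) {
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx);
         t != NULL;
         t = ggml_get_next_tensor(ctx, t)) {
        printf("tensor: %s\n", t->name);
    }

    struct ggml_tensor * norm = ggml_get_tensor(ctx, "result_norm");
    if (norm == NULL) {
        fprintf(stderr, "no tensor named result_norm in this context\n");
    }
}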

llama.cpp (2510 changed lines)

File diff suppressed because it is too large.