mtl : preparing for merge

Georgi Gerganov 2023-06-04 09:27:27 +03:00
parent 4df2ef3161
commit 18e482a89c
13 changed files with 94 additions and 144 deletions

.gitignore

@@ -17,6 +17,7 @@ build-release/
 build-static/
 build-cublas/
 build-opencl/
+build-mtl/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/

CMakeLists.txt

@@ -71,6 +71,7 @@ option(LLAMA_CUBLAS "llama: use cuBLAS"
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
+option(LLAMA_METAL "llama: use Metal" OFF)
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -183,7 +184,7 @@ if (LLAMA_CUBLAS)
     enable_language(CUDA)
-    set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
+    set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
     add_compile_definitions(GGML_USE_CUBLAS)
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
@@ -200,12 +201,37 @@ if (LLAMA_CUBLAS)
     endif()
 endif()

+if (LLAMA_METAL)
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+    find_library(METAL_FRAMEWORK Metal REQUIRED)
+    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
+
+    set(GGML_SOURCES_METAL ggml-mtl.m ggml-mtl.h)
+
+    add_compile_definitions(GGML_USE_METAL)
+    add_compile_definitions(GGML_METAL_NDEBUG)
+
+    # get full path to the file
+    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
+
+    # copy ggml-mtl.metal to bin directory
+    configure_file(ggml-mtl.metal bin/ggml-mtl.metal COPYONLY)
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
+        ${FOUNDATION_LIBRARY}
+        ${METAL_FRAMEWORK}
+        ${METALKIT_FRAMEWORK}
+        ${METALPERFORMANCE_FRAMEWORK}
+        )
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
         message(STATUS "CLBlast found")
-        set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
+        set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)
         add_compile_definitions(GGML_USE_CLBLAST)
@@ -370,8 +396,10 @@ endif()
 add_library(ggml OBJECT
             ggml.c
             ggml.h
-            ${GGML_CUDA_SOURCES}
-            ${GGML_OPENCL_SOURCES})
+            ${GGML_SOURCES_CUDA}
+            ${GGML_SOURCES_OPENCL}
+            ${GGML_SOURCES_METAL}
+            )

 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
@@ -385,8 +413,6 @@ add_library(llama
             llama.cpp
             llama.h
             llama-util.h
-            examples/mtl/mtl.h # TODO: METAL TMP
-            examples/mtl/mtl.m # TODO: METAL TMP
             )

 target_include_directories(llama PUBLIC .)
@@ -394,19 +420,14 @@ target_compile_features(llama PUBLIC cxx_std_11) # don't bump
 target_link_libraries(llama PRIVATE
     ggml
     ${LLAMA_EXTRA_LIBS}
-    ${FOUNDATION_LIBRARY} # TODO: METAL TMP
-    ${METAL_FRAMEWORK} # TODO: METAL TMP
-    ${METALKIT_FRAMEWORK} # TODO: METAL TMP
-    ${METALPERFORMANCE_FRAMEWORK} # TODO: METAL TMP
     )

-target_compile_definitions(llama PRIVATE LLAMA_MTL_NDEBUG) # TODO: METAL TMP
-
 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
 endif()

-if (GGML_CUDA_SOURCES)
+if (GGML_SOURCES_CUDA)
     message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
     set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
     set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
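A side note on the two compile definitions introduced above: GGML_USE_METAL is what the llama.cpp changes further down key their #ifdef blocks on, while GGML_METAL_NDEBUG silences the mtl_printf logging macro defined in ggml-mtl.m. A minimal sketch of that pattern (the main() here is illustrative only, not part of the commit):

    // Mirrors the mtl_printf convention from ggml-mtl.m: with GGML_METAL_NDEBUG
    // defined, the logging calls compile to nothing.
    #include <cstdio>

    #ifdef GGML_METAL_NDEBUG
    #define mtl_printf(...)
    #else
    #define mtl_printf(...) fprintf(stderr, __VA_ARGS__)
    #endif

    int main() {
        mtl_printf("%s: Metal debug logging is enabled\n", __func__);
        return 0;
    }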

examples/CMakeLists.txt

@@ -37,8 +37,10 @@ else()
     add_subdirectory(save-load-state)
     add_subdirectory(benchmark)
     add_subdirectory(baby-llama)
-    add_subdirectory(mtl)
-    if(LLAMA_BUILD_SERVER)
+    if (LLAMA_METAL)
+        add_subdirectory(mtl)
+    endif()
+    if (LLAMA_BUILD_SERVER)
         add_subdirectory(server)
     endif()
 endif()

examples/common.cpp

@@ -301,8 +301,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.mem_test = true;
         } else if (arg == "--export") {
             params.export_cgraph = true;
-        } else if (arg == "--import") {
-            params.import_cgraph = true;
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -443,7 +441,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
-    fprintf(stderr, " --import import a computation graph from 'llama.ggml'\n");
     fprintf(stderr, " --verbose-prompt print prompt before generation\n");
     fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
     fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
@@ -493,7 +490,6 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     lparams.use_mlock = params.use_mlock;
     lparams.logits_all = params.perplexity;
     lparams.embedding = params.embedding;
-    lparams.cgraph = params.import_cgraph;

     llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);

examples/common.h

@@ -72,7 +72,6 @@ struct gpt_params {
     bool use_mlock = false; // use mlock to keep model in memory
     bool mem_test = false; // compute maximum memory usage
     bool export_cgraph = false; // export the computation graph
-    bool import_cgraph = false; // import a computation graph
     bool verbose_prompt = false; // print prompt tokens before generation
 };

examples/mtl/CMakeLists.txt

@@ -1,33 +1,6 @@
 if (APPLE)
-    #
-    # mtl
-
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK Metal REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
-
     set(TEST_TARGET mtl)
-    add_executable(${TEST_TARGET} mtl.cpp mtl.h mtl.m)
-    target_link_libraries(${TEST_TARGET} PRIVATE
-        ggml
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-        ${METALPERFORMANCE_FRAMEWORK}
-        )
-
-    # TODO: temporary until the kernels are ready
-    # custom command to build mtl.metal into a library
-    # depends on the mtl.metal file
-    add_custom_target(mtl.metallib-tmp ALL DEPENDS ${CMAKE_BINARY_DIR}/mtl.metallib)
-    add_custom_command(
-        OUTPUT ${CMAKE_BINARY_DIR}/mtl.metallib
-        COMMAND xcrun -sdk macosx metal -c ${CMAKE_CURRENT_SOURCE_DIR}/mtl.metal -o ${CMAKE_BINARY_DIR}/mtl.air
-        COMMAND xcrun -sdk macosx metallib ${CMAKE_BINARY_DIR}/mtl.air -o ${CMAKE_BINARY_DIR}/mtl.metallib
-        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/mtl.metal
-        COMMENT "Building mtl.metallib"
-        )
+    add_executable(${TEST_TARGET} mtl.cpp)
+    target_link_libraries(${TEST_TARGET} PRIVATE ggml)
 endif()

examples/mtl/mtl.cpp

@@ -1,5 +1,5 @@
 #include "ggml.h"
-#include "mtl.h"
+#include "ggml-mtl.h"

 #include <cstdio>
 #include <cstring>
@@ -51,7 +51,7 @@ int main(int argc, char ** argv) {
     }

     // this allocates all Metal resources and memory buffers
-    auto * ctx_mtl = llama_mtl_init(
+    auto * ctx_mtl = ggml_mtl_init(
             ggml_get_mem_buffer(ctx_data),
             ggml_get_mem_size (ctx_data),
             ggml_get_mem_buffer(ctx_eval),
@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
     const std::vector<int> tmp(n_batch, 1); // BOS

     // warmup
-    llama_mtl_eval(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
+    ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);

     const int n_iter = 16;
@@ -75,7 +75,7 @@ int main(int argc, char ** argv) {
     // the actual inference happens here
     for (int i = 0; i < n_iter; ++i) {
-        llama_mtl_eval(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
+        ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
     }

     const int64_t t1 = ggml_time_us();
@@ -83,7 +83,7 @@ int main(int argc, char ** argv) {
         printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter);
     }

-    llama_mtl_free(ctx_mtl);
+    ggml_mtl_free(ctx_mtl);

     ggml_free(ctx_data);
     ggml_free(ctx_eval);

ggml-mtl.h (renamed from examples/mtl/mtl.h)

@@ -11,7 +11,7 @@ extern "C" {
 struct ggml_mtl_context;

-struct ggml_mtl_context * llama_mtl_init(
+struct ggml_mtl_context * ggml_mtl_init(
         void * data_buf,
         size_t data_size,
         void * eval_buf,
@@ -20,18 +20,16 @@ struct ggml_mtl_context * llama_mtl_init(
         size_t cach_size,
         size_t outp_size);

-void llama_mtl_free(struct ggml_mtl_context * ctx);
+void ggml_mtl_free(struct ggml_mtl_context * ctx);

 // return 0 on success
-int llama_mtl_eval(
+int ggml_mtl_graph_compute(
         struct ggml_mtl_context * ctx,
         struct ggml_cgraph * gf,
         const int * tokens,
         int n_tokens,
         int n_past);

-float * llama_mtl_get_logits(struct ggml_mtl_context * ctx);
-
 #ifdef __cplusplus
 }
 #endif
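For orientation, the renamed interface above is exactly what examples/mtl/mtl.cpp exercises. A minimal sketch of a caller, assuming a ggml_mtl_context created earlier via ggml_mtl_init and an exported graph gf; the helper name mtl_warmup is hypothetical, not part of the commit:

    #include "ggml.h"
    #include "ggml-mtl.h"

    #include <cstdio>
    #include <vector>

    // Run the exported graph once on the Metal context, mirroring the warmup
    // call in examples/mtl/mtl.cpp; returns 0 on success like the C API.
    static int mtl_warmup(struct ggml_mtl_context * ctx_mtl, struct ggml_cgraph * gf,
                          int n_batch, int n_past) {
        const std::vector<int> tmp(n_batch, 1); // BOS tokens, as in the example

        const int ret = ggml_mtl_graph_compute(ctx_mtl, gf, tmp.data(), (int) tmp.size(), n_past);
        if (ret != 0) {
            fprintf(stderr, "%s: ggml_mtl_graph_compute failed\n", __func__);
        }

        return ret;
    }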

ggml-mtl.m (renamed from examples/mtl/mtl.m)

@@ -1,4 +1,4 @@
-#import "mtl.h"
+#import "ggml-mtl.h"

 #import "ggml.h"
@@ -6,7 +6,7 @@
 #import <Metal/Metal.h>
 #import <MetalPerformanceShaders/MetalPerformanceShaders.h>

-#ifdef LLAMA_MTL_NDEBUG
+#ifdef GGML_METAL_NDEBUG
 #define mtl_printf(...)
 #else
 #define mtl_printf(...) fprintf(stderr, __VA_ARGS__)
@@ -85,9 +85,9 @@ struct ggml_mtl_context {
 // MSL code
 // TODO: move the contents here when ready
 //       for now it is easier to work in a separate file
-NSString * const msl_library_llama = @"see mtl.metal";
+NSString * const msl_library_source = @"see mtl.metal";

-struct ggml_mtl_context * llama_mtl_init(
+struct ggml_mtl_context * ggml_mtl_init(
         void * data_buf,
         size_t data_size,
         void * eval_buf,
@@ -122,7 +122,7 @@ struct ggml_mtl_context * llama_mtl_init(
     {
         NSError * error = nil;

-        ctx->library = [ctx->device newLibraryWithSource:msl_library_llama options:nil error:&error];
+        ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
         if (error) {
             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
             exit(1);
@@ -133,7 +133,10 @@ struct ggml_mtl_context * llama_mtl_init(
     {
         NSError * error = nil;

-        NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/mtl/mtl" ofType:@"metal"];
+        //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/mtl/mtl" ofType:@"metal"];
+        NSString * path = [[NSBundle mainBundle] pathForResource:@"ggml-mtl" ofType:@"metal"];
+        fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]);
+
         NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
         if (error) {
             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
@@ -220,7 +223,7 @@ struct ggml_mtl_context * llama_mtl_init(
     // TODO: how to use MTLStorageModeManaged?
     // TODO: see if we can avoid this copy somehow
     {
-        void * mem_buffer = data_buf;
+        const void * mem_buffer = data_buf;
         const size_t mem_size = data_size;

         //ctx->buffer_data = [ctx->device newBufferWithBytesNoCopy:mem_buffer length:mem_size options:MTLResourceStorageModeShared deallocator:nil];
@@ -261,18 +264,20 @@ struct ggml_mtl_context * llama_mtl_init(
     return ctx;
 }

-void llama_mtl_free(struct ggml_mtl_context * ctx) {
+void ggml_mtl_free(struct ggml_mtl_context * ctx) {
     fprintf(stderr, "%s: deallocating\n", __func__);

     free(ctx);
 }

 // get data / eval buffer + offset
-id<MTLBuffer> llama_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
+id<MTLBuffer> ggml_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
     const int64_t offs_data = (int64_t) t->data - (int64_t) ctx->data_buf;
     const int64_t offs_eval = (int64_t) t->data - (int64_t) ctx->eval_buf;
     const int64_t offs_cach = (int64_t) t->data - (int64_t) ctx->cach_buf;

+    //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
+
     //const size_t t_size = ggml_nbytes(t);

     id<MTLBuffer> result;
@@ -317,7 +322,7 @@ id<MTLBuffer> llama_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_te
     return result;
 }

-int llama_mtl_eval(
+int ggml_mtl_graph_compute(
         struct ggml_mtl_context * ctx,
         struct ggml_cgraph * gf,
         const int * tokens,
@@ -336,7 +341,7 @@ int llama_mtl_eval(
     {
         struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");

-        id<MTLBuffer> id_dst = llama_mtl_get_buffer(ctx, embd, &offs_src0);
+        id<MTLBuffer> id_dst = ggml_mtl_get_buffer(ctx, embd, &offs_src0);

         memcpy((char *) id_dst.contents + offs_src0, embd->data, ggml_nbytes(embd));
     }
@@ -385,9 +390,9 @@ int llama_mtl_eval(
         const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
         const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;

-        id<MTLBuffer> id_src0 = src0 ? llama_mtl_get_buffer(ctx, src0, &offs_src0) : nil;
-        id<MTLBuffer> id_src1 = src1 ? llama_mtl_get_buffer(ctx, src1, &offs_src1) : nil;
-        id<MTLBuffer> id_dst = dst ? llama_mtl_get_buffer(ctx, dst, &offs_dst) : nil;
+        id<MTLBuffer> id_src0 = src0 ? ggml_mtl_get_buffer(ctx, src0, &offs_src0) : nil;
+        id<MTLBuffer> id_src1 = src1 ? ggml_mtl_get_buffer(ctx, src1, &offs_src1) : nil;
+        id<MTLBuffer> id_dst = dst ? ggml_mtl_get_buffer(ctx, dst, &offs_dst) : nil;

         //mtl_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
         //if (src0) {
@@ -775,7 +780,7 @@ int llama_mtl_eval(
         struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1];

-        id<MTLBuffer> id_src = llama_mtl_get_buffer(ctx, out, &offs_src0);
+        id<MTLBuffer> id_src = ggml_mtl_get_buffer(ctx, out, &offs_src0);
         id<MTLBuffer> id_dst = ctx->out;

         id<MTLBlitCommandEncoder> encoder_blit = [command_buffer blitCommandEncoder];
@@ -817,53 +822,5 @@ int llama_mtl_eval(
     mtl_printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
 #endif

-    //{
-    //    struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check");
-    //    if (t->type == GGML_TYPE_F32) {
-    //        const const float * data = (float *) ctx->out.contents;
-    //        printf("data: ");
-    //        for (int i = 0; i < (int) t->ne[0]; i++) {
-    //            printf("%f ", data[i]);
-    //        }
-    //        printf("\n");
-    //        double sum = 0.0;
-    //        for (int i = 0; i < ggml_nelements(t); i++) {
-    //            double cur = data[i];
-    //            if (isinf(cur)) continue;
-    //            sum += cur;
-    //        }
-    //        printf("sum: %f\n", sum);
-    //    } else if (t->type == GGML_TYPE_F16) {
-    //        ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents;
-    //        printf("data: ");
-    //        for (int i = 0; i < (int) t->ne[0]; i++) {
-    //            printf("%f ", ggml_fp16_to_fp32(data[i]));
-    //        }
-    //        printf("\n");
-    //        double sum = 0.0;
-    //        printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
-    //        for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
-    //            for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
-    //                for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
-    //                    for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
-    //                        const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
-    //                        const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
-    //                        const float curf = ggml_fp16_to_fp32(cur);
-    //                        if (isinf(curf)) continue;
-    //                        sum += curf;
-    //                    }
-    //                }
-    //            }
-    //        }
-    //        printf("sum: %f\n", sum);
-    //    } else {
-    //        GGML_ASSERT(false && "not implemented");
-    //    }
-    //}
-
     return 0;
 }
-
-float * llama_mtl_get_logits(struct ggml_mtl_context * ctx) {
-    return ctx->logits;
-}

llama.cpp

@@ -9,9 +9,6 @@
 #include "llama-util.h"
 #include "llama.h"

-// METAL
-#include "examples/mtl/mtl.h"
-
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
@@ -19,6 +16,10 @@
 #include "ggml-opencl.h"
 #endif

+#ifdef GGML_USE_METAL
+#include "ggml-mtl.h"
+#endif
+
 #include <array>
 #include <ctime>
 #include <cinttypes>
@@ -241,8 +242,9 @@ struct llama_context {
     llama_ctx_buffer buf_compute;
     llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

-    // METAL
+#ifdef GGML_USE_METAL
     ggml_mtl_context * mtl_ctx = NULL;
+#endif

     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -842,7 +844,6 @@ struct llama_context_params llama_context_default_params() {
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
         /*.embedding =*/ false,
-        /*.cgraph =*/ false,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
     };
@@ -1442,12 +1443,15 @@ static bool llama_eval_internal(
     // run the computation
     ggml_build_forward_expand(&gf, inpL);

-    // METAL
+#ifdef GGML_USE_METAL
     if (lctx.mtl_ctx) {
-        llama_mtl_eval(lctx.mtl_ctx, &gf, tokens, n_tokens, n_past);
+        ggml_mtl_graph_compute(lctx.mtl_ctx, &gf, tokens, n_tokens, n_past);
     } else {
-        ggml_graph_compute (ctx0, &gf);
+        ggml_graph_compute(ctx0, &gf);
     }
+#else
+    ggml_graph_compute(ctx0, &gf);
+#endif

     if (cgraph_fname) {
         // TODO: tmp add the vocabulary as a leaf to the computation graph, until better approach is found
@@ -2376,11 +2380,10 @@ struct llama_context * llama_init_from_file(
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }

-    // METAL
-    if (params.cgraph) {
+#ifdef GGML_USE_METAL
+    if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        //ctx->mtl_ctx = llama_mtl_init(ctx_data, ctx_eval, &gf);
-        ctx->mtl_ctx = llama_mtl_init(
+        ctx->mtl_ctx = ggml_mtl_init(
                 ggml_get_mem_buffer(ctx->model.ctx),
                 ggml_get_mem_size (ctx->model.ctx),
                 ctx->buf_compute.addr,
@@ -2389,6 +2392,7 @@ struct llama_context * llama_init_from_file(
                 ctx->model.kv_self.buf.size,
                 32*ctx->model.hparams.n_vocab*sizeof(float));
     }
+#endif

     return ctx;
 }
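Worth noting from the llama.cpp hunks above: the temporary params.cgraph flag is gone, and in a GGML_USE_METAL build the Metal context is now created whenever n_gpu_layers is non-zero. A minimal sketch of what a caller sets, with a placeholder model path that is not part of the commit:

    #include "llama.h"

    int main() {
        llama_context_params params = llama_context_default_params();
        params.n_gpu_layers = 1; // any value > 0 triggers ggml_mtl_init inside llama_init_from_file

        // placeholder path for illustration
        llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
        if (ctx == NULL) {
            return 1;
        }

        llama_free(ctx);
        return 0;
    }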

llama.h

@@ -31,7 +31,7 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1

-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
@@ -75,7 +75,6 @@ extern "C" {
         bool use_mmap; // use mmap if possible
         bool use_mlock; // force system to keep model in RAM
         bool embedding; // embedding mode only
-        bool cgraph; // try to load computation graph from "llama.ggml" (METAL)

         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;