mtl : preparing for merge
parent 4df2ef3161
commit 18e482a89c
13 changed files with 94 additions and 144 deletions

.gitignore (vendored, 1 change)

@@ -17,6 +17,7 @@ build-release/
 build-static/
 build-cublas/
 build-opencl/
+build-mtl/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/

CMakeLists.txt

@@ -71,6 +71,7 @@ option(LLAMA_CUBLAS "llama: use cuBLAS"
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
+option(LLAMA_METAL "llama: use Metal" OFF)

 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})

@@ -183,7 +184,7 @@ if (LLAMA_CUBLAS)

    enable_language(CUDA)

-   set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
+   set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)

    add_compile_definitions(GGML_USE_CUBLAS)
    add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})

@@ -200,12 +201,37 @@ if (LLAMA_CUBLAS)
     endif()
 endif()

+if (LLAMA_METAL)
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+    find_library(METAL_FRAMEWORK Metal REQUIRED)
+    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
+
+    set(GGML_SOURCES_METAL ggml-mtl.m ggml-mtl.h)
+
+    add_compile_definitions(GGML_USE_METAL)
+    add_compile_definitions(GGML_METAL_NDEBUG)
+
+    # get full path to the file
+    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
+
+    # copy ggml-mtl.metal to bin directory
+    configure_file(ggml-mtl.metal bin/ggml-mtl.metal COPYONLY)
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
+        ${FOUNDATION_LIBRARY}
+        ${METAL_FRAMEWORK}
+        ${METALKIT_FRAMEWORK}
+        ${METALPERFORMANCE_FRAMEWORK}
+    )
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
         message(STATUS "CLBlast found")

-        set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
+        set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)

         add_compile_definitions(GGML_USE_CLBLAST)

@@ -370,8 +396,10 @@ endif()
 add_library(ggml OBJECT
     ggml.c
     ggml.h
-    ${GGML_CUDA_SOURCES}
-    ${GGML_OPENCL_SOURCES})
+    ${GGML_SOURCES_CUDA}
+    ${GGML_SOURCES_OPENCL}
+    ${GGML_SOURCES_METAL}
+    )

 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump

@@ -385,8 +413,6 @@ add_library(llama
     llama.cpp
     llama.h
     llama-util.h
-    examples/mtl/mtl.h # TODO: METAL TMP
-    examples/mtl/mtl.m # TODO: METAL TMP
     )

 target_include_directories(llama PUBLIC .)

@@ -394,19 +420,14 @@ target_compile_features(llama PUBLIC cxx_std_11) # don't bump
 target_link_libraries(llama PRIVATE
     ggml
     ${LLAMA_EXTRA_LIBS}
-    ${FOUNDATION_LIBRARY} # TODO: METAL TMP
-    ${METAL_FRAMEWORK} # TODO: METAL TMP
-    ${METALKIT_FRAMEWORK} # TODO: METAL TMP
-    ${METALPERFORMANCE_FRAMEWORK} # TODO: METAL TMP
     )
-target_compile_definitions(llama PRIVATE LLAMA_MTL_NDEBUG) # TODO: METAL TMP

 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
 endif()

-if (GGML_CUDA_SOURCES)
+if (GGML_SOURCES_CUDA)
     message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
     set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
     set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")

examples/CMakeLists.txt

@@ -37,8 +37,10 @@ else()
     add_subdirectory(save-load-state)
     add_subdirectory(benchmark)
     add_subdirectory(baby-llama)
+    if (LLAMA_METAL)
     add_subdirectory(mtl)
-    if(LLAMA_BUILD_SERVER)
+    endif()
+    if (LLAMA_BUILD_SERVER)
     add_subdirectory(server)
     endif()
 endif()

examples/common.cpp

@@ -301,8 +301,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.mem_test = true;
         } else if (arg == "--export") {
             params.export_cgraph = true;
-        } else if (arg == "--import") {
-            params.import_cgraph = true;
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {

@@ -443,7 +441,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
-    fprintf(stderr, " --import import a computation graph from 'llama.ggml'\n");
     fprintf(stderr, " --verbose-prompt print prompt before generation\n");
     fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
     fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");

@@ -493,7 +490,6 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     lparams.use_mlock = params.use_mlock;
     lparams.logits_all = params.perplexity;
     lparams.embedding = params.embedding;
-    lparams.cgraph = params.import_cgraph;

     llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);

examples/common.h

@@ -72,7 +72,6 @@ struct gpt_params {
     bool use_mlock = false; // use mlock to keep model in memory
     bool mem_test = false; // compute maximum memory usage
     bool export_cgraph = false; // export the computation graph
-    bool import_cgraph = false; // import a computation graph
     bool verbose_prompt = false; // print prompt tokens before generation
 };

examples/mtl/CMakeLists.txt

@@ -1,33 +1,6 @@
 if (APPLE)
-    #
-    # mtl
-
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK Metal REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
-
     set(TEST_TARGET mtl)
-    add_executable(${TEST_TARGET} mtl.cpp mtl.h mtl.m)
-    target_link_libraries(${TEST_TARGET} PRIVATE
-        ggml
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-        ${METALPERFORMANCE_FRAMEWORK}
-    )
-
-    # TODO: temporary until the kernels are ready
-    # custom command to build mtl.metal into a library
-    # depends on the mtl.metal file
-    add_custom_target(mtl.metallib-tmp ALL DEPENDS ${CMAKE_BINARY_DIR}/mtl.metallib)
-
-    add_custom_command(
-        OUTPUT ${CMAKE_BINARY_DIR}/mtl.metallib
-        COMMAND xcrun -sdk macosx metal -c ${CMAKE_CURRENT_SOURCE_DIR}/mtl.metal -o ${CMAKE_BINARY_DIR}/mtl.air
-        COMMAND xcrun -sdk macosx metallib ${CMAKE_BINARY_DIR}/mtl.air -o ${CMAKE_BINARY_DIR}/mtl.metallib
-        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/mtl.metal
-        COMMENT "Building mtl.metallib"
-    )
+    add_executable(${TEST_TARGET} mtl.cpp)
+    target_link_libraries(${TEST_TARGET} PRIVATE ggml)
 endif()

examples/mtl/mtl.cpp

@@ -1,5 +1,5 @@
 #include "ggml.h"
-#include "mtl.h"
+#include "ggml-mtl.h"

 #include <cstdio>
 #include <cstring>

@@ -51,7 +51,7 @@ int main(int argc, char ** argv) {
     }

     // this allocates all Metal resources and memory buffers
-    auto * ctx_mtl = llama_mtl_init(
+    auto * ctx_mtl = ggml_mtl_init(
             ggml_get_mem_buffer(ctx_data),
             ggml_get_mem_size (ctx_data),
             ggml_get_mem_buffer(ctx_eval),

@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
     const std::vector<int> tmp(n_batch, 1); // BOS

     // warmup
-    llama_mtl_eval(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
+    ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);

     const int n_iter = 16;

@@ -75,7 +75,7 @@ int main(int argc, char ** argv) {

     // the actual inference happens here
     for (int i = 0; i < n_iter; ++i) {
-        llama_mtl_eval(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
+        ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
     }

     const int64_t t1 = ggml_time_us();

@@ -83,7 +83,7 @@ int main(int argc, char ** argv) {
         printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter);
     }

-    llama_mtl_free(ctx_mtl);
+    ggml_mtl_free(ctx_mtl);

     ggml_free(ctx_data);
     ggml_free(ctx_eval);
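
The example above follows the usual warmup-then-measure pattern: one throwaway evaluation, then a timed loop reported as ms/token. A minimal, self-contained sketch of that measurement logic is shown below; it uses std::chrono in place of ggml_time_us and a hypothetical run_once() as a stand-in for ggml_mtl_graph_compute, so it illustrates the pattern only and is not code from this commit.

    #include <chrono>
    #include <cstdio>

    // Hypothetical stand-in for one graph evaluation.
    static void run_once() {
        volatile double x = 0.0;
        for (int i = 0; i < 1000000; ++i) x = x + i * 0.5;
    }

    int main() {
        run_once(); // warmup, so one-time setup cost is not measured

        const int n_iter = 16;
        const auto t0 = std::chrono::steady_clock::now();
        for (int i = 0; i < n_iter; ++i) {
            run_once();
        }
        const auto t1 = std::chrono::steady_clock::now();

        const double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
        std::printf("time: %.2f ms, %.2f ms/tok\n", ms, ms / n_iter);
        return 0;
    }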

ggml-mtl.h

@@ -11,7 +11,7 @@ extern "C" {

 struct ggml_mtl_context;

-struct ggml_mtl_context * llama_mtl_init(
+struct ggml_mtl_context * ggml_mtl_init(
        void * data_buf,
        size_t data_size,
        void * eval_buf,

@@ -20,18 +20,16 @@ struct ggml_mtl_context * llama_mtl_init(
        size_t cach_size,
        size_t outp_size);

-void llama_mtl_free(struct ggml_mtl_context * ctx);
+void ggml_mtl_free(struct ggml_mtl_context * ctx);

 // return 0 on success
-int llama_mtl_eval(
+int ggml_mtl_graph_compute(
        struct ggml_mtl_context * ctx,
        struct ggml_cgraph * gf,
        const int * tokens,
        int n_tokens,
        int n_past);

-float * llama_mtl_get_logits(struct ggml_mtl_context * ctx);
-
 #ifdef __cplusplus
 }
 #endif
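
After the rename the header keeps the same shape: an opaque context created by ggml_mtl_init, evaluated with ggml_mtl_graph_compute (0 on success), and released with ggml_mtl_free. Several init arguments are elided by the hunk above, so the sketch below only illustrates the general opaque-handle lifecycle with hypothetical names; it is not the real ggml_mtl API.

    #include <cstdio>
    #include <cstdlib>

    // Hypothetical opaque context, mirroring the ggml_mtl_context pattern.
    struct demo_ctx { int n_calls; };

    static demo_ctx * demo_init()                { return (demo_ctx *) calloc(1, sizeof(demo_ctx)); }
    static int        demo_compute(demo_ctx * c) { if (!c) return 1; c->n_calls++; return 0; } // 0 on success
    static void       demo_free(demo_ctx * c)    { free(c); }

    int main() {
        demo_ctx * ctx = demo_init();
        if (demo_compute(ctx) != 0) {
            std::fprintf(stderr, "compute failed\n");
            demo_free(ctx);
            return 1;
        }
        std::printf("calls: %d\n", ctx->n_calls);
        demo_free(ctx);
        return 0;
    }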

ggml-mtl.m

@@ -1,4 +1,4 @@
-#import "mtl.h"
+#import "ggml-mtl.h"

 #import "ggml.h"

@@ -6,7 +6,7 @@
 #import <Metal/Metal.h>
 #import <MetalPerformanceShaders/MetalPerformanceShaders.h>

-#ifdef LLAMA_MTL_NDEBUG
+#ifdef GGML_METAL_NDEBUG
 #define mtl_printf(...)
 #else
 #define mtl_printf(...) fprintf(stderr, __VA_ARGS__)

@@ -85,9 +85,9 @@ struct ggml_mtl_context {
 // MSL code
 // TODO: move the contents here when ready
 // for now it is easier to work in a separate file
-NSString * const msl_library_llama = @"see mtl.metal";
+NSString * const msl_library_source = @"see mtl.metal";

-struct ggml_mtl_context * llama_mtl_init(
+struct ggml_mtl_context * ggml_mtl_init(
        void * data_buf,
        size_t data_size,
        void * eval_buf,

@@ -122,7 +122,7 @@ struct ggml_mtl_context * llama_mtl_init(
     {
         NSError * error = nil;

-        ctx->library = [ctx->device newLibraryWithSource:msl_library_llama options:nil error:&error];
+        ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
         if (error) {
             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
             exit(1);

@@ -133,7 +133,10 @@ struct ggml_mtl_context * llama_mtl_init(
     {
         NSError * error = nil;

-        NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/mtl/mtl" ofType:@"metal"];
+        //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/mtl/mtl" ofType:@"metal"];
+        NSString * path = [[NSBundle mainBundle] pathForResource:@"ggml-mtl" ofType:@"metal"];
+        fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]);

         NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
         if (error) {
             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);

@@ -220,7 +223,7 @@ struct ggml_mtl_context * llama_mtl_init(
     // TODO: how to use MTLStorageModeManaged?
     // TODO: see if we can avoid this copy somehow
     {
-        void * mem_buffer = data_buf;
+        const void * mem_buffer = data_buf;
         const size_t mem_size = data_size;

         //ctx->buffer_data = [ctx->device newBufferWithBytesNoCopy:mem_buffer length:mem_size options:MTLResourceStorageModeShared deallocator:nil];

@@ -261,18 +264,20 @@ struct ggml_mtl_context * llama_mtl_init(
     return ctx;
 }

-void llama_mtl_free(struct ggml_mtl_context * ctx) {
+void ggml_mtl_free(struct ggml_mtl_context * ctx) {
     fprintf(stderr, "%s: deallocating\n", __func__);

     free(ctx);
 }

 // get data / eval buffer + offset
-id<MTLBuffer> llama_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
+id<MTLBuffer> ggml_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
     const int64_t offs_data = (int64_t) t->data - (int64_t) ctx->data_buf;
     const int64_t offs_eval = (int64_t) t->data - (int64_t) ctx->eval_buf;
     const int64_t offs_cach = (int64_t) t->data - (int64_t) ctx->cach_buf;

+    //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
+
     //const size_t t_size = ggml_nbytes(t);

     id<MTLBuffer> result;

@@ -317,7 +322,7 @@ id<MTLBuffer> llama_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_te
     return result;
 }

-int llama_mtl_eval(
+int ggml_mtl_graph_compute(
        struct ggml_mtl_context * ctx,
        struct ggml_cgraph * gf,
        const int * tokens,

@@ -336,7 +341,7 @@ int llama_mtl_eval(
     {
         struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");

-        id<MTLBuffer> id_dst = llama_mtl_get_buffer(ctx, embd, &offs_src0);
+        id<MTLBuffer> id_dst = ggml_mtl_get_buffer(ctx, embd, &offs_src0);

         memcpy((char *) id_dst.contents + offs_src0, embd->data, ggml_nbytes(embd));
     }

@@ -385,9 +390,9 @@ int llama_mtl_eval(
         const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
         const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;

-        id<MTLBuffer> id_src0 = src0 ? llama_mtl_get_buffer(ctx, src0, &offs_src0) : nil;
-        id<MTLBuffer> id_src1 = src1 ? llama_mtl_get_buffer(ctx, src1, &offs_src1) : nil;
-        id<MTLBuffer> id_dst = dst ? llama_mtl_get_buffer(ctx, dst, &offs_dst) : nil;
+        id<MTLBuffer> id_src0 = src0 ? ggml_mtl_get_buffer(ctx, src0, &offs_src0) : nil;
+        id<MTLBuffer> id_src1 = src1 ? ggml_mtl_get_buffer(ctx, src1, &offs_src1) : nil;
+        id<MTLBuffer> id_dst = dst ? ggml_mtl_get_buffer(ctx, dst, &offs_dst) : nil;

         //mtl_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
         //if (src0) {

@@ -775,7 +780,7 @@ int llama_mtl_eval(

     struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1];

-    id<MTLBuffer> id_src = llama_mtl_get_buffer(ctx, out, &offs_src0);
+    id<MTLBuffer> id_src = ggml_mtl_get_buffer(ctx, out, &offs_src0);
     id<MTLBuffer> id_dst = ctx->out;

     id<MTLBlitCommandEncoder> encoder_blit = [command_buffer blitCommandEncoder];

@@ -817,53 +822,5 @@ int llama_mtl_eval(
     mtl_printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
 #endif

-    //{
-    //    struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check");
-    //    if (t->type == GGML_TYPE_F32) {
-    //        const const float * data = (float *) ctx->out.contents;
-    //        printf("data: ");
-    //        for (int i = 0; i < (int) t->ne[0]; i++) {
-    //            printf("%f ", data[i]);
-    //        }
-    //        printf("\n");
-    //        double sum = 0.0;
-    //        for (int i = 0; i < ggml_nelements(t); i++) {
-    //            double cur = data[i];
-    //            if (isinf(cur)) continue;
-    //            sum += cur;
-    //        }
-    //        printf("sum: %f\n", sum);
-    //    } else if (t->type == GGML_TYPE_F16) {
-    //        ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents;
-    //        printf("data: ");
-    //        for (int i = 0; i < (int) t->ne[0]; i++) {
-    //            printf("%f ", ggml_fp16_to_fp32(data[i]));
-    //        }
-    //        printf("\n");
-    //        double sum = 0.0;
-    //        printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
-    //        for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
-    //            for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
-    //                for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
-    //                    for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
-    //                        const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
-    //                        const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
-    //                        const float curf = ggml_fp16_to_fp32(cur);
-    //                        if (isinf(curf)) continue;
-    //                        sum += curf;
-    //                    }
-    //                }
-    //            }
-    //        }
-    //        printf("sum: %f\n", sum);
-    //    } else {
-    //        GGML_ASSERT(false && "not implemented");
-    //    }
-    //}
-
     return 0;
 }
-
-float * llama_mtl_get_logits(struct ggml_mtl_context * ctx) {
-    return ctx->logits;
-}
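
ggml_mtl_get_buffer above maps a tensor onto one of the pre-registered buffers by turning t->data into a byte offset from each buffer's base pointer and picking the one that falls in range. A small standalone sketch of that pointer arithmetic, with illustrative names and sizes only, is:

    #include <cstdint>
    #include <cstdio>

    // Byte offset of ptr inside the allocation starting at base
    // (same integer casts as the original code).
    static int64_t buffer_offset(const void * base, const void * ptr) {
        return (int64_t) ptr - (int64_t) base;
    }

    int main() {
        static char data_buf[1024]; // stands in for the registered "data" buffer
        static char eval_buf[1024]; // stands in for the registered "eval" buffer

        const void * tensor_data = eval_buf + 128; // pretend this is t->data

        const int64_t offs_data = buffer_offset(data_buf, tensor_data);
        const int64_t offs_eval = buffer_offset(eval_buf, tensor_data);

        // The offset that lies inside [0, buffer size) identifies the owning buffer.
        if (offs_eval >= 0 && offs_eval < (int64_t) sizeof(eval_buf)) {
            std::printf("tensor lives in eval_buf at offset %lld\n", (long long) offs_eval);
        } else if (offs_data >= 0 && offs_data < (int64_t) sizeof(data_buf)) {
            std::printf("tensor lives in data_buf at offset %lld\n", (long long) offs_data);
        }
        return 0;
    }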

llama.cpp (28 changes)

@@ -9,9 +9,6 @@
 #include "llama-util.h"
 #include "llama.h"

-// METAL
-#include "examples/mtl/mtl.h"
-
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"

@@ -19,6 +16,10 @@
 #include "ggml-opencl.h"
 #endif

+#ifdef GGML_USE_METAL
+#include "ggml-mtl.h"
+#endif
+
 #include <array>
 #include <ctime>
 #include <cinttypes>

@@ -241,8 +242,9 @@ struct llama_context {
     llama_ctx_buffer buf_compute;
     llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

-    // METAL
+#ifdef GGML_USE_METAL
     ggml_mtl_context * mtl_ctx = NULL;
+#endif

     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };

@@ -842,7 +844,6 @@ struct llama_context_params llama_context_default_params() {
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
         /*.embedding =*/ false,
-        /*.cgraph =*/ false,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
     };

@@ -1442,12 +1443,15 @@ static bool llama_eval_internal(
     // run the computation
     ggml_build_forward_expand(&gf, inpL);

-    // METAL
+#ifdef GGML_USE_METAL
     if (lctx.mtl_ctx) {
-        llama_mtl_eval(lctx.mtl_ctx, &gf, tokens, n_tokens, n_past);
+        ggml_mtl_graph_compute(lctx.mtl_ctx, &gf, tokens, n_tokens, n_past);
     } else {
-        ggml_graph_compute (ctx0, &gf);
+        ggml_graph_compute(ctx0, &gf);
     }
+#else
+    ggml_graph_compute(ctx0, &gf);
+#endif

     if (cgraph_fname) {
         // TODO: tmp add the vocabulary as a leaf to the computation graph, until better approach is found

@@ -2376,11 +2380,10 @@ struct llama_context * llama_init_from_file(
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }

-    // METAL
-    if (params.cgraph) {
+#ifdef GGML_USE_METAL
+    if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        //ctx->mtl_ctx = llama_mtl_init(ctx_data, ctx_eval, &gf);
-        ctx->mtl_ctx = llama_mtl_init(
+        ctx->mtl_ctx = ggml_mtl_init(
                 ggml_get_mem_buffer(ctx->model.ctx),
                 ggml_get_mem_size (ctx->model.ctx),
                 ctx->buf_compute.addr,

@@ -2389,6 +2392,7 @@ struct llama_context * llama_init_from_file(
                 ctx->model.kv_self.buf.size,
                 32*ctx->model.hparams.n_vocab*sizeof(float));
     }
+#endif

     return ctx;
 }
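
llama_eval_internal now combines a compile-time gate on GGML_USE_METAL with a runtime check of the Metal context, falling back to the CPU path in either case. A hedged, self-contained sketch of that dispatch pattern follows; compute_cpu, compute_metal, and eval_ctx are stand-ins for illustration, not functions from this commit.

    #include <cstdio>

    struct eval_ctx { void * mtl_ctx = nullptr; };

    static void compute_cpu()   { std::printf("CPU graph compute\n"); }
    static void compute_metal() { std::printf("Metal graph compute\n"); }

    static void run(eval_ctx & lctx) {
    #ifdef GGML_USE_METAL
        if (lctx.mtl_ctx) {
            compute_metal();   // Metal context was created at init time
        } else {
            compute_cpu();     // Metal build, but no context (e.g. no GPU layers)
        }
    #else
        (void) lctx;
        compute_cpu();         // non-Metal build: always the CPU path
    #endif
    }

    int main() {
        eval_ctx lctx;
        run(lctx);
        return 0;
    }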

llama.h (3 changes)

@@ -31,7 +31,7 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1

-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif

@@ -75,7 +75,6 @@ extern "C" {
     bool use_mmap; // use mmap if possible
     bool use_mlock; // force system to keep model in RAM
     bool embedding; // embedding mode only
-    bool cgraph; // try to load computation graph from "llama.ggml" (METAL)

     // called with a progress value between 0 and 1, pass NULL to disable
     llama_progress_callback progress_callback;
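
Because GGML_USE_METAL now also defines LLAMA_SUPPORTS_GPU_OFFLOAD, application code can keep the same feature check it already used for cuBLAS and CLBlast builds. A minimal sketch, assuming llama.h is on the include path:

    #include <cstdio>
    #include "llama.h"

    int main() {
    #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
        std::printf("this build can offload model layers to the GPU\n");
    #else
        std::printf("CPU-only build\n");
    #endif
        return 0;
    }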