llama : merge gguf-util.h in llama.cpp

Georgi Gerganov 2023-08-15 22:09:56 +03:00
parent a02b809a2e
commit afd135a64c
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
4 changed files with 450 additions and 501 deletions

Makefile

@@ -332,7 +332,7 @@ OBJS += ggml-alloc.o
 llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-gguf-llama.o: gguf-llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h gguf-llama.h gguf-util.h
+gguf-llama.o: gguf-llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h gguf-llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 common.o: examples/common.cpp examples/common.h

examples/gguf/gguf.cpp

@@ -1,5 +1,4 @@
 #include "ggml.h"
-#include "gguf-util.h"
 #include "gguf-llama.h"
 
 #include <cstdio>
@@ -195,6 +194,15 @@ bool gguf_ex_read_1(const std::string & fname) {
         fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
 
+        // print first 10 elements
+        const float * data = (const float *) cur->data;
+
+        printf("%s data[:10] : ", name);
+        for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
+            printf("%f ", data[j]);
+        }
+        printf("\n\n");
+
         // check data
         {
             const float * data = (const float *) cur->data;
@@ -216,48 +224,6 @@ bool gguf_ex_read_1(const std::string & fname) {
     return true;
 }
 
-// read just the tensor info and mmap the data in user code
-bool gguf_ex_read_2(const std::string & fname) {
-    struct ggml_context * ctx_data = NULL;
-
-    struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ &ctx_data,
-    };
-
-    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
-
-    struct gguf_file file(fname.c_str(), "rb");
-    gguf_mmap data_mmap(&file, 0, false);
-
-    const int n_tensors = gguf_get_n_tensors(ctx);
-
-    for (int i = 0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(ctx, i);
-        const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
-
-        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
-
-        cur->data = static_cast<char *>(data_mmap.addr) + offset;
-
-        // print first 10 elements
-        const float * data = (const float *) cur->data;
-
-        printf("%s data[:10] : ", name);
-        for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
-            printf("%f ", data[j]);
-        }
-        printf("\n\n");
-    }
-
-    fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
-
-    ggml_free(ctx_data);
-    gguf_free(ctx);
-
-    return true;
-}
-
 int main(int argc, char ** argv) {
     if (argc < 3) {
         fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
@@ -274,7 +240,6 @@ int main(int argc, char ** argv) {
     } else if (mode == "r") {
         GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
         GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
-        GGML_ASSERT(gguf_ex_read_2(fname) && "failed to read gguf file");
    } else if (mode == "q") {
         llama_model_quantize_params params = llama_model_quantize_default_params();
         llama_model_quantize(fname.c_str(), "quant.gguf", &params);
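With gguf_ex_read_2 gone, the example no longer demonstrates user-managed mmap; only the allocating path above survives, where gguf_init_from_file reads the tensor data into a ggml context. For orientation, a minimal sketch of that surviving pattern, using only calls that appear in this diff (the helper name read_all_tensors is hypothetical, and gguf_init_from_file is assumed to return NULL on failure):

// illustrative only, not part of the commit
static bool read_all_tensors(const std::string & fname) {
    struct ggml_context * ctx_data = NULL;

    struct gguf_init_params params = {
        /*.no_alloc = */ false, // allocate and read tensor data into ctx_data
        /*.ctx      = */ &ctx_data,
    };

    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
    if (ctx == NULL) {
        return false;
    }

    const int n_tensors = gguf_get_n_tensors(ctx);
    for (int i = 0; i < n_tensors; ++i) {
        const char * name = gguf_get_tensor_name(ctx, i);
        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
        fprintf(stdout, "tensor[%d]: name = %s, n_dims = %d\n", i, name, cur->n_dims);
    }

    fprintf(stdout, "ctx_data size: %zu\n", ggml_get_mem_size(ctx_data));

    ggml_free(ctx_data);
    gguf_free(ctx);
    return true;
}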

gguf-llama.cpp

@@ -6,31 +6,61 @@
 #include <cstdio>
 #endif
 
-#include "gguf-util.h"
 #define LLAMA_API_CPP // TODO: eliminate me
 #include "gguf-llama.h"
 
 #include "ggml.h"
 
 #ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
+# include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
-#include "ggml-opencl.h"
+# include "ggml-opencl.h"
 #endif
 
 #ifdef GGML_USE_METAL
-#include "ggml-metal.h"
+# include "ggml-metal.h"
 #endif
 
 #ifdef GGML_USE_MPI
-#include "ggml-mpi.h"
+# include "ggml-mpi.h"
 #endif
 
 #ifdef GGML_USE_K_QUANTS
-#ifndef QK_K
-#ifdef GGML_QKK_64
-#define QK_K 64
-#else
-#define QK_K 256
-#endif
-#endif
+# ifndef QK_K
+# ifdef GGML_QKK_64
+# define QK_K 64
+# else
+# define QK_K 256
+# endif
+# endif
 #endif
 
+#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+# include "ggml-alloc.h"
+# define LLAMA_USE_ALLOCATOR
+#else
+# define LLAMA_USE_SCRATCH
+# define LLAMA_MAX_SCRATCH_BUFFERS 16
+#endif
+
+#ifdef __has_include
+#if __has_include(<unistd.h>)
+#include <unistd.h>
+#if defined(_POSIX_MAPPED_FILES)
+#include <sys/mman.h>
+#endif
+#if defined(_POSIX_MEMLOCK_RANGE)
+#include <sys/resource.h>
+#endif
+#endif
+#endif
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <io.h>
+#include <stdio.h> // for _fseeki64
+#endif
+
 #include <array>
@@ -70,12 +100,20 @@
 #define TN_FFN_DOWN "blk.%d.ffn_down.weight"
 #define TN_FFN_UP   "blk.%d.ffn_up.weight"
 
+//
+// logging
+//
+
 static void llama_log_internal(llama_log_level level, const char* format, ...);
 static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
 
 #define LLAMA_LOG_INFO(...)  llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...)  llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
 
+//
+// helpers
+//
+
 template<typename T>
 static std::string to_string(const T & val) {
     std::stringstream ss;
@@ -90,25 +128,69 @@ static void zeros(std::ofstream & file, size_t n) {
     }
 }
 
-#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
-#include "ggml-alloc.h"
-#define LLAMA_USE_ALLOCATOR
-#else
-#define LLAMA_USE_SCRATCH
-#define LLAMA_MAX_SCRATCH_BUFFERS 16
-#endif
-
-#define UNUSED GGML_UNUSED
+#ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
+__attribute__((format(printf, 1, 2)))
+#endif
+#endif
+static std::string format(const char * fmt, ...) {
+    va_list ap, ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
+//
+// ggml helpers
+//
+
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+    ggml_graph_compute(graph, &plan);
+}
+
+//
+// llama helpers
+//
 
 #ifdef GGML_USE_CUBLAS
-#define llama_host_malloc(n) ggml_cuda_host_malloc(n)
-#define llama_host_free(data) ggml_cuda_host_free(data)
+# define llama_host_malloc(n) ggml_cuda_host_malloc(n)
+# define llama_host_free(data) ggml_cuda_host_free(data)
 #elif GGML_USE_METAL
-#define llama_host_malloc(n) ggml_metal_host_malloc(n)
-#define llama_host_free(data) ggml_metal_host_free(data)
+# define llama_host_malloc(n) ggml_metal_host_malloc(n)
+# define llama_host_free(data) ggml_metal_host_free(data)
 #else
-#define llama_host_malloc(n) malloc(n)
-#define llama_host_free(data) free(data)
+# define llama_host_malloc(n) malloc(n)
+# define llama_host_free(data) free(data)
 #endif
 
+#if defined(_WIN32)
+static std::string llama_format_win_err(DWORD err) {
+    LPSTR buf;
+    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
+    if (!size) {
+        return "FormatMessageA failed";
+    }
+    std::string ret(buf, size);
+    LocalFree(buf);
+    return ret;
+}
+#endif
+
 struct llama_buffer {
@@ -147,27 +229,328 @@ struct llama_buffer {
     }
 };
 
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size;
+
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+        }
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        GGML_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
+    }
+
+    void seek(size_t offset, int whence) {
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        GGML_ASSERT(ret == 0); // same
+    }
+
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
+            return;
+        }
+        errno = 0;
+        std::size_t ret = std::fread(ptr, len, 1, fp);
+        if (ferror(fp)) {
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
+        }
+        if (ret != 1) {
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
+        }
+    }
+
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
+            return;
+        }
+        errno = 0;
+        size_t ret = std::fwrite(ptr, len, 1, fp);
+        if (ret != 1) {
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
+        }
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+};
+
+struct llama_mmap {
+    void * addr;
+    size_t size;
+
+    llama_mmap(const llama_mmap &) = delete;
+
+#ifdef _POSIX_MAPPED_FILES
+    static constexpr bool SUPPORTED = true;
+
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
+        size = file->size;
+        int fd = fileno(file->fp);
+        int flags = MAP_SHARED;
+        // prefetch/readahead impairs performance on NUMA systems
+        if (numa) { prefetch = 0; }
+#ifdef __linux__
+        if (prefetch) { flags |= MAP_POPULATE; }
+#endif
+        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
+        if (addr == MAP_FAILED) {
+            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
+        }
+
+        if (prefetch > 0) {
+            // Advise the kernel to preload the mapped memory
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
+                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                        strerror(errno));
+            }
+        }
+        if (numa) {
+            // advise the kernel not to use readahead
+            // (because the next page might not belong on the same node)
+            if (madvise(addr, file->size, MADV_RANDOM)) {
+                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+                        strerror(errno));
+            }
+        }
+    }
+
+    ~llama_mmap() {
+        munmap(addr, size);
+    }
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+
+    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
+        (void) numa;
+
+        size = file->size;
+
+        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
+
+        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+        DWORD error = GetLastError();
+
+        if (hMapping == NULL) {
+            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
+        }
+
+        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+        error = GetLastError();
+        CloseHandle(hMapping);
+
+        if (addr == NULL) {
+            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
+        }
+
+#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T) size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
+        }
+#else
+#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
+#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
+    }
+
+    ~llama_mmap() {
+        if (!UnmapViewOfFile(addr)) {
+            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
+        (void) file;
+        (void) prefetch;
+        (void) numa;
+
+        throw std::runtime_error(std::string("mmap not supported"));
+    }
+#endif
+};
+
+// Represents some region of memory being locked using mlock or VirtualLock;
+// will automatically unlock on destruction.
+struct llama_mlock {
+    void * addr = NULL;
+    size_t size = 0;
+
+    bool failed_already = false;
+
+    llama_mlock() {}
+    llama_mlock(const llama_mlock &) = delete;
+
+    ~llama_mlock() {
+        if (size) {
+            raw_unlock(addr, size);
+        }
+    }
+
+    void init(void * ptr) {
+        GGML_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
+    }
+
+    void grow_to(size_t target_size) {
+        GGML_ASSERT(addr);
+        if (failed_already) {
+            return;
+        }
+        size_t granularity = lock_granularity();
+        target_size = (target_size + granularity - 1) & ~(granularity - 1);
+        if (target_size > size) {
+            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
+                size = target_size;
+            } else {
+                failed_already = true;
+            }
+        }
+    }
+
+#ifdef _POSIX_MEMLOCK_RANGE
+    static constexpr bool SUPPORTED = true;
+
+    size_t lock_granularity() {
+        return (size_t) sysconf(_SC_PAGESIZE);
+    }
+
+#ifdef __APPLE__
+#define MLOCK_SUGGESTION \
+    "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
+    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
+#else
+#define MLOCK_SUGGESTION \
+    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
+#endif
+
+    bool raw_lock(const void * addr, size_t size) {
+        if (!mlock(addr, size)) {
+            return true;
+        } else {
+            char* errmsg = std::strerror(errno);
+            bool suggest = (errno == ENOMEM);
+
+            // Check if the resource limit is fine after all
+            struct rlimit lock_limit;
+            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
+                suggest = false;
+            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
+                suggest = false;
+
+            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
+            return false;
+        }
+    }
+
+#undef MLOCK_SUGGESTION
+
+    void raw_unlock(void * addr, size_t size) {
+        if (munlock(addr, size)) {
+            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
+        }
+    }
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+
+    size_t lock_granularity() {
+        SYSTEM_INFO si;
+        GetSystemInfo(&si);
+        return (size_t) si.dwPageSize;
+    }
+    bool raw_lock(void * ptr, size_t len) {
+        for (int tries = 1; ; tries++) {
+            if (VirtualLock(ptr, len)) {
+                return true;
+            }
+            if (tries == 2) {
+                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
+                        len, size, llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+
+            // It failed but this was only the first try; increase the working
+            // set size and try again.
+            SIZE_T min_ws_size, max_ws_size;
+            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
+                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+            // Per MSDN: "The maximum number of pages that a process can lock
+            // is equal to the number of pages in its minimum working set minus
+            // a small overhead."
+            // Hopefully a megabyte is enough overhead:
+            size_t increment = len + 1048576;
+            // The minimum must be <= the maximum, so we need to increase both:
+            min_ws_size += increment;
+            max_ws_size += increment;
+            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
+                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+        }
+    }
+
+    void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
+            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    size_t lock_granularity() {
+        return (size_t) 65536;
+    }
+
+    bool raw_lock(const void * addr, size_t len) {
+        fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
+    }
+
+    void raw_unlock(const void * addr, size_t len) {}
+#endif
+};
 
 typedef void (*offload_func_t)(struct ggml_tensor * tensor);
 
 void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
-//
-// ggml helpers
-//
-
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-    ggml_graph_compute(graph, &plan);
-}
-
 //
 // memory sizes (calculated for n_batch == 512)
 //
@@ -384,11 +767,11 @@ struct llama_model {
     llama_buffer buf;
 
     // model memory mapped file
-    std::unique_ptr<gguf_mmap> mapping;
+    std::unique_ptr<llama_mmap> mapping;
 
     // objects representing data potentially being locked in memory
-    gguf_mlock mlock_buf;
-    gguf_mlock mlock_mmap;
+    llama_mlock mlock_buf;
+    llama_mlock mlock_mmap;
 
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
@@ -565,7 +948,7 @@ enum gguf_file_version {
 };
 
 struct gguf_file_loader {
-    gguf_file file;
+    llama_file file;
     gguf_context * ctx_gguf;
     gguf_file_version file_version;
@@ -643,11 +1026,11 @@ struct llama_model_loader {
     bool use_mmap;
     size_t num_ggml_tensors_created = 0;
     struct ggml_context * ggml_ctx = NULL;
-    std::unique_ptr<gguf_mmap> mapping;
+    std::unique_ptr<llama_mmap> mapping;
 
     llama_model_loader(const std::string & fname_base, bool use_mmap) {
         file_loader = std::unique_ptr<gguf_file_loader>(new gguf_file_loader(fname_base.c_str(), tensors_map));
-        if (!gguf_mmap::SUPPORTED) {
+        if (!llama_mmap::SUPPORTED) {
             use_mmap = false;
         }
         this->use_mmap = use_mmap;
@@ -707,13 +1090,13 @@ struct llama_model_loader {
         if (use_mmap) {
             lt.data = (uint8_t *) mapping->addr + lt.file_off;
         } else {
-            gguf_file & file = file_loader->file;
+            llama_file & file = file_loader->file;
             file.seek(lt.file_off, SEEK_SET);
             file.read_raw(lt.data, lt.size);
         }
     }
 
-    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
+    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
         size_t prefetch_size = 0;
         size_t lock_size = 0;
@@ -726,7 +1109,7 @@ struct llama_model_loader {
         }
 
         if (use_mmap) {
-            mapping.reset(new gguf_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
+            mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
@@ -748,7 +1131,7 @@ struct llama_model_loader {
             load_data_for(lt);
 
-            switch(lt.ggml_tensor->backend) {
+            switch (lt.ggml_tensor->backend) {
                 case GGML_BACKEND_CPU:
                     lt.ggml_tensor->data = lt.data;
                     if (use_mmap && lmlock) {
@@ -871,11 +1254,11 @@ int llama_max_devices(void) {
 }
 
 bool llama_mmap_supported(void) {
-    return gguf_mmap::SUPPORTED;
+    return llama_mmap::SUPPORTED;
 }
 
 bool llama_mlock_supported(void) {
-    return gguf_mlock::SUPPORTED;
+    return llama_mlock::SUPPORTED;
 }
 
 void llama_backend_init(bool numa) {
@@ -982,7 +1365,6 @@ static void llama_model_load_internal(
     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
 
     model.n_gpu_layers = n_gpu_layers;
-    gguf_file_version file_version = ml->file_loader->file_version;
 
     auto & hparams = model.hparams;
@@ -1069,7 +1451,7 @@ static void llama_model_load_internal(
     }
 
     {
-        LLAMA_LOG_INFO("%s: format = %s\n", __func__, gguf_file_version_name(file_version));
+        LLAMA_LOG_INFO("%s: format = %s\n", __func__, gguf_file_version_name(ml->file_loader->file_version));
         LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
         LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
         LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
@@ -1935,15 +2317,15 @@ static bool llama_is_eos_token(const llama_vocab & vocab, llama_token token) {
 }
 
 static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token token) {
-    UNUSED(vocab);
-    UNUSED(token);
+    GGML_UNUSED(vocab);
+    GGML_UNUSED(token);
     // TODO: improve?
     return false;
 }
 
 static bool llama_is_unused_token(const llama_vocab & vocab, llama_token token) {
-    UNUSED(vocab);
-    UNUSED(token);
+    GGML_UNUSED(vocab);
+    GGML_UNUSED(token);
     // TODO: improve?
     return false;
 }
@@ -2598,7 +2980,6 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
     }
 }
 
-
 void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
     // Reference implementation:
     // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
@@ -3615,7 +3996,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new gguf_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
@@ -4143,7 +4524,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 }
 
 static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
-    gguf_file file(path_session, "rb");
+    llama_file file(path_session, "rb");
     GGML_UNUSED(ctx);
     GGML_UNUSED(path_session);
     GGML_UNUSED(tokens_out);
@@ -4164,7 +4545,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
 }
 
 bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
-    gguf_file file(path_session, "wb");
+    llama_file file(path_session, "wb");
     GGML_UNUSED(ctx);
     GGML_UNUSED(tokens);
    GGML_UNUSED(n_token_count);
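The core of the change is now visible end to end: gguf-util.h's gguf_file, gguf_mmap and gguf_mlock reappear in gguf-llama.cpp as llama_file, llama_mmap and llama_mlock, and every use site is renamed to match. For reference, a minimal sketch of how the first two compose, mirroring what llama_model_loader::load_all_data does with the merged structs above (the helper name print_mapped_size is hypothetical; this is illustrative, not part of the commit):

// illustrative only: open a file and map it; RAII handles cleanup
static void print_mapped_size(const char * fname) {
    llama_file file(fname, "rb"); // throws std::runtime_error if the open fails

    if (!llama_mmap::SUPPORTED) {
        fprintf(stderr, "mmap not supported on this platform\n");
        return;
    }

    llama_mmap mapping(&file); // prefetch defaults to the whole file

    // the file contents are now readable through mapping.addr
    const uint8_t * data = (const uint8_t *) mapping.addr;
    fprintf(stdout, "%s: mapped %zu bytes, first byte = 0x%02x\n",
            fname, mapping.size, mapping.size > 0 ? data[0] : 0);
} // ~llama_mmap and ~llama_file unmap and close here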

gguf-util.h (deleted)

@@ -1,397 +0,0 @@
-// GGUF counterpart of llama-util.h.
-// we may consider making it a part of ggml.c once GGUF work is complete.
-// this will require extra work to migrate this to pure C.
-// Contains wrappers around OS interfaces.
-
-#ifndef GGUF_UTIL_H
-#define GGUF_UTIL_H
-
-#include "ggml.h"
-
-#include <cstdio>
-#include <cstdint>
-#include <cerrno>
-#include <cstring>
-#include <cstdarg>
-#include <cstdlib>
-#include <climits>
-
-#include <string>
-#include <sstream>
-#include <vector>
-#include <stdexcept>
-
-#ifdef __has_include
-#if __has_include(<unistd.h>)
-#include <unistd.h>
-#if defined(_POSIX_MAPPED_FILES)
-#include <sys/mman.h>
-#endif
-#if defined(_POSIX_MEMLOCK_RANGE)
-#include <sys/resource.h>
-#endif
-#endif
-#endif
-
-#if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#include <io.h>
-#include <stdio.h> // for _fseeki64
-#endif
-
-#ifdef __GNUC__
-#ifdef __MINGW32__
-__attribute__((format(gnu_printf, 1, 2)))
-#else
-__attribute__((format(printf, 1, 2)))
-#endif
-#endif
-static std::string format(const char * fmt, ...) {
-    va_list ap, ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    int size = vsnprintf(NULL, 0, fmt, ap);
-    GGML_ASSERT(size >= 0 && size < INT_MAX);
-    std::vector<char> buf(size + 1);
-    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-    GGML_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-    return std::string(buf.data(), size);
-}
-
-// TODO: can we merge this one and gguf_context?
-struct gguf_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
-
-    gguf_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
-        }
-        seek(0, SEEK_END);
-        size = tell();
-        seek(0, SEEK_SET);
-    }
-
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
-
-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        GGML_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t len) const {
-        if (len == 0) {
-            return;
-        }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, len, 1, fp);
-        if (ferror(fp)) {
-            throw std::runtime_error(format("read error: %s", strerror(errno)));
-        }
-        if (ret != 1) {
-            throw std::runtime_error(std::string("unexpectedly reached end of file"));
-        }
-    }
-
-    void write_raw(const void * ptr, size_t len) const {
-        if (len == 0) {
-            return;
-        }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, len, 1, fp);
-        if (ret != 1) {
-            throw std::runtime_error(format("write error: %s", strerror(errno)));
-        }
-    }
-
-    ~gguf_file() {
-        if (fp) {
-            std::fclose(fp);
-        }
-    }
-};
-
-#if defined(_WIN32)
-static std::string gguf_format_win_err(DWORD err) {
-    LPSTR buf;
-    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
-                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
-    if (!size) {
-        return "FormatMessageA failed";
-    }
-    std::string ret(buf, size);
-    LocalFree(buf);
-    return ret;
-}
-#endif
-
-struct gguf_mmap {
-    void * addr;
-    size_t size;
-
-    gguf_mmap(const gguf_mmap &) = delete;
-
-#ifdef _POSIX_MAPPED_FILES
-    static constexpr bool SUPPORTED = true;
-
-    gguf_mmap(struct gguf_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
-        size = file->size;
-        int fd = fileno(file->fp);
-        int flags = MAP_SHARED;
-        // prefetch/readahead impairs performance on NUMA systems
-        if (numa) { prefetch = 0; }
-#ifdef __linux__
-        if (prefetch) { flags |= MAP_POPULATE; }
-#endif
-        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        if (addr == MAP_FAILED) {
-            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
-        }
-
-        if (prefetch > 0) {
-            // Advise the kernel to preload the mapped memory
-            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
-                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
-                        strerror(errno));
-            }
-        }
-        if (numa) {
-            // advise the kernel not to use readahead
-            // (because the next page might not belong on the same node)
-            if (madvise(addr, file->size, MADV_RANDOM)) {
-                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
-                        strerror(errno));
-            }
-        }
-    }
-
-    ~gguf_mmap() {
-        munmap(addr, size);
-    }
-#elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
-
-    gguf_mmap(struct gguf_file * file, bool prefetch = true, bool numa = false) {
-        (void) numa;
-
-        size = file->size;
-
-        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
-
-        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-        DWORD error = GetLastError();
-
-        if (hMapping == NULL) {
-            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
-        }
-
-        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-        error = GetLastError();
-        CloseHandle(hMapping);
-
-        if (addr == NULL) {
-            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
-        }
-
-#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-        if (prefetch) {
-            // Advise the kernel to preload the mapped memory
-            WIN32_MEMORY_RANGE_ENTRY range;
-            range.VirtualAddress = addr;
-            range.NumberOfBytes = (SIZE_T) size;
-            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                        gguf_format_win_err(GetLastError()).c_str());
-            }
-        }
-#else
-#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
-#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
-    }
-
-    ~gguf_mmap() {
-        if (!UnmapViewOfFile(addr)) {
-            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
-        }
-    }
-#else
-    static constexpr bool SUPPORTED = false;
-
-    gguf_mmap(struct gguf_file * file, bool prefetch = true, bool numa = false) {
-        (void) file;
-        (void) prefetch;
-        (void) numa;
-
-        throw std::runtime_error(std::string("mmap not supported"));
-    }
-#endif
-};
-
-// Represents some region of memory being locked using mlock or VirtualLock;
-// will automatically unlock on destruction.
-struct gguf_mlock {
-    void * addr = NULL;
-    size_t size = 0;
-
-    bool failed_already = false;
-
-    gguf_mlock() {}
-    gguf_mlock(const gguf_mlock &) = delete;
-
-    ~gguf_mlock() {
-        if (size) {
-            raw_unlock(addr, size);
-        }
-    }
-
-    void init(void * ptr) {
-        GGML_ASSERT(addr == NULL && size == 0);
-        addr = ptr;
-    }
-
-    void grow_to(size_t target_size) {
-        GGML_ASSERT(addr);
-        if (failed_already) {
-            return;
-        }
-        size_t granularity = lock_granularity();
-        target_size = (target_size + granularity - 1) & ~(granularity - 1);
-        if (target_size > size) {
-            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
-                size = target_size;
-            } else {
-                failed_already = true;
-            }
-        }
-    }
-
-#ifdef _POSIX_MEMLOCK_RANGE
-    static constexpr bool SUPPORTED = true;
-
-    size_t lock_granularity() {
-        return (size_t) sysconf(_SC_PAGESIZE);
-    }
-
-#ifdef __APPLE__
-#define MLOCK_SUGGESTION \
-    "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
-#else
-#define MLOCK_SUGGESTION \
-    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
-#endif
-
-    bool raw_lock(const void * addr, size_t size) {
-        if (!mlock(addr, size)) {
-            return true;
-        } else {
-            char* errmsg = std::strerror(errno);
-            bool suggest = (errno == ENOMEM);
-
-            // Check if the resource limit is fine after all
-            struct rlimit lock_limit;
-            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
-                suggest = false;
-            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
-                suggest = false;
-
-            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
-                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
-            return false;
-        }
-    }
-
-#undef MLOCK_SUGGESTION
-
-    void raw_unlock(void * addr, size_t size) {
-        if (munlock(addr, size)) {
-            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
-        }
-    }
-#elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
-
-    size_t lock_granularity() {
-        SYSTEM_INFO si;
-        GetSystemInfo(&si);
-        return (size_t) si.dwPageSize;
-    }
-
-    bool raw_lock(void * ptr, size_t len) {
-        for (int tries = 1; ; tries++) {
-            if (VirtualLock(ptr, len)) {
-                return true;
-            }
-            if (tries == 2) {
-                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                        len, size, llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-
-            // It failed but this was only the first try; increase the working
-            // set size and try again.
-            SIZE_T min_ws_size, max_ws_size;
-            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
-                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
-                        gguf_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-            // Per MSDN: "The maximum number of pages that a process can lock
-            // is equal to the number of pages in its minimum working set minus
-            // a small overhead."
-            // Hopefully a megabyte is enough overhead:
-            size_t increment = len + 1048576;
-            // The minimum must be <= the maximum, so we need to increase both:
-            min_ws_size += increment;
-            max_ws_size += increment;
-            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
-                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
-                        gguf_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-        }
-    }
-
-    void raw_unlock(void * ptr, size_t len) {
-        if (!VirtualUnlock(ptr, len)) {
-            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
-                    gguf_format_win_err(GetLastError()).c_str());
-        }
-    }
-#else
-    static constexpr bool SUPPORTED = false;
-
-    size_t lock_granularity() {
-        return (size_t) 65536;
-    }
-
-    bool raw_lock(const void * addr, size_t len) {
-        fprintf(stderr, "warning: mlock not supported on this system\n");
-        return false;
-    }
-
-    void raw_unlock(const void * addr, size_t len) {}
-#endif
-};
-
-#endif
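Nothing from the deleted header is lost: each helper above survives in gguf-llama.cpp under its llama_ prefix. As a closing illustration, a sketch of the mmap-then-mlock flow that load_all_data drives, written against the merged structs (illustrative, not part of the commit; the helper name mlock_whole_mapping and the per-step size are made up):

// illustrative only: lock a mapped file in memory, growing the locked
// region incrementally the way load_all_data does per tensor
static void mlock_whole_mapping(llama_mmap & mapping) {
    llama_mlock lock;
    lock.init(mapping.addr); // record the base address; nothing is locked yet

    size_t locked = 0;
    const size_t step = 1u << 20; // pretend each tensor is 1 MiB

    while (locked < mapping.size) {
        locked += step;
        if (locked > mapping.size) {
            locked = mapping.size;
        }
        // grow_to rounds the target up to lock_granularity(); on failure it
        // warns once, sets failed_already and turns further calls into no-ops
        lock.grow_to(locked);
    }
} // ~llama_mlock munlocks (or VirtualUnlocks) whatever was locked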