diff --git a/Makefile b/Makefile index 5e50f4691..ce36fcf6b 100644 --- a/Makefile +++ b/Makefile @@ -332,7 +332,7 @@ OBJS += ggml-alloc.o llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h $(CXX) $(CXXFLAGS) -c $< -o $@ -gguf-llama.o: gguf-llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h gguf-llama.h gguf-util.h +gguf-llama.o: gguf-llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h gguf-llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ common.o: examples/common.cpp examples/common.h diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index 74a447c07..ad212d752 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -1,5 +1,4 @@ #include "ggml.h" -#include "gguf-util.h" #include "gguf-llama.h" #include @@ -195,6 +194,15 @@ bool gguf_ex_read_1(const std::string & fname) { fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data); + // print first 10 elements + const float * data = (const float *) cur->data; + + printf("%s data[:10] : ", name); + for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) { + printf("%f ", data[j]); + } + printf("\n\n"); + // check data { const float * data = (const float *) cur->data; @@ -216,48 +224,6 @@ bool gguf_ex_read_1(const std::string & fname) { return true; } -// read just the tensor info and mmap the data in user code -bool gguf_ex_read_2(const std::string & fname) { - struct ggml_context * ctx_data = NULL; - - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx_data, - }; - - struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); - - struct gguf_file file(fname.c_str(), "rb"); - gguf_mmap data_mmap(&file, 0, false); - - const int n_tensors = gguf_get_n_tensors(ctx); - - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(ctx, i); - const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); - - struct ggml_tensor * cur = 
ggml_get_tensor(ctx_data, name); - - cur->data = static_cast(data_mmap.addr) + offset; - - // print first 10 elements - const float * data = (const float *) cur->data; - - printf("%s data[:10] : ", name); - for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) { - printf("%f ", data[j]); - } - printf("\n\n"); - } - - fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data)); - - ggml_free(ctx_data); - gguf_free(ctx); - - return true; -} - int main(int argc, char ** argv) { if (argc < 3) { fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]); @@ -274,7 +240,6 @@ int main(int argc, char ** argv) { } else if (mode == "r") { GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file"); GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file"); - GGML_ASSERT(gguf_ex_read_2(fname) && "failed to read gguf file"); } else if (mode == "q") { llama_model_quantize_params params = llama_model_quantize_default_params(); llama_model_quantize(fname.c_str(), "quant.gguf", ¶ms); diff --git a/gguf-llama.cpp b/gguf-llama.cpp index c62840e88..b3de2671f 100644 --- a/gguf-llama.cpp +++ b/gguf-llama.cpp @@ -6,31 +6,61 @@ #include #endif -#include "gguf-util.h" #define LLAMA_API_CPP // TODO: eliminate me #include "gguf-llama.h" #include "ggml.h" + #ifdef GGML_USE_CUBLAS -#include "ggml-cuda.h" +# include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) -#include "ggml-opencl.h" +# include "ggml-opencl.h" #endif #ifdef GGML_USE_METAL -#include "ggml-metal.h" +# include "ggml-metal.h" #endif #ifdef GGML_USE_MPI -#include "ggml-mpi.h" +# include "ggml-mpi.h" #endif #ifdef GGML_USE_K_QUANTS -#ifndef QK_K -#ifdef GGML_QKK_64 -#define QK_K 64 +# ifndef QK_K +# ifdef GGML_QKK_64 +# define QK_K 64 +# else +# define QK_K 256 +# endif +# endif +#endif + +#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL) +# include "ggml-alloc.h" +# define LLAMA_USE_ALLOCATOR #else -#define QK_K 256 +# define LLAMA_USE_SCRATCH +# define LLAMA_MAX_SCRATCH_BUFFERS 16 #endif + 
+#ifdef __has_include
+    #if __has_include(<unistd.h>)
+        #include <unistd.h>
+        #if defined(_POSIX_MAPPED_FILES)
+            #include <sys/mman.h>
+        #endif
+        #if defined(_POSIX_MEMLOCK_RANGE)
+            #include <sys/resource.h>
+        #endif
+    #endif
 #endif
+
+#if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
+    #include <windows.h>
+    #include <io.h>
+    #include <stdio.h> // for _fseeki64
 #endif
 
 #include
@@ -68,7 +98,11 @@
 #define TN_FFN_NORM "blk.%d.ffn_norm.weight"
 #define TN_FFN_GATE "blk.%d.ffn_gate.weight"
 #define TN_FFN_DOWN "blk.%d.ffn_down.weight"
-#define TN_FFN_UP "blk.%d.ffn_up.weight"
+#define TN_FFN_UP "blk.%d.ffn_up.weight"
+
+//
+// logging
+//
 
 static void llama_log_internal(llama_log_level level, const char* format, ...);
 static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
@@ -76,6 +110,10 @@ static void llama_log_callback_default(llama_log_level level, const char * text,
 #define LLAMA_LOG_WARN(...)  llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
 
+//
+// helpers
+//
+
 template<typename T>
 static std::string to_string(const T & val) {
     std::stringstream ss;
@@ -90,25 +128,69 @@ static void zeros(std::ofstream & file, size_t n) {
     }
 }
 
-#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
-#include "ggml-alloc.h"
-#define LLAMA_USE_ALLOCATOR
+#ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
 #else
-#define LLAMA_USE_SCRATCH
-#define LLAMA_MAX_SCRATCH_BUFFERS 16
+__attribute__((format(printf, 1, 2)))
 #endif
+#endif
+static std::string format(const char * fmt, ...) 
{ + va_list ap, ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} -#define UNUSED GGML_UNUSED +// +// ggml helpers +// + +static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + +// +// llama helpers +// #ifdef GGML_USE_CUBLAS -#define llama_host_malloc(n) ggml_cuda_host_malloc(n) -#define llama_host_free(data) ggml_cuda_host_free(data) +# define llama_host_malloc(n) ggml_cuda_host_malloc(n) +# define llama_host_free(data) ggml_cuda_host_free(data) #elif GGML_USE_METAL -#define llama_host_malloc(n) ggml_metal_host_malloc(n) -#define llama_host_free(data) ggml_metal_host_free(data) +# define llama_host_malloc(n) ggml_metal_host_malloc(n) +# define llama_host_free(data) ggml_metal_host_free(data) #else -#define llama_host_malloc(n) malloc(n) -#define llama_host_free(data) free(data) +# define llama_host_malloc(n) malloc(n) +# define llama_host_free(data) free(data) +#endif + +#if defined(_WIN32) +static std::string llama_format_win_err(DWORD err) { + LPSTR buf; + size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL); + if (!size) { + return "FormatMessageA failed"; + } + std::string ret(buf, size); + LocalFree(buf); + return ret; +} #endif struct llama_buffer { @@ -147,27 +229,328 @@ struct llama_buffer { } }; +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + 
llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t len) const { + if (len == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, len, 1, fp); + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error(std::string("unexpectedly reached end of file")); + } + } + + void write_raw(const void * ptr, size_t len) const { + if (len == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, len, 1, fp); + if (ret != 1) { + throw std::runtime_error(format("write error: %s", strerror(errno))); + } + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +struct llama_mmap { + void * addr; + size_t size; + + llama_mmap(const llama_mmap &) = delete; + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; + + llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; + // prefetch/readahead impairs performance on NUMA systems + if (numa) { prefetch = 0; } +#ifdef __linux__ + if (prefetch) { flags |= MAP_POPULATE; } +#endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error(format("mmap failed: %s", strerror(errno))); + } 
+ + if (prefetch > 0) { + // Advise the kernel to preload the mapped memory + if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + if (numa) { + // advise the kernel not to use readahead + // (because the next page might not belong on the same node) + if (madvise(addr, file->size, MADV_RANDOM)) { + fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", + strerror(errno)); + } + } + } + + ~llama_mmap() { + munmap(addr, size); + } +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + + llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { + (void) numa; + + size = file->size; + + HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); + + HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); + DWORD error = GetLastError(); + + if (hMapping == NULL) { + throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); + } + + addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); + error = GetLastError(); + CloseHandle(hMapping); + + if (addr == NULL) { + throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); + } + + #if _WIN32_WINNT >= _WIN32_WINNT_WIN8 + if (prefetch) { + // Advise the kernel to preload the mapped memory + WIN32_MEMORY_RANGE_ENTRY range; + range.VirtualAddress = addr; + range.NumberOfBytes = (SIZE_T)size; + if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { + fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } + #else + #pragma message("warning: You are building for pre-Windows 8; prefetch not supported") + #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8 + } + + ~llama_mmap() { + if (!UnmapViewOfFile(addr)) { + fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", + 
llama_format_win_err(GetLastError()).c_str()); + } + } +#else + static constexpr bool SUPPORTED = false; + + llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { + (void) file; + (void) prefetch; + (void) numa; + + throw std::runtime_error(std::string("mmap not supported")); + } +#endif +}; + +// Represents some region of memory being locked using mlock or VirtualLock; +// will automatically unlock on destruction. +struct llama_mlock { + void * addr = NULL; + size_t size = 0; + + bool failed_already = false; + + llama_mlock() {} + llama_mlock(const llama_mlock &) = delete; + + ~llama_mlock() { + if (size) { + raw_unlock(addr, size); + } + } + + void init(void * ptr) { + GGML_ASSERT(addr == NULL && size == 0); + addr = ptr; + } + + void grow_to(size_t target_size) { + GGML_ASSERT(addr); + if (failed_already) { + return; + } + size_t granularity = lock_granularity(); + target_size = (target_size + granularity - 1) & ~(granularity - 1); + if (target_size > size) { + if (raw_lock((uint8_t *) addr + size, target_size - size)) { + size = target_size; + } else { + failed_already = true; + } + } + } + +#ifdef _POSIX_MEMLOCK_RANGE + static constexpr bool SUPPORTED = true; + + size_t lock_granularity() { + return (size_t) sysconf(_SC_PAGESIZE); + } + + #ifdef __APPLE__ + #define MLOCK_SUGGESTION \ + "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ + "decreasing 'vm.global_no_user_wire_amount'. 
Also try increasing RLIMIT_MLOCK (ulimit -l).\n" + #else + #define MLOCK_SUGGESTION \ + "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" + #endif + + bool raw_lock(const void * addr, size_t size) { + if (!mlock(addr, size)) { + return true; + } else { + char* errmsg = std::strerror(errno); + bool suggest = (errno == ENOMEM); + + // Check if the resource limit is fine after all + struct rlimit lock_limit; + if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) + suggest = false; + if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) + suggest = false; + + fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", + size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : ""); + return false; + } + } + + #undef MLOCK_SUGGESTION + + void raw_unlock(void * addr, size_t size) { + if (munlock(addr, size)) { + fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno)); + } + } +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + + size_t lock_granularity() { + SYSTEM_INFO si; + GetSystemInfo(&si); + return (size_t) si.dwPageSize; + } + + bool raw_lock(void * ptr, size_t len) { + for (int tries = 1; ; tries++) { + if (VirtualLock(ptr, len)) { + return true; + } + if (tries == 2) { + fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", + len, size, llama_format_win_err(GetLastError()).c_str()); + return false; + } + + // It failed but this was only the first try; increase the working + // set size and try again. 
+            SIZE_T min_ws_size, max_ws_size;
+            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
+                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+            // Per MSDN: "The maximum number of pages that a process can lock
+            // is equal to the number of pages in its minimum working set minus
+            // a small overhead."
+            // Hopefully a megabyte is enough overhead:
+            size_t increment = len + 1048576;
+            // The minimum must be <= the maximum, so we need to increase both:
+            min_ws_size += increment;
+            max_ws_size += increment;
+            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
+                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+        }
+    }
+
+    void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
+            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    size_t lock_granularity() {
+        return (size_t) 65536;
+    }
+
+    bool raw_lock(const void * addr, size_t len) {
+        fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
+    }
+
+    void raw_unlock(const void * addr, size_t len) {}
+#endif
+};
+
 typedef void (*offload_func_t)(struct ggml_tensor * tensor);
 
 void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
-//
-// ggml helpers
-//
-
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
 //
 // memory sizes (calculated for n_batch == 512)
 //
@@ -384,11 +767,11 @@ struct llama_model {
     llama_buffer buf;
 
     // model memory mapped file
-    
std::unique_ptr mapping; + std::unique_ptr mapping; // objects representing data potentially being locked in memory - gguf_mlock mlock_buf; - gguf_mlock mlock_mmap; + llama_mlock mlock_buf; + llama_mlock mlock_mmap; // for quantize-stats only std::vector> tensors_by_name; @@ -565,7 +948,7 @@ enum gguf_file_version { }; struct gguf_file_loader { - gguf_file file; + llama_file file; gguf_context * ctx_gguf; gguf_file_version file_version; @@ -643,11 +1026,11 @@ struct llama_model_loader { bool use_mmap; size_t num_ggml_tensors_created = 0; struct ggml_context * ggml_ctx = NULL; - std::unique_ptr mapping; + std::unique_ptr mapping; llama_model_loader(const std::string & fname_base, bool use_mmap) { file_loader = std::unique_ptr(new gguf_file_loader(fname_base.c_str(), tensors_map)); - if (!gguf_mmap::SUPPORTED) { + if (!llama_mmap::SUPPORTED) { use_mmap = false; } this->use_mmap = use_mmap; @@ -707,13 +1090,13 @@ struct llama_model_loader { if (use_mmap) { lt.data = (uint8_t *) mapping->addr + lt.file_off; } else { - gguf_file & file = file_loader->file; + llama_file & file = file_loader->file; file.seek(lt.file_off, SEEK_SET); file.read_raw(lt.data, lt.size); } } - void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) { + void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { size_t data_size = 0; size_t prefetch_size = 0; size_t lock_size = 0; @@ -726,7 +1109,7 @@ struct llama_model_loader { } if (use_mmap) { - mapping.reset(new gguf_mmap(&file_loader->file, prefetch_size, ggml_is_numa())); + mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa())); if (lmlock) { lmlock->init(mapping->addr); } @@ -748,7 +1131,7 @@ struct llama_model_loader { load_data_for(lt); - switch(lt.ggml_tensor->backend) { + switch (lt.ggml_tensor->backend) { case GGML_BACKEND_CPU: lt.ggml_tensor->data = lt.data; if (use_mmap && lmlock) { 
@@ -871,11 +1254,11 @@ int llama_max_devices(void) { } bool llama_mmap_supported(void) { - return gguf_mmap::SUPPORTED; + return llama_mmap::SUPPORTED; } bool llama_mlock_supported(void) { - return gguf_mlock::SUPPORTED; + return llama_mlock::SUPPORTED; } void llama_backend_init(bool numa) { @@ -982,7 +1365,6 @@ static void llama_model_load_internal( std::unique_ptr ml(new llama_model_loader(fname, use_mmap)); model.n_gpu_layers = n_gpu_layers; - gguf_file_version file_version = ml->file_loader->file_version; auto & hparams = model.hparams; @@ -1069,7 +1451,7 @@ static void llama_model_load_internal( } { - LLAMA_LOG_INFO("%s: format = %s\n", __func__, gguf_file_version_name(file_version)); + LLAMA_LOG_INFO("%s: format = %s\n", __func__, gguf_file_version_name(ml->file_loader->file_version)); LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab); LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx); LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); @@ -1935,15 +2317,15 @@ static bool llama_is_eos_token(const llama_vocab & vocab, llama_token token) { } static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token token) { - UNUSED(vocab); - UNUSED(token); + GGML_UNUSED(vocab); + GGML_UNUSED(token); // TODO: improve? return false; } static bool llama_is_unused_token(const llama_vocab & vocab, llama_token token) { - UNUSED(vocab); - UNUSED(token); + GGML_UNUSED(vocab); + GGML_UNUSED(token); // TODO: improve? 
return false; } @@ -2598,7 +2980,6 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * } } - void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { // Reference implementation: // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr @@ -3615,7 +3996,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const // maybe this should in llama_model_loader if (model_loader->use_mmap) { - model_loader->mapping.reset(new gguf_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa())); + model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa())); } } @@ -4143,7 +4524,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { } static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - gguf_file file(path_session, "rb"); + llama_file file(path_session, "rb"); GGML_UNUSED(ctx); GGML_UNUSED(path_session); GGML_UNUSED(tokens_out); @@ -4164,7 +4545,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi } bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { - gguf_file file(path_session, "wb"); + llama_file file(path_session, "wb"); GGML_UNUSED(ctx); GGML_UNUSED(tokens); GGML_UNUSED(n_token_count); diff --git a/gguf-util.h b/gguf-util.h deleted file mode 100644 index 15fbc6036..000000000 --- a/gguf-util.h +++ /dev/null @@ -1,397 +0,0 @@ -// GGUF counterpart of llama-util.h. -// we may consider making it a part of ggml.c once GGUF work is complete. -// this will require extra work to migrate this to pure C. -// Contains wrappers around OS interfaces. 
- -#ifndef GGUF_UTIL_H -#define GGUF_UTIL_H - -#include "ggml.h" - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#ifdef __has_include - #if __has_include() - #include - #if defined(_POSIX_MAPPED_FILES) - #include - #endif - #if defined(_POSIX_MEMLOCK_RANGE) - #include - #endif - #endif -#endif - -#if defined(_WIN32) - #define WIN32_LEAN_AND_MEAN - #ifndef NOMINMAX - #define NOMINMAX - #endif - #include - #include - #include // for _fseeki64 -#endif - -#ifdef __GNUC__ -#ifdef __MINGW32__ -__attribute__((format(gnu_printf, 1, 2))) -#else -__attribute__((format(printf, 1, 2))) -#endif -#endif -static std::string format(const char * fmt, ...) { - va_list ap, ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - -// TODO: can we merge this one and gguf_context? 
-struct gguf_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - gguf_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); - } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t len) const { - if (len == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, len, 1, fp); - if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); - } - if (ret != 1) { - throw std::runtime_error(std::string("unexpectedly reached end of file")); - } - } - - void write_raw(const void * ptr, size_t len) const { - if (len == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, len, 1, fp); - if (ret != 1) { - throw std::runtime_error(format("write error: %s", strerror(errno))); - } - } - - ~gguf_file() { - if (fp) { - std::fclose(fp); - } - } -}; - -#if defined(_WIN32) -static std::string gguf_format_win_err(DWORD err) { - LPSTR buf; - size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL); - if (!size) { - return "FormatMessageA failed"; - } - std::string ret(buf, size); - LocalFree(buf); - return ret; -} -#endif - -struct gguf_mmap { - void * addr; - size_t size; - - gguf_mmap(const gguf_mmap &) = delete; - -#ifdef _POSIX_MAPPED_FILES - static constexpr bool 
SUPPORTED = true; - - gguf_mmap(struct gguf_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { - size = file->size; - int fd = fileno(file->fp); - int flags = MAP_SHARED; - // prefetch/readahead impairs performance on NUMA systems - if (numa) { prefetch = 0; } -#ifdef __linux__ - if (prefetch) { flags |= MAP_POPULATE; } -#endif - addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - if (addr == MAP_FAILED) { - throw std::runtime_error(format("mmap failed: %s", strerror(errno))); - } - - if (prefetch > 0) { - // Advise the kernel to preload the mapped memory - if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { - fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", - strerror(errno)); - } - } - if (numa) { - // advise the kernel not to use readahead - // (because the next page might not belong on the same node) - if (madvise(addr, file->size, MADV_RANDOM)) { - fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", - strerror(errno)); - } - } - } - - ~gguf_mmap() { - munmap(addr, size); - } -#elif defined(_WIN32) - static constexpr bool SUPPORTED = true; - - gguf_mmap(struct gguf_file * file, bool prefetch = true, bool numa = false) { - (void) numa; - - size = file->size; - - HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); - - HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); - DWORD error = GetLastError(); - - if (hMapping == NULL) { - throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); - } - - addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); - error = GetLastError(); - CloseHandle(hMapping); - - if (addr == NULL) { - throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); - } - - #if _WIN32_WINNT >= _WIN32_WINNT_WIN8 - if (prefetch) { - // Advise the kernel to preload the mapped memory - WIN32_MEMORY_RANGE_ENTRY range; - range.VirtualAddress = 
addr; - range.NumberOfBytes = (SIZE_T)size; - if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { - fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", - gguf_format_win_err(GetLastError()).c_str()); - } - } - #else - #pragma message("warning: You are building for pre-Windows 8; prefetch not supported") - #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8 - } - - ~gguf_mmap() { - if (!UnmapViewOfFile(addr)) { - fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); - } - } -#else - static constexpr bool SUPPORTED = false; - - gguf_mmap(struct gguf_file * file, bool prefetch = true, bool numa = false) { - (void) file; - (void) prefetch; - (void) numa; - - throw std::runtime_error(std::string("mmap not supported")); - } -#endif -}; - -// Represents some region of memory being locked using mlock or VirtualLock; -// will automatically unlock on destruction. -struct gguf_mlock { - void * addr = NULL; - size_t size = 0; - bool failed_already = false; - - gguf_mlock() {} - gguf_mlock(const gguf_mlock &) = delete; - - ~gguf_mlock() { - if (size) { - raw_unlock(addr, size); - } - } - - void init(void * ptr) { - GGML_ASSERT(addr == NULL && size == 0); - addr = ptr; - } - - void grow_to(size_t target_size) { - GGML_ASSERT(addr); - if (failed_already) { - return; - } - size_t granularity = lock_granularity(); - target_size = (target_size + granularity - 1) & ~(granularity - 1); - if (target_size > size) { - if (raw_lock((uint8_t *) addr + size, target_size - size)) { - size = target_size; - } else { - failed_already = true; - } - } - } - -#ifdef _POSIX_MEMLOCK_RANGE - static constexpr bool SUPPORTED = true; - - size_t lock_granularity() { - return (size_t) sysconf(_SC_PAGESIZE); - } - - #ifdef __APPLE__ - #define MLOCK_SUGGESTION \ - "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ - "decreasing 'vm.global_no_user_wire_amount'. 
Also try increasing RLIMIT_MLOCK (ulimit -l).\n" - #else - #define MLOCK_SUGGESTION \ - "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" - #endif - - bool raw_lock(const void * addr, size_t size) { - if (!mlock(addr, size)) { - return true; - } else { - char* errmsg = std::strerror(errno); - bool suggest = (errno == ENOMEM); - - // Check if the resource limit is fine after all - struct rlimit lock_limit; - if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) - suggest = false; - if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) - suggest = false; - - fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", - size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : ""); - return false; - } - } - - #undef MLOCK_SUGGESTION - - void raw_unlock(void * addr, size_t size) { - if (munlock(addr, size)) { - fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno)); - } - } -#elif defined(_WIN32) - static constexpr bool SUPPORTED = true; - - size_t lock_granularity() { - SYSTEM_INFO si; - GetSystemInfo(&si); - return (size_t) si.dwPageSize; - } - - bool raw_lock(void * ptr, size_t len) { - for (int tries = 1; ; tries++) { - if (VirtualLock(ptr, len)) { - return true; - } - if (tries == 2) { - fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", - len, size, llama_format_win_err(GetLastError()).c_str()); - return false; - } - - // It failed but this was only the first try; increase the working - // set size and try again. 
- SIZE_T min_ws_size, max_ws_size; - if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) { - fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n", - gguf_format_win_err(GetLastError()).c_str()); - return false; - } - // Per MSDN: "The maximum number of pages that a process can lock - // is equal to the number of pages in its minimum working set minus - // a small overhead." - // Hopefully a megabyte is enough overhead: - size_t increment = len + 1048576; - // The minimum must be <= the maximum, so we need to increase both: - min_ws_size += increment; - max_ws_size += increment; - if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { - fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", - gguf_format_win_err(GetLastError()).c_str()); - return false; - } - } - } - - void raw_unlock(void * ptr, size_t len) { - if (!VirtualUnlock(ptr, len)) { - fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n", - gguf_format_win_err(GetLastError()).c_str()); - } - } -#else - static constexpr bool SUPPORTED = false; - - size_t lock_granularity() { - return (size_t) 65536; - } - - bool raw_lock(const void * addr, size_t len) { - fprintf(stderr, "warning: mlock not supported on this system\n"); - return false; - } - - void raw_unlock(const void * addr, size_t len) {} -#endif -}; - -#endif