Merge branch 'master' into concedo_experimental

# Conflicts:
#	.github/workflows/docker.yml
#	Makefile
#	README.md
#	llama.cpp
Commit 3bca03d26b by Concedo, 2023-12-22 21:39:23 +08:00
5 changed files with 65 additions and 37 deletions

ggml-cuda.cu
@@ -7691,7 +7691,8 @@ inline void ggml_cuda_op_scale(
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

-    const float scale = ((float *) dst->op_params)[0];
+    float scale;
+    memcpy(&scale, dst->op_params, sizeof(float));

     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
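The change above swaps a direct pointer cast on op_params for a memcpy. A minimal standalone sketch of the same idiom, using an illustrative stand-in struct that is not from the repository:

    #include <stdint.h>
    #include <string.h>

    /* Illustrative stand-in for a tensor's raw op_params storage. */
    typedef struct {
        int32_t op_params[16];
    } fake_tensor;

    /* Reading the float through ((float *) t->op_params)[0] type-puns the
     * int32_t storage; copying the bytes with memcpy is well-defined in C/C++
     * and compilers lower it to a plain load anyway. */
    static float read_scale(const fake_tensor * t) {
        float scale;
        memcpy(&scale, t->op_params, sizeof(float));
        return scale;
    }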

ggml.c (8 changes)
@@ -10335,7 +10335,8 @@ static void ggml_compute_forward_scale_f32(
    }

    // scale factor
-    const float v = *(float *) dst->op_params;
+    float v;
+    memcpy(&v, dst->op_params, sizeof(float));

    const int ith = params->ith;
    const int nth = params->nth;
@@ -15152,7 +15153,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                // necessary for llama
                if (src0->grad) {
-                    const float s = ((float *) tensor->op_params)[0];
+                    float s;
+                    memcpy(&s, tensor->op_params, sizeof(float));

                    src0->grad =
                        ggml_add_or_set(ctx,
@@ -15335,6 +15337,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                const int n_past = ((int32_t *) tensor->op_params)[0];
                src0->grad =
                    ggml_add_or_set(ctx, src0->grad,
+                        /* ggml_diag_mask_inf_impl() shouldn't be here */
+                        /* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
                        ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
                    zero_table);
            }

ggml.h (3 changes)
@@ -491,7 +491,8 @@ extern "C" {
    enum ggml_log_level {
        GGML_LOG_LEVEL_ERROR = 2,
        GGML_LOG_LEVEL_WARN = 3,
-        GGML_LOG_LEVEL_INFO = 4
+        GGML_LOG_LEVEL_INFO = 4,
+        GGML_LOG_LEVEL_DEBUG = 5
    };

    // ggml object
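With GGML_LOG_LEVEL_DEBUG available, an application-side log callback can drop debug chatter. A hedged sketch, assuming the existing ggml_log_callback signature (level, text, user_data) and llama_log_set() from llama.h; the filtering policy itself is illustrative:

    #include <stdio.h>
    #include "llama.h"   /* also provides enum ggml_log_level via ggml.h */

    /* Forward everything except debug-level messages to stderr. */
    static void my_log_cb(enum ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level == GGML_LOG_LEVEL_DEBUG) {
            return;          /* drop debug output */
        }
        fputs(text, stderr);
    }

    /* during startup: llama_log_set(my_log_cb, NULL); */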

llama.cpp
@@ -779,7 +779,7 @@ struct llama_file {
            throw std::runtime_error(format("read error: %s", strerror(errno)));
        }
        if (ret != 1) {
-            throw std::runtime_error(std::string("unexpectedly reached end of file"));
+            throw std::runtime_error("unexpectedly reached end of file");
        }
    }
@@ -932,22 +932,22 @@ struct llama_mmap {
 #elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
-        (void) numa;
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) {
+        GGML_UNUSED(numa);

        size = file->size;

        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));

        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-        DWORD error = GetLastError();

        if (hMapping == NULL) {
+            DWORD error = GetLastError();
            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
        }

        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-        error = GetLastError();
+        DWORD error = GetLastError();
        CloseHandle(hMapping);

        if (addr == NULL) {
@@ -955,7 +955,7 @@ struct llama_mmap {
        }

        #ifndef USE_FAILSAFE
-        if (prefetch) {
+        if (prefetch > 0) {
            // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
            BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
            HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
@@ -967,9 +967,9 @@ struct llama_mmap {
                // advise the kernel to preload the mapped memory
                WIN32_MEMORY_RANGE_ENTRY range;
                range.VirtualAddress = addr;
-                range.NumberOfBytes = (SIZE_T)size;
+                range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
                if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-                    fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                    LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
                            llama_format_win_err(GetLastError()).c_str());
                }
            }
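The prefetch parameter changes here from a bool to a byte count: 0 disables prefetching, the default (size_t) -1 requests the whole mapping, and anything in between is clamped to the file size. A minimal sketch of that clamping rule in isolation (the helper name is illustrative, not from the repository):

    #include <algorithm>
    #include <cstddef>

    // Bytes to ask the OS to prefetch for a mapping of `size` bytes, given the
    // caller-supplied `prefetch` budget (0 = none, SIZE_MAX = everything).
    static size_t prefetch_bytes(size_t size, size_t prefetch) {
        if (prefetch == 0) {
            return 0;                    // caller opted out of prefetching
        }
        return std::min(size, prefetch); // never prefetch past the end of the file
    }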
@@ -987,26 +987,26 @@ struct llama_mmap {
    ~llama_mmap() {
        if (!UnmapViewOfFile(addr)) {
-            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
+            LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
    }
 #else
    static constexpr bool SUPPORTED = false;

-    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
-        (void) file;
-        (void) prefetch;
-        (void) numa;
+    llama_mmap(struct llama_file * file, size_t prefetch = -1, bool numa = false) {
+        GGML_UNUSED(file);
+        GGML_UNUSED(prefetch);
+        GGML_UNUSED(numa);

-        throw std::runtime_error(std::string("mmap not supported"));
+        throw std::runtime_error("mmap not supported");
    }

-    void unmap(size_t offset, size_t len) {
-        (void) offset;
-        (void) len;
+    void unmap_fragment(size_t first, size_t last) {
+        GGML_UNUSED(first);
+        GGML_UNUSED(last);

-        throw std::runtime_error(std::string("mmap not supported"));
+        throw std::runtime_error("mmap not supported");
    }
 #endif
 };
@@ -2383,7 +2383,8 @@ struct llama_model_loader {
            }
        }

-    void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
+    // Returns false if cancelled by progress_callback
+    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
        size_t size_data = 0;

        for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
@@ -2415,7 +2416,9 @@ struct llama_model_loader {
            GGML_ASSERT(cur); // unused tensors should have been caught by load_data already

            if (progress_callback) {
-                progress_callback((float) size_done / size_data, progress_callback_user_data);
+                if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
+                    return false;
+                }
            }

            const size_t offs = file_offset(ggml_get_name(cur));
@@ -2477,8 +2480,11 @@ struct llama_model_loader {
        }

        if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
+            // Even though the model is done loading, we still honor
+            // cancellation since we need to free allocations.
+            return progress_callback(1.0f, progress_callback_user_data);
        }

+        return true;
    }
 };
@@ -3074,7 +3080,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }

-static void llm_load_tensors(
+// Returns false if cancelled by progress_callback
+static bool llm_load_tensors(
        llama_model_loader & ml,
        llama_model & model,
        int n_gpu_layers,
@@ -3751,16 +3758,20 @@ static void llm_load_tensors(
        model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
    }

-    ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL);
+    if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
+        return false;
+    }

    model.mapping = std::move(ml.mapping);

    // loading time will be recalculate after the first eval, so
    // we take page faults deferred by mmap() into consideration
    model.t_load_us = ggml_time_us() - model.t_start_us;
+
+    return true;
 }

-static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
+// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
+static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
    try {
        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
@@ -3778,19 +3789,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con

        if (params.vocab_only) {
            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
-            return true;
+            return 0;
        }

-        llm_load_tensors(
+        if (!llm_load_tensors(
            ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
            params.progress_callback, params.progress_callback_user_data
-        );
+        )) {
+            return -2;
+        }
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
-        return false;
+        return -1;
    }

-    return true;
+    return 0;
 }

 //
@@ -9406,11 +9419,18 @@ struct llama_model * llama_load_model_from_file(
                    LLAMA_LOG_INFO("\n");
                }
            }
+            return true;
        };
    }

-    if (!llama_model_load(path_model, *model, params)) {
-        LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+    int status = llama_model_load(path_model, *model, params);
+    GGML_ASSERT(status <= 0);
+    if (status < 0) {
+        if (status == -1) {
+            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+        } else if (status == -2) {
+            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+        }
        delete model;
        return nullptr;
    }

llama.h
@@ -127,7 +127,7 @@ extern "C" {
        bool sorted;
    } llama_token_data_array;

-    typedef void (*llama_progress_callback)(float progress, void *ctx);
+    typedef bool (*llama_progress_callback)(float progress, void *ctx);

    // Input data for llama_decode
    // A llama_batch object can contain input about one or many sequences
@@ -180,7 +180,9 @@ extern "C" {
        int32_t main_gpu; // the GPU that is used for scratch and small tensors
        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

-        // called with a progress value between 0 and 1, pass NULL to disable
+        // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+        // If the provided progress_callback returns true, model loading continues.
+        // If it returns false, model loading is immediately aborted.
        llama_progress_callback progress_callback;

        // context pointer passed to the progress callback
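Taken together, the callback can now cancel a load in progress: return false and llama_load_model_from_file() frees the partially loaded model and returns NULL. A hedged usage sketch against the public C API of this revision; the 50% cut-off and the model path are illustrative:

    #include <cstdio>
    #include "llama.h"

    // Progress callback that cancels the load past an arbitrary threshold.
    // Returning false aborts loading; llama_load_model_from_file() then
    // returns NULL and logs "cancelled model load".
    static bool my_progress(float progress, void * ctx) {
        const float limit = *(const float *) ctx;
        fprintf(stderr, "\rloading: %3.0f%%", progress * 100.0f);
        return progress <= limit;
    }

    int main(void) {
        llama_backend_init(false);

        float limit = 0.5f;  // illustrative cut-off: stop about half-way through

        llama_model_params mparams = llama_model_default_params();
        mparams.progress_callback           = my_progress;
        mparams.progress_callback_user_data = &limit;

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == NULL) {
            fprintf(stderr, "\nload did not complete (cancelled or failed)\n");
        } else {
            llama_free_model(model);
        }

        llama_backend_free();
        return 0;
    }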