diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 5ffb91b21..0dc07eb13 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -7691,7 +7691,8 @@ inline void ggml_cuda_op_scale( GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - const float scale = ((float *) dst->op_params)[0]; + float scale; + memcpy(&scale, dst->op_params, sizeof(float)); scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream); CUDA_CHECK(cudaGetLastError()); diff --git a/ggml.c b/ggml.c index e0441d2f1..d6f7bdb10 100644 --- a/ggml.c +++ b/ggml.c @@ -10335,7 +10335,8 @@ static void ggml_compute_forward_scale_f32( } // scale factor - const float v = *(float *) dst->op_params; + float v; + memcpy(&v, dst->op_params, sizeof(float)); const int ith = params->ith; const int nth = params->nth; @@ -15152,7 +15153,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - const float s = ((float *) tensor->op_params)[0]; + float s; + memcpy(&s, tensor->op_params, sizeof(float)); src0->grad = ggml_add_or_set(ctx, @@ -15335,6 +15337,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor const int n_past = ((int32_t *) tensor->op_params)[0]; src0->grad = ggml_add_or_set(ctx, src0->grad, + /* ggml_diag_mask_inf_impl() shouldn't be here */ + /* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */ ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), zero_table); } diff --git a/ggml.h b/ggml.h index 2a32b5385..82be91807 100644 --- a/ggml.h +++ b/ggml.h @@ -491,7 +491,8 @@ extern "C" { enum ggml_log_level { GGML_LOG_LEVEL_ERROR = 2, GGML_LOG_LEVEL_WARN = 3, - GGML_LOG_LEVEL_INFO = 4 + GGML_LOG_LEVEL_INFO = 4, + GGML_LOG_LEVEL_DEBUG = 5 }; // ggml object diff --git a/llama.cpp b/llama.cpp index 6ddd06110..d74b89008 100644 --- a/llama.cpp +++ b/llama.cpp @@ -779,7 +779,7 @@ struct llama_file { throw std::runtime_error(format("read error: %s", strerror(errno))); } if (ret != 1) { - throw std::runtime_error(std::string("unexpectedly reached end of file")); + throw std::runtime_error("unexpectedly reached end of file"); } } @@ -932,22 +932,22 @@ struct llama_mmap { #elif defined(_WIN32) static constexpr bool SUPPORTED = true; - llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { - (void) numa; + llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) { + GGML_UNUSED(numa); size = file->size; HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); - DWORD error = GetLastError(); if (hMapping == NULL) { + DWORD error = GetLastError(); throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); } addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); - error = GetLastError(); + DWORD error = GetLastError(); CloseHandle(hMapping); if (addr == NULL) { @@ -955,7 +955,7 @@ struct llama_mmap { } #ifndef USE_FAILSAFE - if (prefetch) { + if (prefetch > 0) { // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG); HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll"); @@ -967,9 +967,9 @@ struct llama_mmap { // advise the kernel to preload the mapped memory WIN32_MEMORY_RANGE_ENTRY range; range.VirtualAddress = addr; - range.NumberOfBytes = (SIZE_T)size; + range.NumberOfBytes = (SIZE_T) std::min(size, prefetch); if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { - fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", + LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n", llama_format_win_err(GetLastError()).c_str()); } } @@ -987,26 +987,26 @@ struct llama_mmap { ~llama_mmap() { if (!UnmapViewOfFile(addr)) { - fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", + LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n", llama_format_win_err(GetLastError()).c_str()); } } #else static constexpr bool SUPPORTED = false; - llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { - (void) file; - (void) prefetch; - (void) numa; + llama_mmap(struct llama_file * file, size_t prefetch = -1, bool numa = false) { + GGML_UNUSED(file); + GGML_UNUSED(prefetch); + GGML_UNUSED(numa); - throw std::runtime_error(std::string("mmap not supported")); + throw std::runtime_error("mmap not supported"); } - void unmap(size_t offset, size_t len) { - (void) offset; - (void) len; + void unmap_fragment(size_t first, size_t last) { + GGML_UNUSED(first); + GGML_UNUSED(last); - throw std::runtime_error(std::string("mmap not supported")); + throw std::runtime_error("mmap not supported"); } #endif }; @@ -2383,7 +2383,8 @@ struct llama_model_loader { } } - void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { + // Returns false if cancelled by progress_callback + bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { size_t size_data = 0; for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { @@ -2415,7 +2416,9 @@ struct llama_model_loader { GGML_ASSERT(cur); // unused tensors should have been caught by load_data already if (progress_callback) { - progress_callback((float) size_done / size_data, progress_callback_user_data); + if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { + return false; + } } const size_t offs = file_offset(ggml_get_name(cur)); @@ -2477,8 +2480,11 @@ struct llama_model_loader { } if (progress_callback) { - progress_callback(1.0f, progress_callback_user_data); + // Even though the model is done loading, we still honor + // cancellation since we need to free allocations. + return progress_callback(1.0f, progress_callback_user_data); } + return true; } }; @@ -3074,7 +3080,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } } -static void llm_load_tensors( +// Returns false if cancelled by progress_callback +static bool llm_load_tensors( llama_model_loader & ml, llama_model & model, int n_gpu_layers, @@ -3751,16 +3758,20 @@ static void llm_load_tensors( model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); } - ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL); + if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) { + return false; + } model.mapping = std::move(ml.mapping); // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = ggml_time_us() - model.t_start_us; + return true; } -static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { +// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback +static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); @@ -3778,19 +3789,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con if (params.vocab_only) { LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); - return true; + return 0; } - llm_load_tensors( + if (!llm_load_tensors( ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data - ); + )) { + return -2; + } } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); - return false; + return -1; } - return true; + return 0; } // @@ -9406,11 +9419,18 @@ struct llama_model * llama_load_model_from_file( LLAMA_LOG_INFO("\n"); } } + return true; }; } - if (!llama_model_load(path_model, *model, params)) { - LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + int status = llama_model_load(path_model, *model, params); + GGML_ASSERT(status <= 0); + if (status < 0) { + if (status == -1) { + LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + } else if (status == -2) { + LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); + } delete model; return nullptr; } diff --git a/llama.h b/llama.h index 76dcd280f..d3a5ff4c4 100644 --- a/llama.h +++ b/llama.h @@ -127,7 +127,7 @@ extern "C" { bool sorted; } llama_token_data_array; - typedef void (*llama_progress_callback)(float progress, void *ctx); + typedef bool (*llama_progress_callback)(float progress, void *ctx); // Input data for llama_decode // A llama_batch object can contain input about one or many sequences @@ -180,7 +180,9 @@ extern "C" { int32_t main_gpu; // the GPU that is used for scratch and small tensors const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) - // called with a progress value between 0 and 1, pass NULL to disable + // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. + // If the provided progress_callback returns true, model loading continues. + // If it returns false, model loading is immediately aborted. llama_progress_callback progress_callback; // context pointer passed to the progress callback