From c7e9701f86564088350209d2f9d71c96ea00527f Mon Sep 17 00:00:00 2001
From: crasm
Date: Fri, 22 Dec 2023 01:19:36 -0500
Subject: [PATCH 1/7] llama : add ability to cancel model loading (#4462)

* llama : Add ability to cancel model load

Updated llama_progress_callback so that if it returns false, the model
loading is aborted.

* llama : Add test for model load cancellation

* Fix bool return in llama_model_load, remove std::ignore use

* Update llama.cpp

Co-authored-by: Jared Van Bortel

* Fail test if model file is missing

* Revert "Fail test if model file is missing"

This reverts commit 32ebd525bf7e5a87ee8a3dbaab3d92ce79fbf23d.

* Add test-model-load-cancel to Makefile

* Revert "Revert "Fail test if model file is missing""

This reverts commit 2796953257ee5383fa7c8fe8fa8fc888c048fb0b.

* Simplify .gitignore for tests, clang-tidy fixes

* Label all ctest tests

* ci : ctest uses -L main

* Attempt at writing ctest_with_model

* ci : get ci/run.sh working with test-model-load-cancel

* ci : restrict .github/workflows/build.yml ctest to -L main

* update requirements.txt

* Disable test-model-load-cancel in make

* Remove venv before creation

* Restructure requirements.txt

Top-level now imports the specific additional requirements for each
python file. Using `pip install -r requirements.txt` will fail if
versions become mismatched in the per-file requirements.

* Make per-python-script requirements work alone

This doesn't break the main requirements.txt.

* Add comment

* Add convert-persimmon-to-gguf.py to new requirements.txt scheme

* Add check-requirements.sh script and GitHub workflow

* Remove shellcheck installation step from workflow

* Add nocleanup special arg

* Fix merge

see: https://github.com/ggerganov/llama.cpp/pull/4462#discussion_r1434593573

* reset to upstream/master

* Redo changes for cancelling model load

---------

Co-authored-by: Georgi Gerganov
Co-authored-by: Jared Van Bortel
---
 llama.cpp | 46 +++++++++++++++++++++++++++++++++-------------
 llama.h   |  6 ++++--
 2 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index d6c192441..cb0546c95 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2372,7 +2372,8 @@ struct llama_model_loader {
         }
     }
 
-    void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
+    // Returns false if cancelled by progress_callback
+    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
         size_t size_data = 0;
 
         for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
@@ -2404,7 +2405,9 @@ struct llama_model_loader {
             GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
 
             if (progress_callback) {
-                progress_callback((float) size_done / size_data, progress_callback_user_data);
+                if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
+                    return false;
+                }
             }
 
             const size_t offs = file_offset(ggml_get_name(cur));
@@ -2466,8 +2469,11 @@ struct llama_model_loader {
         }
 
         if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
+            // Even though the model is done loading, we still honor
+            // cancellation since we need to free allocations.
+            return progress_callback(1.0f, progress_callback_user_data);
         }
 
+        return true;
     }
 };
 
@@ -3044,7 +3050,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token  = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }
 
-static void llm_load_tensors(
+// Returns false if cancelled by progress_callback
+static bool llm_load_tensors(
     llama_model_loader & ml,
     llama_model & model,
     int n_gpu_layers,
@@ -3722,16 +3729,20 @@ static void llm_load_tensors(
         model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
     }
 
-    ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL);
+    if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
+        return false;
+    }
 
     model.mapping = std::move(ml.mapping);
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = ggml_time_us() - model.t_start_us;
+
+    return true;
 }
 
-static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
+// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
+static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
     try {
         llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
 
@@ -3749,19 +3760,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con
 
         if (params.vocab_only) {
             LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
-            return true;
+            return 0;
         }
 
-        llm_load_tensors(
+        if (!llm_load_tensors(
             ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
             params.progress_callback, params.progress_callback_user_data
-        );
+        )) {
+            return -2;
+        }
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
-        return false;
+        return -1;
     }
 
-    return true;
+    return 0;
 }
 
 //
@@ -9141,11 +9154,18 @@ struct llama_model * llama_load_model_from_file(
                     LLAMA_LOG_INFO("\n");
                 }
             }
+            return true;
         };
     }
 
-    if (!llama_model_load(path_model, *model, params)) {
-        LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+    int status = llama_model_load(path_model, *model, params);
+    GGML_ASSERT(status <= 0);
+    if (status < 0) {
+        if (status == -1) {
+            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+        } else if (status == -2) {
+            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+        }
         delete model;
         return nullptr;
     }
 
diff --git a/llama.h b/llama.h
index 0be4b1337..af76bae2d 100644
--- a/llama.h
+++ b/llama.h
@@ -127,7 +127,7 @@ extern "C" {
         bool sorted;
     } llama_token_data_array;
 
-    typedef void (*llama_progress_callback)(float progress, void *ctx);
+    typedef bool (*llama_progress_callback)(float progress, void *ctx);
 
     // Input data for llama_decode
     // A llama_batch object can contain input about one or many sequences
@@ -180,7 +180,9 @@ extern "C" {
         int32_t main_gpu;           // the GPU that is used for scratch and small tensors
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
-        // called with a progress value between 0 and 1, pass NULL to disable
+        // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+        // If the provided progress_callback returns true, model loading continues.
+        // If it returns false, model loading is immediately aborted.
         llama_progress_callback progress_callback;
 
         // context pointer passed to the progress callback

From 0137ef88ea9f8fd837a065700814329d24adeec3 Mon Sep 17 00:00:00 2001
From: bobqianic <129547291+bobqianic@users.noreply.github.com>
Date: Fri, 22 Dec 2023 06:47:01 +0000
Subject: [PATCH 2/7] ggml : extend `enum ggml_log_level` with `GGML_LOG_LEVEL_DEBUG` (#4579)

---
 ggml.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml.h b/ggml.h
index 75918502b..338f355a4 100644
--- a/ggml.h
+++ b/ggml.h
@@ -484,7 +484,8 @@ extern "C" {
     enum ggml_log_level {
         GGML_LOG_LEVEL_ERROR = 2,
         GGML_LOG_LEVEL_WARN  = 3,
-        GGML_LOG_LEVEL_INFO  = 4
+        GGML_LOG_LEVEL_INFO  = 4,
+        GGML_LOG_LEVEL_DEBUG = 5
     };
 
     // ggml object

From 2bb98279c5a087d62949972b35cf63ff974ffe6a Mon Sep 17 00:00:00 2001
From: Deins
Date: Fri, 22 Dec 2023 08:49:54 +0200
Subject: [PATCH 3/7] readme : add zig bindings (#4581)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 73fe59bb4..8e17d5ba4 100644
--- a/README.md
+++ b/README.md
@@ -123,6 +123,7 @@ as the main playground for developing new features for the [ggml](https://github
 - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
+- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 
 **UI:**
 

From f31b98489824a86c937fa62ccf5dfd4bb0327b86 Mon Sep 17 00:00:00 2001
From: rhuddleston
Date: Thu, 21 Dec 2023 23:56:34 -0700
Subject: [PATCH 4/7] ci : tag docker image with build number (#4584)

---
 .github/workflows/docker.yml | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index a7165a38f..7f4de50ea 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -69,6 +69,19 @@ jobs:
           docker-images: true
           swap-storage: true
 
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
       - name: Build and push Docker image (versioned)
         if: github.event_name == 'push'
         uses: docker/build-push-action@v4
@@ -85,5 +98,5 @@ jobs:
           context: .
           push: ${{ github.event_name == 'push' }}
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}"
+          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}" , "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
           file: ${{ matrix.config.dockerfile }}

From 28cb35a0ecb9852adc3494aa51dde60141939d64 Mon Sep 17 00:00:00 2001
From: Michael Kesper
Date: Fri, 22 Dec 2023 09:03:25 +0100
Subject: [PATCH 5/7] make : add LLAMA_HIP_UMA option (#4587)

NB: LLAMA_HIP_UMA=1 (or any value) adds MK_CPPFLAG -DGGML_HIP_UMA

---
 Makefile  | 3 +++
 README.md | 8 +++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 68df7702a..42686ce71 100644
--- a/Makefile
+++ b/Makefile
@@ -452,6 +452,9 @@ ifdef LLAMA_HIPBLAS
     LLAMA_CUDA_MMV_Y        ?= 1
     LLAMA_CUDA_KQUANTS_ITER ?= 2
     MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+ifdef LLAMA_HIP_UMA
+    MK_CPPFLAGS += -DGGML_HIP_UMA
+endif # LLAMA_HIP_UMA
     MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
     MK_LDFLAGS  += -lhipblas -lamdhip64 -lrocblas
     HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
diff --git a/README.md b/README.md
index 8e17d5ba4..377d3928b 100644
--- a/README.md
+++ b/README.md
@@ -440,7 +440,13 @@ Building the program with BLAS support may lead to some performance improvements
     && cmake --build build -- -j 16
     ```
     On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
-    However, this hurts performance for non-integrated GPUs.
+    However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
+
+  - Using `make` (example for target gfx1030, build with 16 CPU threads):
+    ```bash
+    make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
+    ```
+
   - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
     ```bash
     set PATH=%HIP_PATH%\bin;%PATH%

From 48b24b170e3b4f9dc28200306840cb07d1c123df Mon Sep 17 00:00:00 2001
From: Herman Semenov
Date: Fri, 22 Dec 2023 09:26:49 +0000
Subject: [PATCH 6/7] ggml : add comment about backward GGML_OP_DIAG_MASK_INF (#4203)

---
 ggml.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ggml.c b/ggml.c
index f27920a2d..15e1984d1 100644
--- a/ggml.c
+++ b/ggml.c
@@ -15335,6 +15335,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 const int n_past = ((int32_t *) tensor->op_params)[0];
                 src0->grad =
                     ggml_add_or_set(ctx, src0->grad,
+                        /* ggml_diag_mask_inf_impl() shouldn't be here */
+                        /* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
                         ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
                         zero_table);
             }

From 48b7ff193e64c97ab174280ba0eb8d14b47c49ba Mon Sep 17 00:00:00 2001
From: slaren
Date: Fri, 22 Dec 2023 12:12:53 +0100
Subject: [PATCH 7/7] llama : fix platforms without mmap (#4578)

* llama : fix platforms without mmap

* win32 : limit prefetch size to the file size

* fix win32 error clobber, unnecessary std::string in std::runtime_error

---
 ggml-cuda.cu |  3 ++-
 ggml.c       |  6 ++++--
 llama.cpp    | 36 ++++++++++++++++++------------------
 3 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index ac91ee12e..37d7f2792 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -7702,7 +7702,8 @@ inline void ggml_cuda_op_scale(
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    const float scale = ((float *) dst->op_params)[0];
+    float scale;
+    memcpy(&scale, dst->op_params, sizeof(float));
 
     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
diff --git a/ggml.c b/ggml.c
index 15e1984d1..3656422d7 100644
--- a/ggml.c
+++ b/ggml.c
@@ -10335,7 +10335,8 @@ static void ggml_compute_forward_scale_f32(
     }
 
     // scale factor
-    const float v = *(float *) dst->op_params;
+    float v;
+    memcpy(&v, dst->op_params, sizeof(float));
 
     const int ith = params->ith;
     const int nth = params->nth;
@@ -15152,7 +15153,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 // necessary for llama
                 if (src0->grad) {
-                    const float s = ((float *) tensor->op_params)[0];
+                    float s;
+                    memcpy(&s, tensor->op_params, sizeof(float));
 
                     src0->grad =
                         ggml_add_or_set(ctx,
diff --git a/llama.cpp b/llama.cpp
index cb0546c95..4e4495739 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -778,7 +778,7 @@ struct llama_file {
             throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
         if (ret != 1) {
-            throw std::runtime_error(std::string("unexpectedly reached end of file"));
+            throw std::runtime_error("unexpectedly reached end of file");
         }
     }
 
@@ -931,29 +931,29 @@ struct llama_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
-        (void) numa;
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) {
+        GGML_UNUSED(numa);
 
         size = file->size;
 
         HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
 
         HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-        DWORD error = GetLastError();
 
         if (hMapping == NULL) {
+            DWORD error = GetLastError();
             throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
         }
 
         addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-        error = GetLastError();
+        DWORD error = GetLastError();
         CloseHandle(hMapping);
 
         if (addr == NULL) {
             throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
         }
 
-        if (prefetch) {
+        if (prefetch > 0) {
             // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
             BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
             HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
@@ -965,9 +965,9 @@ struct llama_mmap {
                 // advise the kernel to preload the mapped memory
                 WIN32_MEMORY_RANGE_ENTRY range;
                 range.VirtualAddress = addr;
-                range.NumberOfBytes = (SIZE_T)size;
+                range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
                 if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-                    fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                    LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
                             llama_format_win_err(GetLastError()).c_str());
                 }
             }
@@ -982,26 +982,26 @@ struct llama_mmap {
 
     ~llama_mmap() {
         if (!UnmapViewOfFile(addr)) {
-            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
+            LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
     }
 #else
     static constexpr bool SUPPORTED = false;
 
-    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
-        (void) file;
-        (void) prefetch;
-        (void) numa;
+    llama_mmap(struct llama_file * file, size_t prefetch = -1, bool numa = false) {
+        GGML_UNUSED(file);
+        GGML_UNUSED(prefetch);
+        GGML_UNUSED(numa);
 
-        throw std::runtime_error(std::string("mmap not supported"));
+        throw std::runtime_error("mmap not supported");
     }
 
-    void unmap(size_t offset, size_t len) {
-        (void) offset;
-        (void) len;
+    void unmap_fragment(size_t first, size_t last) {
+        GGML_UNUSED(first);
+        GGML_UNUSED(last);
 
-        throw std::runtime_error(std::string("mmap not supported"));
+        throw std::runtime_error("mmap not supported");
     }
 #endif
 };
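For reference, below is a minimal sketch of how an application could drive the cancellation hook introduced in PATCH 1/7. It is illustrative only and not part of the patch series; it assumes the llama.h API as of this revision (`llama_model_default_params()`, `llama_load_model_from_file()`, `llama_free_model()`), and the `g_cancel` flag is a hypothetical stand-in for whatever cancellation signal an application actually uses.

```cpp
// Illustrative sketch: exercising the bool-returning llama_progress_callback from PATCH 1/7.
#include <atomic>
#include <cstdio>

#include "llama.h"

// Hypothetical cancellation flag; a real application might set it from a UI
// thread or a signal handler while the model is loading.
static std::atomic<bool> g_cancel{false};

// Returning false aborts the load at the next progress checkpoint; returning
// true lets it continue.
static bool on_progress(float progress, void * ctx) {
    (void) ctx;
    fprintf(stderr, "\rloading: %3d%%", (int) (100.0f * progress));
    return !g_cancel.load();
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_model_params params = llama_model_default_params();
    params.progress_callback           = on_progress;
    params.progress_callback_user_data = nullptr;

    llama_model * model = llama_load_model_from_file(argv[1], params);
    if (model == nullptr) {
        // Either loading failed or the callback returned false and the load was cancelled.
        fprintf(stderr, "\nmodel load failed or was cancelled\n");
        return 1;
    }

    fprintf(stderr, "\nmodel loaded\n");
    llama_free_model(model);
    return 0;
}
```

Since the loader only consults the return value at its progress checkpoints, cancellation takes effect at the next tensor boundary rather than instantly, and `llama_load_model_from_file()` reports it to the caller by logging "cancelled model load" and returning `nullptr`.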