From b0915d5b51cbaa982ce9bbb9ce302bb9abdca0eb Mon Sep 17 00:00:00 2001 From: SRHMorris <69468379+SRHMorris@users.noreply.github.com> Date: Sun, 6 Oct 2024 08:34:20 +0100 Subject: [PATCH 1/3] vulkan : retry allocation with fallback flags (whisper/2451) Co-authored-by: Samuel Morris --- ggml/src/ggml-vulkan.cpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 12ad9d810..30bd376da 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -1070,10 +1070,25 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor try { buf->device_memory = device->device.allocateMemory({ mem_req.size, memory_type_index }); } catch (const vk::SystemError& e) { - // Out of Host/Device memory, clean up buffer - device->device.destroyBuffer(buf->buffer); - buf->size = 0; - throw e; + if (buf->memory_property_flags != fallback_flags) { + // Try again with fallback flags + memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags); + buf->memory_property_flags = fallback_flags; + + try { + buf->device_memory = device->device.allocateMemory({ mem_req.size, memory_type_index }); + } + catch (const vk::SystemError& e) { + device->device.destroyBuffer(buf->buffer); + buf->size = 0; + throw e; + } + } else { + // Out of Host/Device memory, clean up buffer + device->device.destroyBuffer(buf->buffer); + buf->size = 0; + throw e; + } } buf->ptr = nullptr; From b6d6c5289f1c9c677657c380591201ddb210b649 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 6 Oct 2024 12:53:28 +0300 Subject: [PATCH 2/3] sync : llama.cpp --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 5c92cdfd6..3cca9cc2f 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -0d7ecbbe536dc84240f646e0ec0a712251377f34 +564f42082f858f9674b2a2e06e9e779d9ed2c754 From f4b2dcdf4992ef11a854abc9b662624490e37b4c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 6 Oct 2024 13:49:41 +0300 Subject: [PATCH 3/3] readme : fix typo [no ci] --- examples/main/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/main/README.md b/examples/main/README.md index 6730effdf..f0c3031ab 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -69,7 +69,7 @@ In this section, we cover the most commonly used options for running the `llama- - `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. - `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\' - `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has. -- - `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. +- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. ## Input Prompts