diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 9c5091e61..2cad27e82 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1230,8 +1230,20 @@ struct clip_image_f32 * clip_image_f32_init() { return new clip_image_f32(); } -void clip_image_u8_free (struct clip_image_u8 * img) { delete img; } +void clip_image_u8_free(struct clip_image_u8 * img) { delete img; } void clip_image_f32_free(struct clip_image_f32 * img) { delete img; } +void clip_image_u8_batch_free(struct clip_image_u8_batch & batch) { + if (batch.size > 0) { + delete[] batch.data; + batch.size = 0; + } +} +void clip_image_f32_batch_free(struct clip_image_f32_batch & batch) { + if (batch.size > 0) { + delete[] batch.data; + batch.size = 0; + } +} static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) { img->nx = nx; @@ -1494,11 +1506,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli pad_to_square = false; } // free the previous res_imgs if any set - if (res_imgs.size > 0 && res_imgs.size < 100) { - for (size_t i = 0; i < res_imgs.size; i++) { - clip_image_f32_free(&(res_imgs.data[i])); - } - delete[] res_imgs.data; + if (res_imgs.size > 0) { + clip_image_f32_batch_free(res_imgs); } res_imgs.data = nullptr; res_imgs.size = 0; @@ -1650,7 +1659,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli res_imgs.size = 1; res_imgs.data = new clip_image_f32[res_imgs.size]; - res_imgs.data[0] = std::move(*res); + res_imgs.data[0] = *res; + clip_image_f32_free(res); return true; } diff --git a/examples/llava/clip.h b/examples/llava/clip.h index cd9a4022f..e5bd54924 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -60,6 +60,8 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init(); CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); +CLIP_API void clip_image_u8_batch_free (struct 
clip_image_u8_batch & batch); +CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 22953417f..4ed310a0e 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -100,7 +100,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector int num_patches_width = grid_shape.first; // grid 1-4 int num_patches_height = grid_shape.second; // grid 1-4 - const size_t num_images = num_patches_width + num_patches_height + 1; + const size_t num_images = num_patches_width * num_patches_height + 1; // TODO: size calculation is not calculated - it's only tens of MB size_t ctx_size = 0; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1873dad2d..1129de203 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -975,7 +975,12 @@ struct llama_server_context { LOG_TEE("Error processing the given image"); clip_free(clp_ctx); - clip_image_f32_free(img_res_v.data); + clip_image_f32_batch_free(img_res_v); + return false; + } + if (img_res_v.size == 0) + { + LOG_TEE("Error processing the given image"); return false; } @@ -987,6 +992,7 @@ struct llama_server_context if (!img.image_embedding) { LOG_TEE("Unable to allocate memory for image embeddings\n"); + clip_image_f32_batch_free(img_res_v); clip_free(clp_ctx); return false; } @@ -994,10 +1000,11 @@ struct llama_server_context if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding)) { LOG_TEE("Unable to encode image\n"); + clip_image_f32_batch_free(img_res_v); return false; } - clip_image_f32_free(img_res_v.data); + clip_image_f32_batch_free(img_res_v); img.request_encode_image = false; } diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 7834e635c..1fad24fd1 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -707,9 +707,21 @@ static void 
ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) { q.cmd_buffer_idx = 0; } -static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) { +static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) { + for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) { + vk::MemoryType memory_type = mem_props->memoryTypes[i]; + if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) && + (flags & memory_type.propertyFlags) == flags && + mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) { + return static_cast<uint32_t>(i); + } + } + return UINT32_MAX; +} + +static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl; + std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl; #endif vk_buffer buf = std::make_shared(); @@ -736,15 +748,15 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz uint32_t memory_type_index = UINT32_MAX; - for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) { - vk::MemoryType memory_type = mem_props.memoryTypes[i]; - if ((mem_req.memoryTypeBits & ((uint64_t)1 << i)) && (req_flags & memory_type.propertyFlags) == req_flags && mem_props.memoryHeaps[memory_type.heapIndex].size >= mem_req.size) { - memory_type_index = i; - break; - } + memory_type_index = find_properties(&mem_props, &mem_req, req_flags); + buf->memory_property_flags = req_flags; + + if (memory_type_index == UINT32_MAX && fallback_flags) { + memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags); + buf->memory_property_flags = fallback_flags; 
} - if (memory_type_index >= mem_props.memoryTypeCount) { + if (memory_type_index == UINT32_MAX) { ctx->device.lock()->device.destroyBuffer(buf->buffer); buf->size = 0; throw vk::OutOfDeviceMemoryError("No suitable memory type found"); @@ -758,10 +770,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz buf->size = 0; throw e; } - buf->memory_property_flags = req_flags; buf->ptr = nullptr; - if (req_flags & vk::MemoryPropertyFlagBits::eHostVisible) { + if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE); } @@ -778,9 +789,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz return buf; } -static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) { +static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) { try { - return ggml_vk_create_buffer(ctx, size, req_flags); + return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags); } catch (const vk::SystemError& e) { std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." 
<< std::endl; std::cerr << "ggml_vulkan: " << e.what() << std::endl; @@ -791,16 +802,16 @@ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) { vk_buffer buf; try { - buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal); - } catch (const vk::SystemError& e) { if (ctx->device.lock()->uma) { // Fall back to host memory type - buf = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); } else { - std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl; - std::cerr << "ggml_vulkan: " << e.what() << std::endl; - throw e; + buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal); } + } catch (const vk::SystemError& e) { + std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." 
<< std::endl; + std::cerr << "ggml_vulkan: " << e.what() << std::endl; + throw e; } return buf; @@ -1422,7 +1433,9 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl; #endif - vk_buffer buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); + vk_buffer buf = ggml_vk_create_buffer(ctx, size, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) { fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n", @@ -1568,7 +1581,9 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) { if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) { ggml_vk_destroy_buffer(ctx->sync_staging); - ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); + ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); } } @@ -4082,7 +4097,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << 
ctx->prealloc_size_split_k << ")" << std::endl; #endif #if defined(GGML_VULKAN_RUN_TESTS) - ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); + ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); ggml_vk_test_transfer(ctx, 8192 * 1000, false); ggml_vk_test_transfer(ctx, 8192 * 1000, true); @@ -4174,7 +4191,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { if (ctx->staging != nullptr) { ggml_vk_destroy_buffer(ctx->staging); } - ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); + ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); } } diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 5fba01714..9986ce9de 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -73,6 +73,8 @@ class Keys: UNK_ID = "tokenizer.ggml.unknown_token_id" SEP_ID = "tokenizer.ggml.seperator_token_id" PAD_ID = "tokenizer.ggml.padding_token_id" + CLS_ID = "tokenizer.ggml.cls_token_id" + MASK_ID = "tokenizer.ggml.mask_token_id" ADD_BOS = "tokenizer.ggml.add_bos_token" ADD_EOS = "tokenizer.ggml.add_eos_token" ADD_PREFIX = "tokenizer.ggml.add_space_prefix" @@ -685,5 +687,7 @@ KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID 
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID +KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID +KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index d87bd8e88..26724bf94 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -414,6 +414,12 @@ class GGUFWriter: def add_pad_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.PAD_ID, id) + def add_cls_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.CLS_ID, id) + + def add_mask_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.MASK_ID, id) + def add_add_bos_token(self, value: bool) -> None: self.add_bool(Keys.Tokenizer.ADD_BOS, value) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index cd1942975..a23136b18 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -29,7 +29,7 @@ class SpecialVocab: if special_token_types is not None: self.special_token_types = special_token_types else: - self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad') + self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad', 'cls', 'mask') self._load(Path(path)) def __repr__(self) -> str: @@ -152,10 +152,6 @@ class SpecialVocab: add_entry = tokenizer_config.get(f'add_{typ}_token') if isinstance(add_entry, bool): self.add_special_token[typ] = add_entry - if not added_tokens: - # We will need this to get the content for the token, so if it's empty - # may as well just give up. 
- continue entry = tokenizer_config.get(f'{typ}_token') if isinstance(entry, str): tc_content = entry diff --git a/scripts/hf.sh b/scripts/hf.sh new file mode 100755 index 000000000..1e9e5a6ea --- /dev/null +++ b/scripts/hf.sh @@ -0,0 +1,107 @@ +#!/bin/bash +# +# Shortcut for downloading HF models +# +# Usage: +# ./main -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) +# ./main -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) +# ./main -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf) +# + +# all logs go to stderr +function log { + echo "$@" 1>&2 +} + +function usage { + log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [-h|--help]" + exit 1 +} + +# check for curl or wget +function has_cmd { + if ! [ -x "$(command -v $1)" ]; then + return 1 + fi +} + +if has_cmd wget; then + cmd="wget -q --show-progress -c -O %s %s" +elif has_cmd curl; then + cmd="curl -C - -f -o %s -L %s" +else + log "[E] curl or wget not found" + exit 1 +fi + +url="" +repo="" +file="" + +# parse args +while [[ $# -gt 0 ]]; do + case "$1" in + --url) + url="$2" + shift 2 + ;; + --repo) + repo="$2" + shift 2 + ;; + --file) + file="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + url="$1" + shift + ;; + esac +done + +if [ -n "$repo" ] && [ -n "$file" ]; then + url="https://huggingface.co/$repo/resolve/main/$file" +fi + +if [ -z "$url" ]; then + log "[E] missing --url" + usage +fi + +# check if the URL is a HuggingFace model, and if so, try to download it +is_url=false + +if [[ ${#url} -gt 22 ]]; then + if [[ ${url:0:22} == "https://huggingface.co" ]]; then + is_url=true + fi +fi + +if [ "$is_url" = false ]; then + log "[E] invalid URL, must start with https://huggingface.co" + exit 0 +fi + +# replace "blob/main" with "resolve/main" +url=${url/blob\/main/resolve\/main} + +basename=$(basename $url) + +log "[+] 
attempting to download $basename" + +if [ -n "$cmd" ]; then + cmd=$(printf "$cmd" "$basename" "$url") + log "[+] $cmd" + if $cmd; then + echo $basename + exit 0 + fi +fi + +log "[-] failed to download" + +exit 1