diff --git a/README.md b/README.md index 0b4efdd33..0c4ee5a27 100644 --- a/README.md +++ b/README.md @@ -958,7 +958,7 @@ We have three Docker images available for this project: 1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`) 2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`) -3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executabhle file. (platforms: `linux/amd64`, `linux/arm64`) +3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`) Additionally, there the following images, similar to the above: diff --git a/ci/run.sh b/ci/run.sh index a4264d775..979b4a793 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -580,6 +580,10 @@ function gg_run_embd_bge_small { gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/modules.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json + + gg_wget models-mnt/bge-small/1_Pooling https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json path_models="../models-mnt/bge-small" diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index ae471481d..9771fccf9 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1650,7 +1650,29 @@ class BertModel(Model): def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_causal_attention(False) - self.gguf_writer.add_pooling_layer(True) + + # get pooling path + with open(self.dir_model / "modules.json", encoding="utf-8") as f: + modules = json.load(f) + pooling_path = None + for mod in modules: + if mod["type"] == "sentence_transformers.models.Pooling": + pooling_path = mod["path"] + break + + # get pooling type + pooling_type = gguf.PoolingType.NONE + if pooling_path is not None: + with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: + pooling = json.load(f) + if pooling["pooling_mode_mean_tokens"]: + pooling_type = gguf.PoolingType.MEAN + elif pooling["pooling_mode_cls_token"]: + pooling_type = gguf.PoolingType.CLS + else: + raise NotImplementedError("Only MEAN and CLS pooling types supported") + + self.gguf_writer.add_pooling_type(pooling_type.value) def set_vocab(self): path = self.dir_model diff --git a/examples/llava/README.md b/examples/llava/README.md index e2ef0eff1..1d5374f2a 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -1,10 +1,12 @@ # LLaVA -Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants. +Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants, +as well as llava-1.6 [llava-v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) variants. The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) models are available. 
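
The pooling-type detection added to `convert-hf-to-gguf.py` above follows the standard sentence-transformers directory layout (`modules.json` plus a pooling module's `config.json`, the same files `ci/run.sh` now downloads for `bge-small`). Below is a minimal standalone sketch of that logic; the function name `detect_pooling_type` is illustrative, and it returns plain strings instead of `gguf.PoolingType` so it runs with the standard library only.

```python
import json
from pathlib import Path


def detect_pooling_type(model_dir: str) -> str:
    """Infer the pooling type from a sentence-transformers model directory.

    Mirrors the logic added to BertModel.set_gguf_parameters: modules.json
    lists the pipeline modules, and the Pooling module's config.json says
    whether mean or CLS pooling is used.
    """
    model_dir = Path(model_dir)

    # modules.json enumerates the sentence-transformers pipeline stages
    with open(model_dir / "modules.json", encoding="utf-8") as f:
        modules = json.load(f)

    pooling_path = None
    for mod in modules:
        if mod["type"] == "sentence_transformers.models.Pooling":
            pooling_path = mod["path"]  # typically "1_Pooling"
            break

    if pooling_path is None:
        return "NONE"

    # the pooling module's own config.json holds the mode flags
    with open(model_dir / pooling_path / "config.json", encoding="utf-8") as f:
        pooling = json.load(f)

    if pooling["pooling_mode_mean_tokens"]:
        return "MEAN"
    if pooling["pooling_mode_cls_token"]:
        return "CLS"
    raise NotImplementedError("Only MEAN and CLS pooling types supported")


if __name__ == "__main__":
    # the path below is where ci/run.sh places the downloaded BGE files
    print(detect_pooling_type("models-mnt/bge-small"))
```

The converter itself maps these cases onto `gguf.PoolingType.MEAN` / `gguf.PoolingType.CLS` and writes the result with `add_pooling_type`, which `llama.cpp` then reads back as `LLAMA_POOLING_MEAN` or `LLAMA_POOLING_CLS`.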
+For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](https://huggingface.co/cmp-nct/llava-1.6-gguf) After API is confirmed, more models will be supported / uploaded. @@ -18,6 +20,7 @@ After building, run: `./llava-cli` to see the usage. For example: ``` **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. +**note**: For GPU offloading ensure to use the `-ngl` flag just like usual ## LLaVA 1.5 @@ -55,11 +58,46 @@ python ./convert.py ../llava-v1.5-7b Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory. -## LLaVA 1.6 +## LLaVA 1.6 gguf conversion + +1) Backup your pth/safetensor model files as llava-surgery modifies them +2) Use `python llava-surgery-v2.py -C -m /path/to/hf-model` which also supports llava-1.5 variants pytorch as well as safetensor models: +- you will find a llava.projector and a llava.clip file in your model directory +3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory (https://huggingface.co/cmp-nct/llava-1.6-gguf/blob/main/config.json) +4) Create the visual gguf model: `python ./examples/llava/convert-image-encoder-to-gguf.py -m ../path/to/vit --llava-projector ../path/to/llava.projector --output-dir ../path/to/output --clip_model_is_vision` +- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP +5) Everything else as usual: convert.py the hf model, quantize as needed +**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096) +**note** llava-1.6 greatly benefits from batched prompt processing (defaults work) + +## llava-cli templating and llava-1.6 prompting + +llava-1.5 models all use the same vicuna prompt, here you can just add your image question like `-p "Provide a full description."` +For llava-1.5 models which are not vicuna (mistral and Yi) you need to adapt system prompt as well as user prompt, for this purpose llava-cli has a basic templating system: + +**For Mistral and using llava-cli binary:** +Add this: `-p "\nUSER:\nProvide a full description.\nASSISTANT:\n"` +The mistral template for llava-1.6 seems to be no system print and a USER/ASSISTANT role + +**For the 34B this should work:** +Add this: `-e -p <|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nProvide a full description.<|im_end|><|im_start|>assistant\n` + + +## How to know if you are running in llava-1.5 or llava-1.6 mode + +When running llava-cli you will see a visual information right before the prompt is being processed: + +**Llava-1.5:** +`encode_image_with_clip: image embedding created: 576 tokens` + +**Llava-1.6 (anything above 576):** +`encode_image_with_clip: image embedding created: 2880 tokens` + + +Alternatively just pay notice to how many "tokens" have been used for your prompt, it will also show 1000+ tokens for llava-1.6 + -- Use `llava-surgery-v2.py` -- TODO: add detailed instructions ## TODO diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 9c5091e61..98d512f67 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1103,7 +1103,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { printf("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); printf("v_image_std %f %f %f\n", new_clip->image_std[0], 
new_clip->image_std[1], new_clip->image_std[2]); printf("v_image_grid_pinpoints: "); - for (int i = 0; i < 32 & hparams.image_grid_pinpoints[i]!=0; ++i) { + for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) { printf("%d ", hparams.image_grid_pinpoints[i]); } printf("\n"); @@ -1230,8 +1230,20 @@ struct clip_image_f32 * clip_image_f32_init() { return new clip_image_f32(); } -void clip_image_u8_free (struct clip_image_u8 * img) { delete img; } +void clip_image_u8_free(struct clip_image_u8 * img) { delete img; } void clip_image_f32_free(struct clip_image_f32 * img) { delete img; } +void clip_image_u8_batch_free(struct clip_image_u8_batch & batch) { + if (batch.size > 0) { + delete[] batch.data; + batch.size = 0; + } +} +void clip_image_f32_batch_free(struct clip_image_f32_batch & batch) { + if (batch.size > 0) { + delete[] batch.data; + batch.size = 0; + } +} static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) { img->nx = nx; @@ -1494,11 +1506,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli pad_to_square = false; } // free the previous res_imgs if any set - if (res_imgs.size > 0 && res_imgs.size < 100) { - for (size_t i = 0; i < res_imgs.size; i++) { - clip_image_f32_free(&(res_imgs.data[i])); - } - delete[] res_imgs.data; + if (res_imgs.size > 0) { + clip_image_f32_batch_free(res_imgs); } res_imgs.data = nullptr; res_imgs.size = 0; @@ -1650,7 +1659,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli res_imgs.size = 1; res_imgs.data = new clip_image_f32[res_imgs.size]; - res_imgs.data[0] = std::move(*res); + res_imgs.data[0] = *res; + clip_image_f32_free(res); return true; } diff --git a/examples/llava/clip.h b/examples/llava/clip.h index cd9a4022f..e5bd54924 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -60,6 +60,8 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init(); CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); +CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch & batch); +CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 22953417f..4ed310a0e 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -100,7 +100,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector int num_patches_width = grid_shape.first; // grid 1-4 int num_patches_height = grid_shape.second; // grid 1-4 - const size_t num_images = num_patches_width + num_patches_height + 1; + const size_t num_images = num_patches_width * num_patches_height + 1; // TODO: size calculation is not calculated - it's only tens of MB size_t ctx_size = 0; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6e3434030..2decd7762 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -975,7 +975,12 @@ struct llama_server_context { LOG_TEE("Error processing the given image"); clip_free(clp_ctx); - clip_image_f32_free(img_res_v.data); + clip_image_f32_batch_free(img_res_v); + return false; + } + if (img_res_v.size == 0) + { + LOG_TEE("Error processing the given image"); return false; } @@ -987,6 +992,7 @@ struct llama_server_context if (!img.image_embedding) { LOG_TEE("Unable to allocate memory for image embeddings\n"); + 
clip_image_f32_batch_free(img_res_v); clip_free(clp_ctx); return false; } @@ -994,10 +1000,11 @@ struct llama_server_context if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding)) { LOG_TEE("Unable to encode image\n"); + clip_image_f32_batch_free(img_res_v); return false; } - clip_image_f32_free(img_res_v.data); + clip_image_f32_batch_free(img_res_v); img.request_encode_image = false; } diff --git a/ggml-cuda.cu b/ggml-cuda.cu index a107cbe7e..5fd8a87e4 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -7790,6 +7790,7 @@ GGML_CALL void ggml_init_cublas() { if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) { initialized = true; g_cublas_loaded = false; + fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__); return; } diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 7834e635c..1fad24fd1 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -707,9 +707,21 @@ static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) { q.cmd_buffer_idx = 0; } -static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) { +static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) { + for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) { + vk::MemoryType memory_type = mem_props->memoryTypes[i]; + if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) && + (flags & memory_type.propertyFlags) == flags && + mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) { + return static_cast(i); + } + } + return UINT32_MAX; +} + +static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl; + std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl; #endif vk_buffer buf = std::make_shared(); @@ -736,15 +748,15 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz uint32_t memory_type_index = UINT32_MAX; - for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) { - vk::MemoryType memory_type = mem_props.memoryTypes[i]; - if ((mem_req.memoryTypeBits & ((uint64_t)1 << i)) && (req_flags & memory_type.propertyFlags) == req_flags && mem_props.memoryHeaps[memory_type.heapIndex].size >= mem_req.size) { - memory_type_index = i; - break; - } + memory_type_index = find_properties(&mem_props, &mem_req, req_flags); + buf->memory_property_flags = req_flags; + + if (memory_type_index == UINT32_MAX && fallback_flags) { + memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags); + buf->memory_property_flags = fallback_flags; } - if (memory_type_index >= mem_props.memoryTypeCount) { + if (memory_type_index == UINT32_MAX) { ctx->device.lock()->device.destroyBuffer(buf->buffer); buf->size = 0; throw vk::OutOfDeviceMemoryError("No suitable memory type found"); @@ -758,10 +770,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz buf->size = 0; throw e; } - buf->memory_property_flags = req_flags; buf->ptr = nullptr; - if (req_flags & vk::MemoryPropertyFlagBits::eHostVisible) { + if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { buf->ptr = 
ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE); } @@ -778,9 +789,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz return buf; } -static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) { +static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) { try { - return ggml_vk_create_buffer(ctx, size, req_flags); + return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags); } catch (const vk::SystemError& e) { std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl; std::cerr << "ggml_vulkan: " << e.what() << std::endl; @@ -791,16 +802,16 @@ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) { vk_buffer buf; try { - buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal); - } catch (const vk::SystemError& e) { if (ctx->device.lock()->uma) { // Fall back to host memory type - buf = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); } else { - std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl; - std::cerr << "ggml_vulkan: " << e.what() << std::endl; - throw e; + buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal); } + } catch (const vk::SystemError& e) { + std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." 
<< std::endl; + std::cerr << "ggml_vulkan: " << e.what() << std::endl; + throw e; } return buf; @@ -1422,7 +1433,9 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl; #endif - vk_buffer buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); + vk_buffer buf = ggml_vk_create_buffer(ctx, size, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) { fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n", @@ -1568,7 +1581,9 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) { if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) { ggml_vk_destroy_buffer(ctx->sync_staging); - ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); + ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); } } @@ -4082,7 +4097,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl; #endif #if defined(GGML_VULKAN_RUN_TESTS) - ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); + ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); ggml_vk_test_transfer(ctx, 8192 * 1000, false); ggml_vk_test_transfer(ctx, 8192 * 1000, true); @@ -4174,7 +4191,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { if (ctx->staging != nullptr) { ggml_vk_destroy_buffer(ctx->staging); } - ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); + ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); } } diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 5fba01714..114a9a974 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -40,7 +40,7 @@ class Keys: TENSOR_DATA_LAYOUT = 
"{arch}.tensor_data_layout" EXPERT_COUNT = "{arch}.expert_count" EXPERT_USED_COUNT = "{arch}.expert_used_count" - POOLING_LAYER = "{arch}.pooling_layer" + POOLING_TYPE = "{arch}.pooling_type" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -73,6 +73,8 @@ class Keys: UNK_ID = "tokenizer.ggml.unknown_token_id" SEP_ID = "tokenizer.ggml.seperator_token_id" PAD_ID = "tokenizer.ggml.padding_token_id" + CLS_ID = "tokenizer.ggml.cls_token_id" + MASK_ID = "tokenizer.ggml.mask_token_id" ADD_BOS = "tokenizer.ggml.add_bos_token" ADD_EOS = "tokenizer.ggml.add_eos_token" ADD_PREFIX = "tokenizer.ggml.add_space_prefix" @@ -559,6 +561,12 @@ class RopeScalingType(Enum): YARN = 'yarn' +class PoolingType(IntEnum): + NONE = 0 + MEAN = 1 + CLS = 2 + + class GGMLQuantizationType(IntEnum): F32 = 0 F16 = 1 @@ -685,5 +693,7 @@ KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID +KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID +KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index d87bd8e88..e4681475c 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -19,6 +19,7 @@ from .constants import ( GGUFValueType, Keys, RopeScalingType, + PoolingType, TokenType, ) @@ -360,8 +361,8 @@ class GGUFWriter: def add_causal_attention(self, value: bool) -> None: self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) - def add_pooling_layer(self, value: bool) -> None: - self.add_bool(Keys.LLM.POOLING_LAYER.format(arch=self.arch), value) + def add_pooling_type(self, value: PoolingType) -> None: + self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value) def add_rope_dimension_count(self, count: int) -> None: self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) @@ -414,6 +415,12 @@ class GGUFWriter: def add_pad_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.PAD_ID, id) + def add_cls_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.CLS_ID, id) + + def add_mask_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.MASK_ID, id) + def add_add_bos_token(self, value: bool) -> None: self.add_bool(Keys.Tokenizer.ADD_BOS, value) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index cd1942975..a23136b18 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -29,7 +29,7 @@ class SpecialVocab: if special_token_types is not None: self.special_token_types = special_token_types else: - self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad') + self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad', 'cls', 'mask') self._load(Path(path)) def __repr__(self) -> str: @@ -152,10 +152,6 @@ class SpecialVocab: add_entry = tokenizer_config.get(f'add_{typ}_token') if isinstance(add_entry, bool): self.add_special_token[typ] = add_entry - if not added_tokens: - # We will need this to get the content for the token, so if it's empty - # may as well just give up. 
- continue entry = tokenizer_config.get(f'{typ}_token') if isinstance(entry, str): tc_content = entry diff --git a/llama.cpp b/llama.cpp index 14d86cd87..8e872f3d7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -256,7 +256,7 @@ enum llm_kv { LLM_KV_TENSOR_DATA_LAYOUT, LLM_KV_EXPERT_COUNT, LLM_KV_EXPERT_USED_COUNT, - LLM_KV_POOLING_LAYER, + LLM_KV_POOLING_TYPE, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -314,7 +314,7 @@ static std::map LLM_KV_NAMES = { { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" }, { LLM_KV_EXPERT_COUNT, "%s.expert_count" }, { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" }, - { LLM_KV_POOLING_LAYER, "%s.pooling_layer" }, + { LLM_KV_POOLING_TYPE , "%s.pooling_type" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -1560,9 +1560,9 @@ struct llama_hparams { float f_clamp_kqv = 0.0f; float f_max_alibi_bias = 0.0f; - bool causal_attn = true; - bool pooling_layer = false; + bool causal_attn = true; + uint32_t pooling_type = LLAMA_POOLING_NONE; bool operator!=(const llama_hparams & other) const { if (this->vocab_only != other.vocab_only) return true; @@ -1925,7 +1925,8 @@ struct llama_context { struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch] struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx] struct ggml_tensor * inp_K_shift; // I32 [n_ctx] - struct ggml_tensor * inp_sum; // F32 [n_batch, n_batch] + struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] + struct ggml_tensor * inp_cls; // I32 [n_batch] #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; @@ -3095,7 +3096,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); - ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); switch (hparams.n_layer) { case 3: @@ -3116,7 +3117,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); - ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); if (hparams.n_layer == 12 && hparams.n_embd == 768) { model.type = e_model::MODEL_137M; @@ -4943,7 +4944,7 @@ struct llm_build_context { const int32_t n_orig_ctx; const bool do_rope_shift; - const bool do_pooling; + const uint32_t pooling_type; const llm_build_cb & cb; @@ -4987,7 +4988,7 @@ struct llm_build_context { kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), do_rope_shift (worst_case || kv_self.has_shift), - do_pooling (hparams.pooling_layer && cparams.do_pooling), + pooling_type (cparams.do_pooling ? 
hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { // all initializations should be done in init() @@ -5849,7 +5850,8 @@ struct llm_build_context { // get input vectors with right size const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type); struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); - struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0); + struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0); + struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0); // construct input embeddings (token, type, position) inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); @@ -5966,8 +5968,12 @@ struct llm_build_context { cur = inpL; // pooling layer - if (do_pooling) { - cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum); + if (pooling_type == LLAMA_POOLING_MEAN) { + cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean); + } else if (pooling_type == LLAMA_POOLING_CLS) { + cur = ggml_get_rows(ctx0, cur, inp_cls); + } else { + GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type"); } cb(cur, "result_embd", -1); @@ -7535,15 +7541,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - { - assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer)); - float * data = (float *) lctx.inp_sum->data; - - for (int i = 0; i < batch.n_tokens; ++i) { - data[i] = 1.0f/float(batch.n_tokens); - } - } - if (kv_self.has_shift) { const int64_t n_ctx = cparams.n_ctx; @@ -7556,17 +7553,46 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (hparams.pooling_layer && cparams.do_pooling) { + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) { const int64_t n_tokens = batch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer)); - float * data = (float *) lctx.inp_sum->data; + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); + float * data = (float *) lctx.inp_mean->data; - memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum)); + memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean)); + + std::vector sum(n_tokens, 0); + for (int i = 0; i < n_tokens; ++i) { + const llama_seq_id seq_id = batch.seq_id[i][0]; + sum[seq_id] += 1; + } + + std::vector div(n_tokens, 0.0f); + for (int i = 0; i < n_tokens; ++i) { + const uint64_t s = sum[i]; + if (s > 0) { + div[i] = 1.0f/float(s); + } + } for (int i = 0; i < n_tokens; ++i) { const llama_seq_id seq_id = batch.seq_id[i][0]; - data[seq_id*n_tokens + i] = 1.0f; + data[seq_id*n_tokens + i] = div[seq_id]; + } + } + + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) { + const int64_t n_tokens = batch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); + uint32_t * data = (uint32_t *) lctx.inp_cls->data; + + for (int i = 0; i < n_tokens; ++i) { + const llama_seq_id seq_id = batch.seq_id[i][0]; + const llama_pos pos = batch.pos[i]; + if (pos == 0) { + data[seq_id] = i; + } } } } @@ -11446,21 +11472,23 @@ struct llama_context * llama_new_context_with_model( }; ctx->ctx_input = ggml_init(init_params); - ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); - ctx->inp_embd = 
ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch); - ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); - ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch); - ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx); - ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx); - ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch); + ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); + ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch); + ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); + ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch); + ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx); + ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx); + ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch); + ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); - ggml_set_name(ctx->inp_tokens, "inp_tokens"); - ggml_set_name(ctx->inp_embd, "inp_embd"); - ggml_set_name(ctx->inp_pos, "inp_pos"); - ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask"); - ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos"); - ggml_set_name(ctx->inp_K_shift, "inp_K_shift"); - ggml_set_name(ctx->inp_sum, "inp_sum"); + ggml_set_name(ctx->inp_tokens, "inp_tokens"); + ggml_set_name(ctx->inp_embd, "inp_embd"); + ggml_set_name(ctx->inp_pos, "inp_pos"); + ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask"); + ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos"); + ggml_set_name(ctx->inp_K_shift, "inp_K_shift"); + ggml_set_name(ctx->inp_mean, "inp_mean"); + ggml_set_name(ctx->inp_cls, "inp_cls"); ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true)); diff --git a/llama.h b/llama.h index 5ef78ec96..4a26bd619 100644 --- a/llama.h +++ b/llama.h @@ -112,6 +112,12 @@ extern "C" { LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, }; + enum llama_pooling_type { + LLAMA_POOLING_NONE = 0, + LLAMA_POOLING_MEAN = 1, + LLAMA_POOLING_CLS = 2, + }; + enum llama_split_mode { LLAMA_SPLIT_NONE = 0, // single GPU LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs diff --git a/scripts/hf.sh b/scripts/hf.sh new file mode 100755 index 000000000..1e9e5a6ea --- /dev/null +++ b/scripts/hf.sh @@ -0,0 +1,107 @@ +#!/bin/bash +# +# Shortcut for downloading HF models +# +# Usage: +# ./main -m $(./examples/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) +# ./main -m $(./examples/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) +# ./main -m $(./examples/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf) +# + +# all logs go to stderr +function log { + echo "$@" 1>&2 +} + +function usage { + log "Usage: $0 [[--url] ] [--repo ] [--file ] [-h|--help]" + exit 1 +} + +# check for curl or wget +function has_cmd { + if ! 
[ -x "$(command -v $1)" ]; then + return 1 + fi +} + +if has_cmd wget; then + cmd="wget -q --show-progress -c -O %s %s" +elif has_cmd curl; then + cmd="curl -C - -f -o %s -L %s" +else + log "[E] curl or wget not found" + exit 1 +fi + +url="" +repo="" +file="" + +# parse args +while [[ $# -gt 0 ]]; do + case "$1" in + --url) + url="$2" + shift 2 + ;; + --repo) + repo="$2" + shift 2 + ;; + --file) + file="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + url="$1" + shift + ;; + esac +done + +if [ -n "$repo" ] && [ -n "$file" ]; then + url="https://huggingface.co/$repo/resolve/main/$file" +fi + +if [ -z "$url" ]; then + log "[E] missing --url" + usage +fi + +# check if the URL is a HuggingFace model, and if so, try to download it +is_url=false + +if [[ ${#url} -gt 22 ]]; then + if [[ ${url:0:22} == "https://huggingface.co" ]]; then + is_url=true + fi +fi + +if [ "$is_url" = false ]; then + log "[E] invalid URL, must start with https://huggingface.co" + exit 0 +fi + +# replace "blob/main" with "resolve/main" +url=${url/blob\/main/resolve\/main} + +basename=$(basename $url) + +log "[+] attempting to download $basename" + +if [ -n "$cmd" ]; then + cmd=$(printf "$cmd" "$basename" "$url") + log "[+] $cmd" + if $cmd; then + echo $basename + exit 0 + fi +fi + +log "[-] failed to download" + +exit 1