Merge branch 'master' into gg/refactor-alibi
This commit is contained in:
commit
ac91033ccb
16 changed files with 345 additions and 88 deletions
|
@ -958,7 +958,7 @@ We have three Docker images available for this project:
|
|||
|
||||
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executabhle file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
|
||||
Additionally, there the following images, similar to the above:
|
||||
|
||||
|
|
|
@ -580,6 +580,10 @@ function gg_run_embd_bge_small {
|
|||
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
|
||||
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
|
||||
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
|
||||
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/modules.json
|
||||
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
|
||||
|
||||
gg_wget models-mnt/bge-small/1_Pooling https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json
|
||||
|
||||
path_models="../models-mnt/bge-small"
|
||||
|
||||
|
|
|
@ -1650,7 +1650,29 @@ class BertModel(Model):
|
|||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_causal_attention(False)
|
||||
self.gguf_writer.add_pooling_layer(True)
|
||||
|
||||
# get pooling path
|
||||
with open(self.dir_model / "modules.json", encoding="utf-8") as f:
|
||||
modules = json.load(f)
|
||||
pooling_path = None
|
||||
for mod in modules:
|
||||
if mod["type"] == "sentence_transformers.models.Pooling":
|
||||
pooling_path = mod["path"]
|
||||
break
|
||||
|
||||
# get pooling type
|
||||
pooling_type = gguf.PoolingType.NONE
|
||||
if pooling_path is not None:
|
||||
with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
|
||||
pooling = json.load(f)
|
||||
if pooling["pooling_mode_mean_tokens"]:
|
||||
pooling_type = gguf.PoolingType.MEAN
|
||||
elif pooling["pooling_mode_cls_token"]:
|
||||
pooling_type = gguf.PoolingType.CLS
|
||||
else:
|
||||
raise NotImplementedError("Only MEAN and CLS pooling types supported")
|
||||
|
||||
self.gguf_writer.add_pooling_type(pooling_type.value)
|
||||
|
||||
def set_vocab(self):
|
||||
path = self.dir_model
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
# LLaVA
|
||||
|
||||
Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants.
|
||||
Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants,
|
||||
as well as llava-1.6 [llava-v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) variants.
|
||||
|
||||
The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
|
||||
and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
|
||||
models are available.
|
||||
For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](https://huggingface.co/cmp-nct/llava-1.6-gguf)
|
||||
|
||||
After API is confirmed, more models will be supported / uploaded.
|
||||
|
||||
|
@ -18,6 +20,7 @@ After building, run: `./llava-cli` to see the usage. For example:
|
|||
```
|
||||
|
||||
**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
|
||||
**note**: For GPU offloading ensure to use the `-ngl` flag just like usual
|
||||
|
||||
## LLaVA 1.5
|
||||
|
||||
|
@ -55,11 +58,46 @@ python ./convert.py ../llava-v1.5-7b
|
|||
|
||||
Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.
|
||||
|
||||
## LLaVA 1.6
|
||||
## LLaVA 1.6 gguf conversion
|
||||
|
||||
1) Backup your pth/safetensor model files as llava-surgery modifies them
|
||||
2) Use `python llava-surgery-v2.py -C -m /path/to/hf-model` which also supports llava-1.5 variants pytorch as well as safetensor models:
|
||||
- you will find a llava.projector and a llava.clip file in your model directory
|
||||
3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory (https://huggingface.co/cmp-nct/llava-1.6-gguf/blob/main/config.json)
|
||||
4) Create the visual gguf model: `python ./examples/llava/convert-image-encoder-to-gguf.py -m ../path/to/vit --llava-projector ../path/to/llava.projector --output-dir ../path/to/output --clip_model_is_vision`
|
||||
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
|
||||
5) Everything else as usual: convert.py the hf model, quantize as needed
|
||||
**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
|
||||
**note** llava-1.6 greatly benefits from batched prompt processing (defaults work)
|
||||
|
||||
## llava-cli templating and llava-1.6 prompting
|
||||
|
||||
llava-1.5 models all use the same vicuna prompt, here you can just add your image question like `-p "Provide a full description."`
|
||||
For llava-1.5 models which are not vicuna (mistral and Yi) you need to adapt system prompt as well as user prompt, for this purpose llava-cli has a basic templating system:
|
||||
|
||||
**For Mistral and using llava-cli binary:**
|
||||
Add this: `-p "<image>\nUSER:\nProvide a full description.\nASSISTANT:\n"`
|
||||
The mistral template for llava-1.6 seems to be no system print and a USER/ASSISTANT role
|
||||
|
||||
**For the 34B this should work:**
|
||||
Add this: `-e -p <|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nProvide a full description.<|im_end|><|im_start|>assistant\n`
|
||||
|
||||
|
||||
## How to know if you are running in llava-1.5 or llava-1.6 mode
|
||||
|
||||
When running llava-cli you will see a visual information right before the prompt is being processed:
|
||||
|
||||
**Llava-1.5:**
|
||||
`encode_image_with_clip: image embedding created: 576 tokens`
|
||||
|
||||
**Llava-1.6 (anything above 576):**
|
||||
`encode_image_with_clip: image embedding created: 2880 tokens`
|
||||
|
||||
|
||||
Alternatively just pay notice to how many "tokens" have been used for your prompt, it will also show 1000+ tokens for llava-1.6
|
||||
|
||||
|
||||
- Use `llava-surgery-v2.py`
|
||||
|
||||
- TODO: add detailed instructions
|
||||
|
||||
## TODO
|
||||
|
||||
|
|
|
@ -1103,7 +1103,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
printf("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
||||
printf("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
||||
printf("v_image_grid_pinpoints: ");
|
||||
for (int i = 0; i < 32 & hparams.image_grid_pinpoints[i]!=0; ++i) {
|
||||
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
|
||||
printf("%d ", hparams.image_grid_pinpoints[i]);
|
||||
}
|
||||
printf("\n");
|
||||
|
@ -1230,8 +1230,20 @@ struct clip_image_f32 * clip_image_f32_init() {
|
|||
return new clip_image_f32();
|
||||
}
|
||||
|
||||
void clip_image_u8_free (struct clip_image_u8 * img) { delete img; }
|
||||
void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
|
||||
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
|
||||
void clip_image_u8_batch_free(struct clip_image_u8_batch & batch) {
|
||||
if (batch.size > 0) {
|
||||
delete[] batch.data;
|
||||
batch.size = 0;
|
||||
}
|
||||
}
|
||||
void clip_image_f32_batch_free(struct clip_image_f32_batch & batch) {
|
||||
if (batch.size > 0) {
|
||||
delete[] batch.data;
|
||||
batch.size = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
|
||||
img->nx = nx;
|
||||
|
@ -1494,11 +1506,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
|||
pad_to_square = false;
|
||||
}
|
||||
// free the previous res_imgs if any set
|
||||
if (res_imgs.size > 0 && res_imgs.size < 100) {
|
||||
for (size_t i = 0; i < res_imgs.size; i++) {
|
||||
clip_image_f32_free(&(res_imgs.data[i]));
|
||||
}
|
||||
delete[] res_imgs.data;
|
||||
if (res_imgs.size > 0) {
|
||||
clip_image_f32_batch_free(res_imgs);
|
||||
}
|
||||
res_imgs.data = nullptr;
|
||||
res_imgs.size = 0;
|
||||
|
@ -1650,7 +1659,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
|||
|
||||
res_imgs.size = 1;
|
||||
res_imgs.data = new clip_image_f32[res_imgs.size];
|
||||
res_imgs.data[0] = std::move(*res);
|
||||
res_imgs.data[0] = *res;
|
||||
clip_image_f32_free(res);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -60,6 +60,8 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init();
|
|||
|
||||
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
|
||||
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
|
||||
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch & batch);
|
||||
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch);
|
||||
|
||||
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
|
||||
|
||||
|
|
|
@ -100,7 +100,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
|||
int num_patches_width = grid_shape.first; // grid 1-4
|
||||
int num_patches_height = grid_shape.second; // grid 1-4
|
||||
|
||||
const size_t num_images = num_patches_width + num_patches_height + 1;
|
||||
const size_t num_images = num_patches_width * num_patches_height + 1;
|
||||
|
||||
// TODO: size calculation is not calculated - it's only tens of MB
|
||||
size_t ctx_size = 0;
|
||||
|
|
|
@ -975,7 +975,12 @@ struct llama_server_context
|
|||
{
|
||||
LOG_TEE("Error processing the given image");
|
||||
clip_free(clp_ctx);
|
||||
clip_image_f32_free(img_res_v.data);
|
||||
clip_image_f32_batch_free(img_res_v);
|
||||
return false;
|
||||
}
|
||||
if (img_res_v.size == 0)
|
||||
{
|
||||
LOG_TEE("Error processing the given image");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -987,6 +992,7 @@ struct llama_server_context
|
|||
if (!img.image_embedding)
|
||||
{
|
||||
LOG_TEE("Unable to allocate memory for image embeddings\n");
|
||||
clip_image_f32_batch_free(img_res_v);
|
||||
clip_free(clp_ctx);
|
||||
return false;
|
||||
}
|
||||
|
@ -994,10 +1000,11 @@ struct llama_server_context
|
|||
if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
|
||||
{
|
||||
LOG_TEE("Unable to encode image\n");
|
||||
clip_image_f32_batch_free(img_res_v);
|
||||
return false;
|
||||
}
|
||||
|
||||
clip_image_f32_free(img_res_v.data);
|
||||
clip_image_f32_batch_free(img_res_v);
|
||||
|
||||
img.request_encode_image = false;
|
||||
}
|
||||
|
|
|
@ -7790,6 +7790,7 @@ GGML_CALL void ggml_init_cublas() {
|
|||
if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
|
||||
initialized = true;
|
||||
g_cublas_loaded = false;
|
||||
fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -707,9 +707,21 @@ static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
|
|||
q.cmd_buffer_idx = 0;
|
||||
}
|
||||
|
||||
static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
|
||||
static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
|
||||
for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
|
||||
vk::MemoryType memory_type = mem_props->memoryTypes[i];
|
||||
if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
|
||||
(flags & memory_type.propertyFlags) == flags &&
|
||||
mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
|
||||
return static_cast<int32_t>(i);
|
||||
}
|
||||
}
|
||||
return UINT32_MAX;
|
||||
}
|
||||
|
||||
static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
|
||||
#ifdef GGML_VULKAN_DEBUG
|
||||
std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl;
|
||||
std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
|
||||
#endif
|
||||
vk_buffer buf = std::make_shared<vk_buffer_struct>();
|
||||
|
||||
|
@ -736,15 +748,15 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
|
|||
|
||||
uint32_t memory_type_index = UINT32_MAX;
|
||||
|
||||
for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) {
|
||||
vk::MemoryType memory_type = mem_props.memoryTypes[i];
|
||||
if ((mem_req.memoryTypeBits & ((uint64_t)1 << i)) && (req_flags & memory_type.propertyFlags) == req_flags && mem_props.memoryHeaps[memory_type.heapIndex].size >= mem_req.size) {
|
||||
memory_type_index = i;
|
||||
break;
|
||||
}
|
||||
memory_type_index = find_properties(&mem_props, &mem_req, req_flags);
|
||||
buf->memory_property_flags = req_flags;
|
||||
|
||||
if (memory_type_index == UINT32_MAX && fallback_flags) {
|
||||
memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
|
||||
buf->memory_property_flags = fallback_flags;
|
||||
}
|
||||
|
||||
if (memory_type_index >= mem_props.memoryTypeCount) {
|
||||
if (memory_type_index == UINT32_MAX) {
|
||||
ctx->device.lock()->device.destroyBuffer(buf->buffer);
|
||||
buf->size = 0;
|
||||
throw vk::OutOfDeviceMemoryError("No suitable memory type found");
|
||||
|
@ -758,10 +770,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
|
|||
buf->size = 0;
|
||||
throw e;
|
||||
}
|
||||
buf->memory_property_flags = req_flags;
|
||||
buf->ptr = nullptr;
|
||||
|
||||
if (req_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
||||
if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
||||
buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
|
||||
}
|
||||
|
||||
|
@ -778,9 +789,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
|
|||
return buf;
|
||||
}
|
||||
|
||||
static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
|
||||
static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
|
||||
try {
|
||||
return ggml_vk_create_buffer(ctx, size, req_flags);
|
||||
return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags);
|
||||
} catch (const vk::SystemError& e) {
|
||||
std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
|
||||
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
||||
|
@ -791,17 +802,17 @@ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size
|
|||
static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
|
||||
vk_buffer buf;
|
||||
try {
|
||||
buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
} catch (const vk::SystemError& e) {
|
||||
if (ctx->device.lock()->uma) {
|
||||
// Fall back to host memory type
|
||||
buf = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
} else {
|
||||
buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
}
|
||||
} catch (const vk::SystemError& e) {
|
||||
std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
|
||||
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
@ -1422,7 +1433,9 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
|
|||
#ifdef GGML_VULKAN_DEBUG
|
||||
std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
|
||||
#endif
|
||||
vk_buffer buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
vk_buffer buf = ggml_vk_create_buffer(ctx, size,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
|
||||
if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
|
||||
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
|
||||
|
@ -1568,7 +1581,9 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
|
|||
static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
|
||||
if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
|
||||
ggml_vk_destroy_buffer(ctx->sync_staging);
|
||||
ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -4082,7 +4097,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
|||
std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
|
||||
#endif
|
||||
#if defined(GGML_VULKAN_RUN_TESTS)
|
||||
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
ggml_vk_test_transfer(ctx, 8192 * 1000, false);
|
||||
ggml_vk_test_transfer(ctx, 8192 * 1000, true);
|
||||
|
||||
|
@ -4174,7 +4191,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
|||
if (ctx->staging != nullptr) {
|
||||
ggml_vk_destroy_buffer(ctx->staging);
|
||||
}
|
||||
ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -40,7 +40,7 @@ class Keys:
|
|||
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
||||
EXPERT_COUNT = "{arch}.expert_count"
|
||||
EXPERT_USED_COUNT = "{arch}.expert_used_count"
|
||||
POOLING_LAYER = "{arch}.pooling_layer"
|
||||
POOLING_TYPE = "{arch}.pooling_type"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "{arch}.attention.head_count"
|
||||
|
@ -73,6 +73,8 @@ class Keys:
|
|||
UNK_ID = "tokenizer.ggml.unknown_token_id"
|
||||
SEP_ID = "tokenizer.ggml.seperator_token_id"
|
||||
PAD_ID = "tokenizer.ggml.padding_token_id"
|
||||
CLS_ID = "tokenizer.ggml.cls_token_id"
|
||||
MASK_ID = "tokenizer.ggml.mask_token_id"
|
||||
ADD_BOS = "tokenizer.ggml.add_bos_token"
|
||||
ADD_EOS = "tokenizer.ggml.add_eos_token"
|
||||
ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
|
||||
|
@ -559,6 +561,12 @@ class RopeScalingType(Enum):
|
|||
YARN = 'yarn'
|
||||
|
||||
|
||||
class PoolingType(IntEnum):
|
||||
NONE = 0
|
||||
MEAN = 1
|
||||
CLS = 2
|
||||
|
||||
|
||||
class GGMLQuantizationType(IntEnum):
|
||||
F32 = 0
|
||||
F16 = 1
|
||||
|
@ -685,5 +693,7 @@ KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
|
|||
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
|
||||
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
|
||||
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
|
||||
KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
|
||||
KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
|
||||
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
|
||||
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
|
||||
|
|
|
@ -19,6 +19,7 @@ from .constants import (
|
|||
GGUFValueType,
|
||||
Keys,
|
||||
RopeScalingType,
|
||||
PoolingType,
|
||||
TokenType,
|
||||
)
|
||||
|
||||
|
@ -360,8 +361,8 @@ class GGUFWriter:
|
|||
def add_causal_attention(self, value: bool) -> None:
|
||||
self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
|
||||
|
||||
def add_pooling_layer(self, value: bool) -> None:
|
||||
self.add_bool(Keys.LLM.POOLING_LAYER.format(arch=self.arch), value)
|
||||
def add_pooling_type(self, value: PoolingType) -> None:
|
||||
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value)
|
||||
|
||||
def add_rope_dimension_count(self, count: int) -> None:
|
||||
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
|
||||
|
@ -414,6 +415,12 @@ class GGUFWriter:
|
|||
def add_pad_token_id(self, id: int) -> None:
|
||||
self.add_uint32(Keys.Tokenizer.PAD_ID, id)
|
||||
|
||||
def add_cls_token_id(self, id: int) -> None:
|
||||
self.add_uint32(Keys.Tokenizer.CLS_ID, id)
|
||||
|
||||
def add_mask_token_id(self, id: int) -> None:
|
||||
self.add_uint32(Keys.Tokenizer.MASK_ID, id)
|
||||
|
||||
def add_add_bos_token(self, value: bool) -> None:
|
||||
self.add_bool(Keys.Tokenizer.ADD_BOS, value)
|
||||
|
||||
|
|
|
@ -29,7 +29,7 @@ class SpecialVocab:
|
|||
if special_token_types is not None:
|
||||
self.special_token_types = special_token_types
|
||||
else:
|
||||
self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad')
|
||||
self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad', 'cls', 'mask')
|
||||
self._load(Path(path))
|
||||
|
||||
def __repr__(self) -> str:
|
||||
|
@ -152,10 +152,6 @@ class SpecialVocab:
|
|||
add_entry = tokenizer_config.get(f'add_{typ}_token')
|
||||
if isinstance(add_entry, bool):
|
||||
self.add_special_token[typ] = add_entry
|
||||
if not added_tokens:
|
||||
# We will need this to get the content for the token, so if it's empty
|
||||
# may as well just give up.
|
||||
continue
|
||||
entry = tokenizer_config.get(f'{typ}_token')
|
||||
if isinstance(entry, str):
|
||||
tc_content = entry
|
||||
|
|
82
llama.cpp
82
llama.cpp
|
@ -256,7 +256,7 @@ enum llm_kv {
|
|||
LLM_KV_TENSOR_DATA_LAYOUT,
|
||||
LLM_KV_EXPERT_COUNT,
|
||||
LLM_KV_EXPERT_USED_COUNT,
|
||||
LLM_KV_POOLING_LAYER,
|
||||
LLM_KV_POOLING_TYPE,
|
||||
|
||||
LLM_KV_ATTENTION_HEAD_COUNT,
|
||||
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
||||
|
@ -314,7 +314,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|||
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
|
||||
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
|
||||
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
|
||||
{ LLM_KV_POOLING_LAYER, "%s.pooling_layer" },
|
||||
{ LLM_KV_POOLING_TYPE , "%s.pooling_type" },
|
||||
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
||||
|
@ -1561,8 +1561,8 @@ struct llama_hparams {
|
|||
float f_max_alibi_bias = 0.0f;
|
||||
|
||||
bool causal_attn = true;
|
||||
bool pooling_layer = false;
|
||||
|
||||
uint32_t pooling_type = LLAMA_POOLING_NONE;
|
||||
|
||||
bool operator!=(const llama_hparams & other) const {
|
||||
if (this->vocab_only != other.vocab_only) return true;
|
||||
|
@ -1925,7 +1925,8 @@ struct llama_context {
|
|||
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
|
||||
struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
|
||||
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
|
||||
struct ggml_tensor * inp_sum; // F32 [n_batch, n_batch]
|
||||
struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
|
||||
struct ggml_tensor * inp_cls; // I32 [n_batch]
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
ggml_mpi_context * ctx_mpi = NULL;
|
||||
|
@ -3095,7 +3096,7 @@ static void llm_load_hparams(
|
|||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
||||
ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 3:
|
||||
|
@ -3116,7 +3117,7 @@ static void llm_load_hparams(
|
|||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
||||
ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
||||
|
||||
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
||||
model.type = e_model::MODEL_137M;
|
||||
|
@ -4943,7 +4944,7 @@ struct llm_build_context {
|
|||
const int32_t n_orig_ctx;
|
||||
|
||||
const bool do_rope_shift;
|
||||
const bool do_pooling;
|
||||
const uint32_t pooling_type;
|
||||
|
||||
const llm_build_cb & cb;
|
||||
|
||||
|
@ -4987,7 +4988,7 @@ struct llm_build_context {
|
|||
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
||||
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
||||
do_rope_shift (worst_case || kv_self.has_shift),
|
||||
do_pooling (hparams.pooling_layer && cparams.do_pooling),
|
||||
pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
|
||||
cb (cb),
|
||||
buf_compute_meta (lctx.buf_compute_meta) {
|
||||
// all initializations should be done in init()
|
||||
|
@ -5849,7 +5850,8 @@ struct llm_build_context {
|
|||
// get input vectors with right size
|
||||
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0);
|
||||
struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
|
||||
struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
|
||||
|
||||
// construct input embeddings (token, type, position)
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
|
@ -5966,8 +5968,12 @@ struct llm_build_context {
|
|||
cur = inpL;
|
||||
|
||||
// pooling layer
|
||||
if (do_pooling) {
|
||||
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
|
||||
if (pooling_type == LLAMA_POOLING_MEAN) {
|
||||
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
|
||||
} else if (pooling_type == LLAMA_POOLING_CLS) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_cls);
|
||||
} else {
|
||||
GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
|
||||
}
|
||||
cb(cur, "result_embd", -1);
|
||||
|
||||
|
@ -7535,15 +7541,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|||
}
|
||||
}
|
||||
|
||||
{
|
||||
assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
|
||||
float * data = (float *) lctx.inp_sum->data;
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||
data[i] = 1.0f/float(batch.n_tokens);
|
||||
}
|
||||
}
|
||||
|
||||
if (kv_self.has_shift) {
|
||||
const int64_t n_ctx = cparams.n_ctx;
|
||||
|
||||
|
@ -7556,17 +7553,46 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|||
}
|
||||
}
|
||||
|
||||
if (hparams.pooling_layer && cparams.do_pooling) {
|
||||
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
|
||||
const int64_t n_tokens = batch.n_tokens;
|
||||
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
|
||||
float * data = (float *) lctx.inp_sum->data;
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
|
||||
float * data = (float *) lctx.inp_mean->data;
|
||||
|
||||
memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
|
||||
memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
|
||||
|
||||
std::vector<uint64_t> sum(n_tokens, 0);
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
const llama_seq_id seq_id = batch.seq_id[i][0];
|
||||
sum[seq_id] += 1;
|
||||
}
|
||||
|
||||
std::vector<float> div(n_tokens, 0.0f);
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
const uint64_t s = sum[i];
|
||||
if (s > 0) {
|
||||
div[i] = 1.0f/float(s);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
const llama_seq_id seq_id = batch.seq_id[i][0];
|
||||
data[seq_id*n_tokens + i] = 1.0f;
|
||||
data[seq_id*n_tokens + i] = div[seq_id];
|
||||
}
|
||||
}
|
||||
|
||||
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
|
||||
const int64_t n_tokens = batch.n_tokens;
|
||||
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
|
||||
uint32_t * data = (uint32_t *) lctx.inp_cls->data;
|
||||
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
const llama_seq_id seq_id = batch.seq_id[i][0];
|
||||
const llama_pos pos = batch.pos[i];
|
||||
if (pos == 0) {
|
||||
data[seq_id] = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -11452,7 +11478,8 @@ struct llama_context * llama_new_context_with_model(
|
|||
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
|
||||
ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
|
||||
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
|
||||
ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
|
||||
ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
|
||||
ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
|
||||
|
||||
ggml_set_name(ctx->inp_tokens, "inp_tokens");
|
||||
ggml_set_name(ctx->inp_embd, "inp_embd");
|
||||
|
@ -11460,7 +11487,8 @@ struct llama_context * llama_new_context_with_model(
|
|||
ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
|
||||
ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos");
|
||||
ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
|
||||
ggml_set_name(ctx->inp_sum, "inp_sum");
|
||||
ggml_set_name(ctx->inp_mean, "inp_mean");
|
||||
ggml_set_name(ctx->inp_cls, "inp_cls");
|
||||
|
||||
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
|
||||
|
||||
|
|
6
llama.h
6
llama.h
|
@ -112,6 +112,12 @@ extern "C" {
|
|||
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
|
||||
};
|
||||
|
||||
enum llama_pooling_type {
|
||||
LLAMA_POOLING_NONE = 0,
|
||||
LLAMA_POOLING_MEAN = 1,
|
||||
LLAMA_POOLING_CLS = 2,
|
||||
};
|
||||
|
||||
enum llama_split_mode {
|
||||
LLAMA_SPLIT_NONE = 0, // single GPU
|
||||
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
||||
|
|
107
scripts/hf.sh
Executable file
107
scripts/hf.sh
Executable file
|
@ -0,0 +1,107 @@
|
|||
#!/bin/bash
|
||||
#
|
||||
# Shortcut for downloading HF models
|
||||
#
|
||||
# Usage:
|
||||
# ./main -m $(./examples/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
|
||||
# ./main -m $(./examples/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
|
||||
# ./main -m $(./examples/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
|
||||
#
|
||||
|
||||
# all logs go to stderr
|
||||
function log {
|
||||
echo "$@" 1>&2
|
||||
}
|
||||
|
||||
function usage {
|
||||
log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [-h|--help]"
|
||||
exit 1
|
||||
}
|
||||
|
||||
# check for curl or wget
|
||||
function has_cmd {
|
||||
if ! [ -x "$(command -v $1)" ]; then
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
if has_cmd wget; then
|
||||
cmd="wget -q --show-progress -c -O %s %s"
|
||||
elif has_cmd curl; then
|
||||
cmd="curl -C - -f -o %s -L %s"
|
||||
else
|
||||
log "[E] curl or wget not found"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
url=""
|
||||
repo=""
|
||||
file=""
|
||||
|
||||
# parse args
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--url)
|
||||
url="$2"
|
||||
shift 2
|
||||
;;
|
||||
--repo)
|
||||
repo="$2"
|
||||
shift 2
|
||||
;;
|
||||
--file)
|
||||
file="$2"
|
||||
shift 2
|
||||
;;
|
||||
-h|--help)
|
||||
usage
|
||||
;;
|
||||
*)
|
||||
url="$1"
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [ -n "$repo" ] && [ -n "$file" ]; then
|
||||
url="https://huggingface.co/$repo/resolve/main/$file"
|
||||
fi
|
||||
|
||||
if [ -z "$url" ]; then
|
||||
log "[E] missing --url"
|
||||
usage
|
||||
fi
|
||||
|
||||
# check if the URL is a HuggingFace model, and if so, try to download it
|
||||
is_url=false
|
||||
|
||||
if [[ ${#url} -gt 22 ]]; then
|
||||
if [[ ${url:0:22} == "https://huggingface.co" ]]; then
|
||||
is_url=true
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ "$is_url" = false ]; then
|
||||
log "[E] invalid URL, must start with https://huggingface.co"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# replace "blob/main" with "resolve/main"
|
||||
url=${url/blob\/main/resolve\/main}
|
||||
|
||||
basename=$(basename $url)
|
||||
|
||||
log "[+] attempting to download $basename"
|
||||
|
||||
if [ -n "$cmd" ]; then
|
||||
cmd=$(printf "$cmd" "$basename" "$url")
|
||||
log "[+] $cmd"
|
||||
if $cmd; then
|
||||
echo $basename
|
||||
exit 0
|
||||
fi
|
||||
fi
|
||||
|
||||
log "[-] failed to download"
|
||||
|
||||
exit 1
|
Loading…
Add table
Add a link
Reference in a new issue