Merge branch 'master' into gg/refactor-alibi
This commit is contained in:
commit
ac91033ccb
16 changed files with 345 additions and 88 deletions
|
@ -958,7 +958,7 @@ We have three Docker images available for this project:
|
||||||
|
|
||||||
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
|
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
|
||||||
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||||
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executabhle file. (platforms: `linux/amd64`, `linux/arm64`)
|
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||||
|
|
||||||
Additionally, there are the following images, similar to the above:
|
Additionally, there are the following images, similar to the above:
|
||||||
|
|
||||||
|
|
|
@ -580,6 +580,10 @@ function gg_run_embd_bge_small {
|
||||||
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
|
||||||
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
|
||||||
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/modules.json
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
|
||||||
|
|
||||||
|
gg_wget models-mnt/bge-small/1_Pooling https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json
|
||||||
|
|
||||||
path_models="../models-mnt/bge-small"
|
path_models="../models-mnt/bge-small"
|
||||||
|
|
||||||
|
|
|
@ -1650,7 +1650,29 @@ class BertModel(Model):
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
self.gguf_writer.add_causal_attention(False)
|
self.gguf_writer.add_causal_attention(False)
|
||||||
self.gguf_writer.add_pooling_layer(True)
|
|
||||||
|
# get pooling path
|
||||||
|
with open(self.dir_model / "modules.json", encoding="utf-8") as f:
|
||||||
|
modules = json.load(f)
|
||||||
|
pooling_path = None
|
||||||
|
for mod in modules:
|
||||||
|
if mod["type"] == "sentence_transformers.models.Pooling":
|
||||||
|
pooling_path = mod["path"]
|
||||||
|
break
|
||||||
|
|
||||||
|
# get pooling type
|
||||||
|
pooling_type = gguf.PoolingType.NONE
|
||||||
|
if pooling_path is not None:
|
||||||
|
with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
|
||||||
|
pooling = json.load(f)
|
||||||
|
if pooling["pooling_mode_mean_tokens"]:
|
||||||
|
pooling_type = gguf.PoolingType.MEAN
|
||||||
|
elif pooling["pooling_mode_cls_token"]:
|
||||||
|
pooling_type = gguf.PoolingType.CLS
|
||||||
|
else:
|
||||||
|
raise NotImplementedError("Only MEAN and CLS pooling types supported")
|
||||||
|
|
||||||
|
self.gguf_writer.add_pooling_type(pooling_type.value)
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
path = self.dir_model
|
path = self.dir_model
|
||||||
|
|
|
@ -1,10 +1,12 @@
|
||||||
# LLaVA
|
# LLaVA
|
||||||
|
|
||||||
Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants.
|
Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants,
|
||||||
|
as well as [llava-v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) variants.
|
||||||
|
|
||||||
The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
|
The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
|
||||||
and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
|
and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
|
||||||
models are available.
|
models are available.
|
||||||
|
For llava-1.6, a variety of prepared gguf models are available as well: [7b-34b](https://huggingface.co/cmp-nct/llava-1.6-gguf)
|
||||||
|
|
||||||
After API is confirmed, more models will be supported / uploaded.
|
After API is confirmed, more models will be supported / uploaded.
|
||||||
|
|
||||||
|
@ -18,6 +20,7 @@ After building, run: `./llava-cli` to see the usage. For example:
|
||||||
```
|
```
|
||||||
|
|
||||||
**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
|
**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
|
||||||
|
**note**: For GPU offloading, make sure to use the `-ngl` flag as usual.
|
||||||
|
|
||||||
## LLaVA 1.5
|
## LLaVA 1.5
|
||||||
|
|
||||||
|
@ -55,11 +58,46 @@ python ./convert.py ../llava-v1.5-7b
|
||||||
|
|
||||||
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
|
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
|
||||||
|
|
||||||
## LLaVA 1.6
|
## LLaVA 1.6 gguf conversion
|
||||||
|
|
||||||
|
1) Backup your pth/safetensor model files as llava-surgery modifies them
|
||||||
|
2) Use `python llava-surgery-v2.py -C -m /path/to/hf-model`, which also supports llava-1.5 variants, for both pytorch and safetensor models:
|
||||||
|
- you will find a llava.projector and a llava.clip file in your model directory
|
||||||
|
3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory (https://huggingface.co/cmp-nct/llava-1.6-gguf/blob/main/config.json)
|
||||||
|
4) Create the visual gguf model: `python ./examples/llava/convert-image-encoder-to-gguf.py -m ../path/to/vit --llava-projector ../path/to/llava.projector --output-dir ../path/to/output --clip_model_is_vision`
|
||||||
|
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
|
||||||
|
5) Everything else as usual: convert.py the hf model, quantize as needed
|
||||||
|
**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
|
||||||
|
**note** llava-1.6 greatly benefits from batched prompt processing (defaults work)
|
||||||
|
|
||||||
|
## llava-cli templating and llava-1.6 prompting
|
||||||
|
|
||||||
|
llava-1.5 models all use the same vicuna prompt, here you can just add your image question like `-p "Provide a full description."`
|
||||||
|
For llava-1.6 models which are not vicuna (mistral and Yi), you need to adapt the system prompt as well as the user prompt; for this purpose llava-cli has a basic templating system:
|
||||||
|
|
||||||
|
**For Mistral and using llava-cli binary:**
|
||||||
|
Add this: `-p "<image>\nUSER:\nProvide a full description.\nASSISTANT:\n"`
|
||||||
|
The mistral template for llava-1.6 seems to use no system prompt and a USER/ASSISTANT role
|
||||||
|
|
||||||
|
**For the 34B this should work:**
|
||||||
|
Add this: `-e -p "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nProvide a full description.<|im_end|><|im_start|>assistant\n"`
|
||||||
|
|
||||||
|
|
||||||
|
## How to know if you are running in llava-1.5 or llava-1.6 mode
|
||||||
|
|
||||||
|
When running llava-cli, you will see visual information printed right before the prompt is processed:
|
||||||
|
|
||||||
|
**Llava-1.5:**
|
||||||
|
`encode_image_with_clip: image embedding created: 576 tokens`
|
||||||
|
|
||||||
|
**Llava-1.6 (anything above 576):**
|
||||||
|
`encode_image_with_clip: image embedding created: 2880 tokens`
|
||||||
|
|
||||||
|
|
||||||
|
Alternatively, just pay attention to how many "tokens" have been used for your prompt; it will also show 1000+ tokens for llava-1.6
|
||||||
|
|
||||||
|
|
||||||
- Use `llava-surgery-v2.py`
|
|
||||||
|
|
||||||
- TODO: add detailed instructions
|
|
||||||
|
|
||||||
## TODO
|
## TODO
|
||||||
|
|
||||||
|
|
|
@ -1103,7 +1103,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
printf("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
printf("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
||||||
printf("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
printf("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
||||||
printf("v_image_grid_pinpoints: ");
|
printf("v_image_grid_pinpoints: ");
|
||||||
for (int i = 0; i < 32 & hparams.image_grid_pinpoints[i]!=0; ++i) {
|
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
|
||||||
printf("%d ", hparams.image_grid_pinpoints[i]);
|
printf("%d ", hparams.image_grid_pinpoints[i]);
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
@ -1232,6 +1232,18 @@ struct clip_image_f32 * clip_image_f32_init() {
|
||||||
|
|
||||||
void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
|
void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
|
||||||
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
|
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
|
||||||
|
void clip_image_u8_batch_free(struct clip_image_u8_batch & batch) {
|
||||||
|
if (batch.size > 0) {
|
||||||
|
delete[] batch.data;
|
||||||
|
batch.size = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void clip_image_f32_batch_free(struct clip_image_f32_batch & batch) {
|
||||||
|
if (batch.size > 0) {
|
||||||
|
delete[] batch.data;
|
||||||
|
batch.size = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
|
static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
|
||||||
img->nx = nx;
|
img->nx = nx;
|
||||||
|
@ -1494,11 +1506,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||||
pad_to_square = false;
|
pad_to_square = false;
|
||||||
}
|
}
|
||||||
// free the previous res_imgs if any set
|
// free the previous res_imgs if any set
|
||||||
if (res_imgs.size > 0 && res_imgs.size < 100) {
|
if (res_imgs.size > 0) {
|
||||||
for (size_t i = 0; i < res_imgs.size; i++) {
|
clip_image_f32_batch_free(res_imgs);
|
||||||
clip_image_f32_free(&(res_imgs.data[i]));
|
|
||||||
}
|
|
||||||
delete[] res_imgs.data;
|
|
||||||
}
|
}
|
||||||
res_imgs.data = nullptr;
|
res_imgs.data = nullptr;
|
||||||
res_imgs.size = 0;
|
res_imgs.size = 0;
|
||||||
|
@ -1650,7 +1659,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||||
|
|
||||||
res_imgs.size = 1;
|
res_imgs.size = 1;
|
||||||
res_imgs.data = new clip_image_f32[res_imgs.size];
|
res_imgs.data = new clip_image_f32[res_imgs.size];
|
||||||
res_imgs.data[0] = std::move(*res);
|
res_imgs.data[0] = *res;
|
||||||
|
clip_image_f32_free(res);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -60,6 +60,8 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init();
|
||||||
|
|
||||||
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
|
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
|
||||||
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
|
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
|
||||||
|
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch & batch);
|
||||||
|
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch);
|
||||||
|
|
||||||
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
|
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
|
||||||
|
|
||||||
|
|
|
@ -100,7 +100,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||||
int num_patches_width = grid_shape.first; // grid 1-4
|
int num_patches_width = grid_shape.first; // grid 1-4
|
||||||
int num_patches_height = grid_shape.second; // grid 1-4
|
int num_patches_height = grid_shape.second; // grid 1-4
|
||||||
|
|
||||||
const size_t num_images = num_patches_width + num_patches_height + 1;
|
const size_t num_images = num_patches_width * num_patches_height + 1;
|
||||||
|
|
||||||
// TODO: size calculation is not calculated - it's only tens of MB
|
// TODO: size calculation is not calculated - it's only tens of MB
|
||||||
size_t ctx_size = 0;
|
size_t ctx_size = 0;
|
||||||
|
|
|
@ -975,7 +975,12 @@ struct llama_server_context
|
||||||
{
|
{
|
||||||
LOG_TEE("Error processing the given image");
|
LOG_TEE("Error processing the given image");
|
||||||
clip_free(clp_ctx);
|
clip_free(clp_ctx);
|
||||||
clip_image_f32_free(img_res_v.data);
|
clip_image_f32_batch_free(img_res_v);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (img_res_v.size == 0)
|
||||||
|
{
|
||||||
|
LOG_TEE("Error processing the given image");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -987,6 +992,7 @@ struct llama_server_context
|
||||||
if (!img.image_embedding)
|
if (!img.image_embedding)
|
||||||
{
|
{
|
||||||
LOG_TEE("Unable to allocate memory for image embeddings\n");
|
LOG_TEE("Unable to allocate memory for image embeddings\n");
|
||||||
|
clip_image_f32_batch_free(img_res_v);
|
||||||
clip_free(clp_ctx);
|
clip_free(clp_ctx);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -994,10 +1000,11 @@ struct llama_server_context
|
||||||
if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
|
if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
|
||||||
{
|
{
|
||||||
LOG_TEE("Unable to encode image\n");
|
LOG_TEE("Unable to encode image\n");
|
||||||
|
clip_image_f32_batch_free(img_res_v);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
clip_image_f32_free(img_res_v.data);
|
clip_image_f32_batch_free(img_res_v);
|
||||||
|
|
||||||
img.request_encode_image = false;
|
img.request_encode_image = false;
|
||||||
}
|
}
|
||||||
|
|
|
@ -7790,6 +7790,7 @@ GGML_CALL void ggml_init_cublas() {
|
||||||
if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
|
if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
|
||||||
initialized = true;
|
initialized = true;
|
||||||
g_cublas_loaded = false;
|
g_cublas_loaded = false;
|
||||||
|
fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -707,9 +707,21 @@ static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
|
||||||
q.cmd_buffer_idx = 0;
|
q.cmd_buffer_idx = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
|
static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
|
||||||
|
for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
|
||||||
|
vk::MemoryType memory_type = mem_props->memoryTypes[i];
|
||||||
|
if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
|
||||||
|
(flags & memory_type.propertyFlags) == flags &&
|
||||||
|
mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
|
||||||
|
return static_cast<int32_t>(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return UINT32_MAX;
|
||||||
|
}
|
||||||
|
|
||||||
|
static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl;
|
std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
vk_buffer buf = std::make_shared<vk_buffer_struct>();
|
vk_buffer buf = std::make_shared<vk_buffer_struct>();
|
||||||
|
|
||||||
|
@ -736,15 +748,15 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
|
||||||
|
|
||||||
uint32_t memory_type_index = UINT32_MAX;
|
uint32_t memory_type_index = UINT32_MAX;
|
||||||
|
|
||||||
for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) {
|
memory_type_index = find_properties(&mem_props, &mem_req, req_flags);
|
||||||
vk::MemoryType memory_type = mem_props.memoryTypes[i];
|
buf->memory_property_flags = req_flags;
|
||||||
if ((mem_req.memoryTypeBits & ((uint64_t)1 << i)) && (req_flags & memory_type.propertyFlags) == req_flags && mem_props.memoryHeaps[memory_type.heapIndex].size >= mem_req.size) {
|
|
||||||
memory_type_index = i;
|
if (memory_type_index == UINT32_MAX && fallback_flags) {
|
||||||
break;
|
memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
|
||||||
}
|
buf->memory_property_flags = fallback_flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (memory_type_index >= mem_props.memoryTypeCount) {
|
if (memory_type_index == UINT32_MAX) {
|
||||||
ctx->device.lock()->device.destroyBuffer(buf->buffer);
|
ctx->device.lock()->device.destroyBuffer(buf->buffer);
|
||||||
buf->size = 0;
|
buf->size = 0;
|
||||||
throw vk::OutOfDeviceMemoryError("No suitable memory type found");
|
throw vk::OutOfDeviceMemoryError("No suitable memory type found");
|
||||||
|
@ -758,10 +770,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
|
||||||
buf->size = 0;
|
buf->size = 0;
|
||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
buf->memory_property_flags = req_flags;
|
|
||||||
buf->ptr = nullptr;
|
buf->ptr = nullptr;
|
||||||
|
|
||||||
if (req_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
||||||
buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
|
buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -778,9 +789,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
|
||||||
return buf;
|
return buf;
|
||||||
}
|
}
|
||||||
|
|
||||||
static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
|
static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
|
||||||
try {
|
try {
|
||||||
return ggml_vk_create_buffer(ctx, size, req_flags);
|
return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags);
|
||||||
} catch (const vk::SystemError& e) {
|
} catch (const vk::SystemError& e) {
|
||||||
std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
|
std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
|
||||||
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
||||||
|
@ -791,17 +802,17 @@ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size
|
||||||
static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
|
static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
|
||||||
vk_buffer buf;
|
vk_buffer buf;
|
||||||
try {
|
try {
|
||||||
buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
|
||||||
} catch (const vk::SystemError& e) {
|
|
||||||
if (ctx->device.lock()->uma) {
|
if (ctx->device.lock()->uma) {
|
||||||
// Fall back to host memory type
|
// Fall back to host memory type
|
||||||
buf = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||||
} else {
|
} else {
|
||||||
|
buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||||
|
}
|
||||||
|
} catch (const vk::SystemError& e) {
|
||||||
std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
|
std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
|
||||||
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return buf;
|
return buf;
|
||||||
}
|
}
|
||||||
|
@ -1422,7 +1433,9 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
|
std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
vk_buffer buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
vk_buffer buf = ggml_vk_create_buffer(ctx, size,
|
||||||
|
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
||||||
|
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||||
|
|
||||||
if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
|
if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
|
||||||
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
|
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
|
||||||
|
@ -1568,7 +1581,9 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
|
||||||
static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
|
static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
|
||||||
if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
|
if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
|
||||||
ggml_vk_destroy_buffer(ctx->sync_staging);
|
ggml_vk_destroy_buffer(ctx->sync_staging);
|
||||||
ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
|
||||||
|
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
||||||
|
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4082,7 +4097,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
||||||
std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
|
std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
#if defined(GGML_VULKAN_RUN_TESTS)
|
#if defined(GGML_VULKAN_RUN_TESTS)
|
||||||
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
|
||||||
|
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached
|
||||||
|
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||||
ggml_vk_test_transfer(ctx, 8192 * 1000, false);
|
ggml_vk_test_transfer(ctx, 8192 * 1000, false);
|
||||||
ggml_vk_test_transfer(ctx, 8192 * 1000, true);
|
ggml_vk_test_transfer(ctx, 8192 * 1000, true);
|
||||||
|
|
||||||
|
@ -4174,7 +4191,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
||||||
if (ctx->staging != nullptr) {
|
if (ctx->staging != nullptr) {
|
||||||
ggml_vk_destroy_buffer(ctx->staging);
|
ggml_vk_destroy_buffer(ctx->staging);
|
||||||
}
|
}
|
||||||
ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size,
|
||||||
|
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
||||||
|
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -40,7 +40,7 @@ class Keys:
|
||||||
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
||||||
EXPERT_COUNT = "{arch}.expert_count"
|
EXPERT_COUNT = "{arch}.expert_count"
|
||||||
EXPERT_USED_COUNT = "{arch}.expert_used_count"
|
EXPERT_USED_COUNT = "{arch}.expert_used_count"
|
||||||
POOLING_LAYER = "{arch}.pooling_layer"
|
POOLING_TYPE = "{arch}.pooling_type"
|
||||||
|
|
||||||
class Attention:
|
class Attention:
|
||||||
HEAD_COUNT = "{arch}.attention.head_count"
|
HEAD_COUNT = "{arch}.attention.head_count"
|
||||||
|
@ -73,6 +73,8 @@ class Keys:
|
||||||
UNK_ID = "tokenizer.ggml.unknown_token_id"
|
UNK_ID = "tokenizer.ggml.unknown_token_id"
|
||||||
SEP_ID = "tokenizer.ggml.seperator_token_id"
|
SEP_ID = "tokenizer.ggml.seperator_token_id"
|
||||||
PAD_ID = "tokenizer.ggml.padding_token_id"
|
PAD_ID = "tokenizer.ggml.padding_token_id"
|
||||||
|
CLS_ID = "tokenizer.ggml.cls_token_id"
|
||||||
|
MASK_ID = "tokenizer.ggml.mask_token_id"
|
||||||
ADD_BOS = "tokenizer.ggml.add_bos_token"
|
ADD_BOS = "tokenizer.ggml.add_bos_token"
|
||||||
ADD_EOS = "tokenizer.ggml.add_eos_token"
|
ADD_EOS = "tokenizer.ggml.add_eos_token"
|
||||||
ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
|
ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
|
||||||
|
@ -559,6 +561,12 @@ class RopeScalingType(Enum):
|
||||||
YARN = 'yarn'
|
YARN = 'yarn'
|
||||||
|
|
||||||
|
|
||||||
|
class PoolingType(IntEnum):
|
||||||
|
NONE = 0
|
||||||
|
MEAN = 1
|
||||||
|
CLS = 2
|
||||||
|
|
||||||
|
|
||||||
class GGMLQuantizationType(IntEnum):
|
class GGMLQuantizationType(IntEnum):
|
||||||
F32 = 0
|
F32 = 0
|
||||||
F16 = 1
|
F16 = 1
|
||||||
|
@ -685,5 +693,7 @@ KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
|
||||||
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
|
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
|
||||||
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
|
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
|
||||||
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
|
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
|
||||||
|
KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
|
||||||
|
KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
|
||||||
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
|
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
|
||||||
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
|
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
|
||||||
|
|
|
@ -19,6 +19,7 @@ from .constants import (
|
||||||
GGUFValueType,
|
GGUFValueType,
|
||||||
Keys,
|
Keys,
|
||||||
RopeScalingType,
|
RopeScalingType,
|
||||||
|
PoolingType,
|
||||||
TokenType,
|
TokenType,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -360,8 +361,8 @@ class GGUFWriter:
|
||||||
def add_causal_attention(self, value: bool) -> None:
|
def add_causal_attention(self, value: bool) -> None:
|
||||||
self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
|
self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
|
||||||
|
|
||||||
def add_pooling_layer(self, value: bool) -> None:
|
def add_pooling_type(self, value: PoolingType) -> None:
|
||||||
self.add_bool(Keys.LLM.POOLING_LAYER.format(arch=self.arch), value)
|
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value)
|
||||||
|
|
||||||
def add_rope_dimension_count(self, count: int) -> None:
|
def add_rope_dimension_count(self, count: int) -> None:
|
||||||
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
|
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
|
||||||
|
@ -414,6 +415,12 @@ class GGUFWriter:
|
||||||
def add_pad_token_id(self, id: int) -> None:
|
def add_pad_token_id(self, id: int) -> None:
|
||||||
self.add_uint32(Keys.Tokenizer.PAD_ID, id)
|
self.add_uint32(Keys.Tokenizer.PAD_ID, id)
|
||||||
|
|
||||||
|
def add_cls_token_id(self, id: int) -> None:
|
||||||
|
self.add_uint32(Keys.Tokenizer.CLS_ID, id)
|
||||||
|
|
||||||
|
def add_mask_token_id(self, id: int) -> None:
|
||||||
|
self.add_uint32(Keys.Tokenizer.MASK_ID, id)
|
||||||
|
|
||||||
def add_add_bos_token(self, value: bool) -> None:
|
def add_add_bos_token(self, value: bool) -> None:
|
||||||
self.add_bool(Keys.Tokenizer.ADD_BOS, value)
|
self.add_bool(Keys.Tokenizer.ADD_BOS, value)
|
||||||
|
|
||||||
|
|
|
@ -29,7 +29,7 @@ class SpecialVocab:
|
||||||
if special_token_types is not None:
|
if special_token_types is not None:
|
||||||
self.special_token_types = special_token_types
|
self.special_token_types = special_token_types
|
||||||
else:
|
else:
|
||||||
self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad')
|
self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad', 'cls', 'mask')
|
||||||
self._load(Path(path))
|
self._load(Path(path))
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
def __repr__(self) -> str:
|
||||||
|
@ -152,10 +152,6 @@ class SpecialVocab:
|
||||||
add_entry = tokenizer_config.get(f'add_{typ}_token')
|
add_entry = tokenizer_config.get(f'add_{typ}_token')
|
||||||
if isinstance(add_entry, bool):
|
if isinstance(add_entry, bool):
|
||||||
self.add_special_token[typ] = add_entry
|
self.add_special_token[typ] = add_entry
|
||||||
if not added_tokens:
|
|
||||||
# We will need this to get the content for the token, so if it's empty
|
|
||||||
# may as well just give up.
|
|
||||||
continue
|
|
||||||
entry = tokenizer_config.get(f'{typ}_token')
|
entry = tokenizer_config.get(f'{typ}_token')
|
||||||
if isinstance(entry, str):
|
if isinstance(entry, str):
|
||||||
tc_content = entry
|
tc_content = entry
|
||||||
|
|
82
llama.cpp
82
llama.cpp
|
@ -256,7 +256,7 @@ enum llm_kv {
|
||||||
LLM_KV_TENSOR_DATA_LAYOUT,
|
LLM_KV_TENSOR_DATA_LAYOUT,
|
||||||
LLM_KV_EXPERT_COUNT,
|
LLM_KV_EXPERT_COUNT,
|
||||||
LLM_KV_EXPERT_USED_COUNT,
|
LLM_KV_EXPERT_USED_COUNT,
|
||||||
LLM_KV_POOLING_LAYER,
|
LLM_KV_POOLING_TYPE,
|
||||||
|
|
||||||
LLM_KV_ATTENTION_HEAD_COUNT,
|
LLM_KV_ATTENTION_HEAD_COUNT,
|
||||||
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
||||||
|
@ -314,7 +314,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||||
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
|
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
|
||||||
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
|
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
|
||||||
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
|
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
|
||||||
{ LLM_KV_POOLING_LAYER, "%s.pooling_layer" },
|
{ LLM_KV_POOLING_TYPE , "%s.pooling_type" },
|
||||||
|
|
||||||
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
||||||
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
||||||
|
@ -1561,8 +1561,8 @@ struct llama_hparams {
|
||||||
float f_max_alibi_bias = 0.0f;
|
float f_max_alibi_bias = 0.0f;
|
||||||
|
|
||||||
bool causal_attn = true;
|
bool causal_attn = true;
|
||||||
bool pooling_layer = false;
|
|
||||||
|
|
||||||
|
uint32_t pooling_type = LLAMA_POOLING_NONE;
|
||||||
|
|
||||||
bool operator!=(const llama_hparams & other) const {
|
bool operator!=(const llama_hparams & other) const {
|
||||||
if (this->vocab_only != other.vocab_only) return true;
|
if (this->vocab_only != other.vocab_only) return true;
|
||||||
|
@ -1925,7 +1925,8 @@ struct llama_context {
|
||||||
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
|
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
|
||||||
struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
|
struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
|
||||||
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
|
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
|
||||||
struct ggml_tensor * inp_sum; // F32 [n_batch, n_batch]
|
struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
|
||||||
|
struct ggml_tensor * inp_cls; // I32 [n_batch]
|
||||||
|
|
||||||
#ifdef GGML_USE_MPI
|
#ifdef GGML_USE_MPI
|
||||||
ggml_mpi_context * ctx_mpi = NULL;
|
ggml_mpi_context * ctx_mpi = NULL;
|
||||||
|
@ -3095,7 +3096,7 @@ static void llm_load_hparams(
|
||||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||||
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
||||||
ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
|
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
||||||
|
|
||||||
switch (hparams.n_layer) {
|
switch (hparams.n_layer) {
|
||||||
case 3:
|
case 3:
|
||||||
|
@ -3116,7 +3117,7 @@ static void llm_load_hparams(
|
||||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||||
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
||||||
ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
|
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
||||||
|
|
||||||
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
||||||
model.type = e_model::MODEL_137M;
|
model.type = e_model::MODEL_137M;
|
||||||
|
@ -4943,7 +4944,7 @@ struct llm_build_context {
|
||||||
const int32_t n_orig_ctx;
|
const int32_t n_orig_ctx;
|
||||||
|
|
||||||
const bool do_rope_shift;
|
const bool do_rope_shift;
|
||||||
const bool do_pooling;
|
const uint32_t pooling_type;
|
||||||
|
|
||||||
const llm_build_cb & cb;
|
const llm_build_cb & cb;
|
||||||
|
|
||||||
|
@ -4987,7 +4988,7 @@ struct llm_build_context {
|
||||||
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
||||||
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
||||||
do_rope_shift (worst_case || kv_self.has_shift),
|
do_rope_shift (worst_case || kv_self.has_shift),
|
||||||
do_pooling (hparams.pooling_layer && cparams.do_pooling),
|
pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
|
||||||
cb (cb),
|
cb (cb),
|
||||||
buf_compute_meta (lctx.buf_compute_meta) {
|
buf_compute_meta (lctx.buf_compute_meta) {
|
||||||
// all initializations should be done in init()
|
// all initializations should be done in init()
|
||||||
|
@ -5849,7 +5850,8 @@ struct llm_build_context {
|
||||||
// get input vectors with right size
|
// get input vectors with right size
|
||||||
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
|
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
|
||||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||||
struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0);
|
struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
|
||||||
|
struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
|
||||||
|
|
||||||
// construct input embeddings (token, type, position)
|
// construct input embeddings (token, type, position)
|
||||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||||
|
@ -5966,8 +5968,12 @@ struct llm_build_context {
|
||||||
cur = inpL;
|
cur = inpL;
|
||||||
|
|
||||||
// pooling layer
|
// pooling layer
|
||||||
if (do_pooling) {
|
if (pooling_type == LLAMA_POOLING_MEAN) {
|
||||||
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
|
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
|
||||||
|
} else if (pooling_type == LLAMA_POOLING_CLS) {
|
||||||
|
cur = ggml_get_rows(ctx0, cur, inp_cls);
|
||||||
|
} else {
|
||||||
|
GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
|
||||||
}
|
}
|
||||||
cb(cur, "result_embd", -1);
|
cb(cur, "result_embd", -1);
|
||||||
|
|
||||||
|
@ -7535,15 +7541,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
|
||||||
assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
|
|
||||||
float * data = (float *) lctx.inp_sum->data;
|
|
||||||
|
|
||||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
|
||||||
data[i] = 1.0f/float(batch.n_tokens);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (kv_self.has_shift) {
|
if (kv_self.has_shift) {
|
||||||
const int64_t n_ctx = cparams.n_ctx;
|
const int64_t n_ctx = cparams.n_ctx;
|
||||||
|
|
||||||
|
@ -7556,17 +7553,46 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hparams.pooling_layer && cparams.do_pooling) {
|
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
|
||||||
const int64_t n_tokens = batch.n_tokens;
|
const int64_t n_tokens = batch.n_tokens;
|
||||||
|
|
||||||
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
|
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
|
||||||
float * data = (float *) lctx.inp_sum->data;
|
float * data = (float *) lctx.inp_mean->data;
|
||||||
|
|
||||||
memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
|
memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
|
||||||
|
|
||||||
|
std::vector<uint64_t> sum(n_tokens, 0);
|
||||||
|
for (int i = 0; i < n_tokens; ++i) {
|
||||||
|
const llama_seq_id seq_id = batch.seq_id[i][0];
|
||||||
|
sum[seq_id] += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<float> div(n_tokens, 0.0f);
|
||||||
|
for (int i = 0; i < n_tokens; ++i) {
|
||||||
|
const uint64_t s = sum[i];
|
||||||
|
if (s > 0) {
|
||||||
|
div[i] = 1.0f/float(s);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < n_tokens; ++i) {
|
for (int i = 0; i < n_tokens; ++i) {
|
||||||
const llama_seq_id seq_id = batch.seq_id[i][0];
|
const llama_seq_id seq_id = batch.seq_id[i][0];
|
||||||
data[seq_id*n_tokens + i] = 1.0f;
|
data[seq_id*n_tokens + i] = div[seq_id];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
|
||||||
|
const int64_t n_tokens = batch.n_tokens;
|
||||||
|
|
||||||
|
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
|
||||||
|
uint32_t * data = (uint32_t *) lctx.inp_cls->data;
|
||||||
|
|
||||||
|
for (int i = 0; i < n_tokens; ++i) {
|
||||||
|
const llama_seq_id seq_id = batch.seq_id[i][0];
|
||||||
|
const llama_pos pos = batch.pos[i];
|
||||||
|
if (pos == 0) {
|
||||||
|
data[seq_id] = i;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -11452,7 +11478,8 @@ struct llama_context * llama_new_context_with_model(
|
||||||
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
|
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
|
||||||
ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
|
ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
|
||||||
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
|
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
|
||||||
ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
|
ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
|
||||||
|
ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
|
||||||
|
|
||||||
ggml_set_name(ctx->inp_tokens, "inp_tokens");
|
ggml_set_name(ctx->inp_tokens, "inp_tokens");
|
||||||
ggml_set_name(ctx->inp_embd, "inp_embd");
|
ggml_set_name(ctx->inp_embd, "inp_embd");
|
||||||
|
@ -11460,7 +11487,8 @@ struct llama_context * llama_new_context_with_model(
|
||||||
ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
|
ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
|
||||||
ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos");
|
ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos");
|
||||||
ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
|
ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
|
||||||
ggml_set_name(ctx->inp_sum, "inp_sum");
|
ggml_set_name(ctx->inp_mean, "inp_mean");
|
||||||
|
ggml_set_name(ctx->inp_cls, "inp_cls");
|
||||||
|
|
||||||
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
|
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
|
||||||
|
|
||||||
|
|
6
llama.h
6
llama.h
|
@ -112,6 +112,12 @@ extern "C" {
|
||||||
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
|
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum llama_pooling_type {
|
||||||
|
LLAMA_POOLING_NONE = 0,
|
||||||
|
LLAMA_POOLING_MEAN = 1,
|
||||||
|
LLAMA_POOLING_CLS = 2,
|
||||||
|
};
|
||||||
|
|
||||||
enum llama_split_mode {
|
enum llama_split_mode {
|
||||||
LLAMA_SPLIT_NONE = 0, // single GPU
|
LLAMA_SPLIT_NONE = 0, // single GPU
|
||||||
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
||||||
|
|
107
scripts/hf.sh
Executable file
107
scripts/hf.sh
Executable file
|
@ -0,0 +1,107 @@
|
||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Shortcut for downloading HF models
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# ./main -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
|
||||||
|
# ./main -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
|
||||||
|
# ./main -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
|
||||||
|
#
|
||||||
|
|
||||||
|
# Print all arguments on stderr, leaving stdout untouched.
log() {
    echo "$@" >&2
}
|
||||||
|
|
||||||
|
# Print the command-line synopsis through log and terminate with status 1.
usage() {
    log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [-h|--help]"
    exit 1
}
|
||||||
|
|
||||||
|
# check for curl or wget
|
||||||
|
# Succeed (status 0) when $1 resolves, via `command -v`, to an executable
# file; fail (status 1) otherwise.
#
# $1 is quoted so that a name containing whitespace or glob characters is
# looked up as a single word instead of being split or expanded.
function has_cmd {
    [ -x "$(command -v "$1")" ]
}
|
||||||
|
|
||||||
|
# Choose the download tool: wget is preferred, curl is the fallback.
# $cmd holds a printf template with two %s placeholders.
if has_cmd wget; then
    cmd="wget -q --show-progress -c -O %s %s"
else
    if ! has_cmd curl; then
        log "[E] curl or wget not found"
        exit 1
    fi
    cmd="curl -C - -f -o %s -L %s"
fi
|
||||||
|
|
||||||
|
url=""
repo=""
file=""

# parse args
#
# --url/--repo/--file each consume the following argument as their value;
# any other non-option argument is taken as the URL.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --url|--repo|--file)
            # Without this guard a trailing option with no value makes
            # `shift 2` fail without shifting, and the loop never ends.
            if [[ $# -lt 2 ]]; then
                log "[E] missing value for $1"
                usage
            fi
            case "$1" in
                --url)  url="$2"  ;;
                --repo) repo="$2" ;;
                --file) file="$2" ;;
            esac
            shift 2
            ;;
        -h|--help)
            usage
            ;;
        *)
            url="$1"
            shift
            ;;
    esac
done
|
||||||
|
|
||||||
|
# when both --repo and --file are given, they determine the URL
# (overriding any URL passed explicitly)
if [ -n "$repo" ] && [ -n "$file" ]; then
    url="https://huggingface.co/$repo/resolve/main/$file"
fi

if [ -z "$url" ]; then
    log "[E] missing --url"
    usage
fi

# check if the URL is a HuggingFace model, and if so, try to download it
# (the URL must be longer than the 22-character prefix and start with it)
is_url=false

if [[ ${#url} -gt 22 && ${url:0:22} == "https://huggingface.co" ]]; then
    is_url=true
fi

if [ "$is_url" = false ]; then
    log "[E] invalid URL, must start with https://huggingface.co"
    # report failure like every other error path, so callers checking the
    # exit status do not mistake a rejected URL for a successful download
    exit 1
fi

# replace "blob/main" with "resolve/main"
url=${url/blob\/main/resolve\/main}
|
||||||
|
|
||||||
|
# the local file name is the last path component of the URL;
# "$url" is quoted so it is passed to basename as a single, unexpanded word
basename=$(basename "$url")

log "[+] attempting to download $basename"

if [ -n "$cmd" ]; then
    # fill the two %s placeholders of the template in $cmd:
    # first the output file name, then the URL
    cmd=$(printf "$cmd" "$basename" "$url")
    log "[+] $cmd"
    # split the command line on whitespace only; unlike an unquoted $cmd this
    # does not glob-expand characters such as "?" or "*" that a URL may contain
    read -r -a download_argv <<< "$cmd"
    if "${download_argv[@]}"; then
        # the file name is the only thing written to stdout
        echo "$basename"
        exit 0
    fi
fi

log "[-] failed to download"

exit 1
|
Loading…
Add table
Add a link
Reference in a new issue