Merge branch 'ggerganov:master' into server_branch
This commit is contained in:
commit
ecbb531b1f
18 changed files with 328 additions and 346 deletions
|
@ -255,11 +255,11 @@ effectiveStdenv.mkDerivation (
|
||||||
# Configurations we don't want even the CI to evaluate. Results in the
|
# Configurations we don't want even the CI to evaluate. Results in the
|
||||||
# "unsupported platform" messages. This is mostly a no-op, because
|
# "unsupported platform" messages. This is mostly a no-op, because
|
||||||
# cudaPackages would've refused to evaluate anyway.
|
# cudaPackages would've refused to evaluate anyway.
|
||||||
badPlatforms = optionals (useCuda || useOpenCL || useVulkan) lib.platforms.darwin;
|
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
|
||||||
|
|
||||||
# Configurations that are known to result in build failures. Can be
|
# Configurations that are known to result in build failures. Can be
|
||||||
# overridden by importing Nixpkgs with `allowBroken = true`.
|
# overridden by importing Nixpkgs with `allowBroken = true`.
|
||||||
broken = (useMetalKit && !effectiveStdenv.isDarwin) || (useVulkan && effectiveStdenv.isDarwin);
|
broken = (useMetalKit && !effectiveStdenv.isDarwin);
|
||||||
|
|
||||||
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
|
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
|
||||||
homepage = "https://github.com/ggerganov/llama.cpp/";
|
homepage = "https://github.com/ggerganov/llama.cpp/";
|
||||||
|
|
|
@ -110,6 +110,7 @@ option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests"
|
||||||
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
|
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
|
||||||
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
|
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
|
||||||
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
|
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
|
||||||
|
option(LLAMA_METAL_EMBED_LIBRARY "llama: embed Metal library" OFF)
|
||||||
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
|
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
|
||||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||||
|
@ -201,6 +202,29 @@ if (LLAMA_METAL)
|
||||||
# copy ggml-metal.metal to bin directory
|
# copy ggml-metal.metal to bin directory
|
||||||
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
|
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
|
||||||
|
|
||||||
|
if (LLAMA_METAL_EMBED_LIBRARY)
|
||||||
|
enable_language(ASM)
|
||||||
|
add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
|
||||||
|
|
||||||
|
set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal")
|
||||||
|
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
|
||||||
|
set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")
|
||||||
|
|
||||||
|
add_custom_command(
|
||||||
|
OUTPUT ${EMBED_METALLIB_ASSEMBLY}
|
||||||
|
COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY}
|
||||||
|
COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY}
|
||||||
|
COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY}
|
||||||
|
COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
|
||||||
|
COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY}
|
||||||
|
COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY}
|
||||||
|
DEPENDS ${METALLIB_SOURCE}
|
||||||
|
COMMENT "Generate assembly for embedded Metal library"
|
||||||
|
)
|
||||||
|
|
||||||
|
set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
|
||||||
|
endif()
|
||||||
|
|
||||||
if (LLAMA_METAL_SHADER_DEBUG)
|
if (LLAMA_METAL_SHADER_DEBUG)
|
||||||
# custom command to do the following:
|
# custom command to do the following:
|
||||||
# xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
|
# xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
|
||||||
|
|
22
Makefile
22
Makefile
|
@ -173,7 +173,7 @@ ifdef LLAMA_DEBUG
|
||||||
MK_LDFLAGS += -g
|
MK_LDFLAGS += -g
|
||||||
|
|
||||||
ifeq ($(UNAME_S),Linux)
|
ifeq ($(UNAME_S),Linux)
|
||||||
MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
|
MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
|
||||||
endif
|
endif
|
||||||
else
|
else
|
||||||
MK_CPPFLAGS += -DNDEBUG
|
MK_CPPFLAGS += -DNDEBUG
|
||||||
|
@ -533,11 +533,29 @@ ifdef LLAMA_METAL
|
||||||
ifdef LLAMA_METAL_NDEBUG
|
ifdef LLAMA_METAL_NDEBUG
|
||||||
MK_CPPFLAGS += -DGGML_METAL_NDEBUG
|
MK_CPPFLAGS += -DGGML_METAL_NDEBUG
|
||||||
endif
|
endif
|
||||||
|
ifdef LLAMA_METAL_EMBED_LIBRARY
|
||||||
|
MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
|
||||||
|
OBJS += ggml-metal-embed.o
|
||||||
|
endif
|
||||||
endif # LLAMA_METAL
|
endif # LLAMA_METAL
|
||||||
|
|
||||||
ifdef LLAMA_METAL
|
ifdef LLAMA_METAL
|
||||||
ggml-metal.o: ggml-metal.m ggml-metal.h
|
ggml-metal.o: ggml-metal.m ggml-metal.h
|
||||||
$(CC) $(CFLAGS) -c $< -o $@
|
$(CC) $(CFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
ifdef LLAMA_METAL_EMBED_LIBRARY
|
||||||
|
ggml-metal-embed.o: ggml-metal.metal
|
||||||
|
@echo "Embedding Metal library"
|
||||||
|
$(eval TEMP_ASSEMBLY=$(shell mktemp))
|
||||||
|
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
|
||||||
|
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
|
||||||
|
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
|
||||||
|
@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
|
||||||
|
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
|
||||||
|
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
|
||||||
|
@$(AS) $(TEMP_ASSEMBLY) -o $@
|
||||||
|
@rm -f ${TEMP_ASSEMBLY}
|
||||||
|
endif
|
||||||
endif # LLAMA_METAL
|
endif # LLAMA_METAL
|
||||||
|
|
||||||
ifdef LLAMA_MPI
|
ifdef LLAMA_MPI
|
||||||
|
@ -701,7 +719,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
|
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
|
||||||
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
||||||
|
|
|
@ -156,6 +156,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||||
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
|
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
|
||||||
- [semperai/amica](https://github.com/semperai/amica)
|
- [semperai/amica](https://github.com/semperai/amica)
|
||||||
- [withcatai/catai](https://github.com/withcatai/catai)
|
- [withcatai/catai](https://github.com/withcatai/catai)
|
||||||
|
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
|
@ -123,6 +123,7 @@ pub fn build(b: *std.build.Builder) !void {
|
||||||
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
|
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
|
||||||
const train = make.obj("train", "common/train.cpp");
|
const train = make.obj("train", "common/train.cpp");
|
||||||
const clip = make.obj("clip", "examples/llava/clip.cpp");
|
const clip = make.obj("clip", "examples/llava/clip.cpp");
|
||||||
|
const llava = make.obj("llava", "examples/llava/llava.cpp");
|
||||||
|
|
||||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
|
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
|
||||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
|
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
|
||||||
|
@ -131,7 +132,7 @@ pub fn build(b: *std.build.Builder) !void {
|
||||||
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
|
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
|
||||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
|
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
|
||||||
|
|
||||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
|
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava });
|
||||||
if (server.target.isWindows()) {
|
if (server.target.isWindows()) {
|
||||||
server.linkSystemLibrary("ws2_32");
|
server.linkSystemLibrary("ws2_32");
|
||||||
}
|
}
|
||||||
|
|
|
@ -59,14 +59,40 @@ python ./convert.py ../llava-v1.5-7b --skip-unknown
|
||||||
Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.
|
Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.
|
||||||
|
|
||||||
## LLaVA 1.6 gguf conversion
|
## LLaVA 1.6 gguf conversion
|
||||||
|
1) First clone a LLaVA 1.6 model:
|
||||||
1) Backup your pth/safetensor model files as llava-surgery modifies them
|
```console
|
||||||
2) Use `python llava-surgery-v2.py -C -m /path/to/hf-model` which also supports llava-1.5 variants pytorch as well as safetensor models:
|
git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
|
||||||
|
```
|
||||||
|
2) Backup your pth/safetensor model files as llava-surgery modifies them
|
||||||
|
3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
|
||||||
|
```console
|
||||||
|
python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
|
||||||
|
```
|
||||||
- you will find a llava.projector and a llava.clip file in your model directory
|
- you will find a llava.projector and a llava.clip file in your model directory
|
||||||
3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory (https://huggingface.co/cmp-nct/llava-1.6-gguf/blob/main/config_vit.json) and rename it to config.json.
|
4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
|
||||||
4) Create the visual gguf model: `python ./examples/llava/convert-image-encoder-to-gguf.py -m ../path/to/vit --llava-projector ../path/to/llava.projector --output-dir ../path/to/output --clip-model-is-vision`
|
```console
|
||||||
|
mkdir vit
|
||||||
|
cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
|
||||||
|
cp ../llava-v1.6-vicuna-7b/llava.projector vit/
|
||||||
|
curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
|
||||||
|
```
|
||||||
|
|
||||||
|
5) Create the visual gguf model:
|
||||||
|
```console
|
||||||
|
python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
|
||||||
|
```
|
||||||
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
|
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
|
||||||
5) Everything else as usual: convert.py the hf model, quantize as needed
|
|
||||||
|
6) Then convert the model to gguf format:
|
||||||
|
```console
|
||||||
|
python ./convert.py ../llava-v1.6-vicuna-7b/
|
||||||
|
```
|
||||||
|
|
||||||
|
7) And finally we can run the llava-cli using the 1.6 model version:
|
||||||
|
```console
|
||||||
|
./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
|
||||||
|
```
|
||||||
|
|
||||||
**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
|
**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
|
||||||
**note** llava-1.6 greatly benefits from batched prompt processing (defaults work)
|
**note** llava-1.6 greatly benefits from batched prompt processing (defaults work)
|
||||||
|
|
||||||
|
|
|
@ -311,7 +311,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
|
bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
|
||||||
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
|
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
|
||||||
if (!image_embd) {
|
if (!image_embd) {
|
||||||
fprintf(stderr, "Unable to allocate memory for image embeddings\n");
|
fprintf(stderr, "Unable to allocate memory for image embeddings\n");
|
||||||
|
|
|
@ -31,6 +31,8 @@ struct llava_image_embed {
|
||||||
/** sanity check for clip <-> llava embed size match */
|
/** sanity check for clip <-> llava embed size match */
|
||||||
LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
|
LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
|
||||||
|
|
||||||
|
LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
|
||||||
|
|
||||||
/** build an image embed from image file bytes */
|
/** build an image embed from image file bytes */
|
||||||
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
|
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
|
||||||
/** build an image embed from a path to an image filename */
|
/** build an image embed from a path to an image filename */
|
||||||
|
|
|
@ -134,10 +134,11 @@ node index.js
|
||||||
## API Endpoints
|
## API Endpoints
|
||||||
|
|
||||||
- **GET** `/health`: Returns the current state of the server:
|
- **GET** `/health`: Returns the current state of the server:
|
||||||
- `{"status": "loading model"}` if the model is still being loaded.
|
- 503 -> `{"status": "loading model"}` if the model is still being loaded.
|
||||||
- `{"status": "error"}` if the model failed to load.
|
- 500 -> `{"status": "error"}` if the model failed to load.
|
||||||
- `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
|
- 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
|
||||||
- `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available
|
- 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available.
|
||||||
|
- 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available.
|
||||||
|
|
||||||
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
|
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
|
||||||
|
|
||||||
|
|
|
@ -15,13 +15,11 @@
|
||||||
using json = nlohmann::json;
|
using json = nlohmann::json;
|
||||||
|
|
||||||
inline static json oaicompat_completion_params_parse(
|
inline static json oaicompat_completion_params_parse(
|
||||||
|
const struct llama_model * model,
|
||||||
const json &body, /* openai api json semantics */
|
const json &body, /* openai api json semantics */
|
||||||
const std::string &chat_template)
|
const std::string &chat_template)
|
||||||
{
|
{
|
||||||
json llama_params;
|
json llama_params;
|
||||||
std::string formatted_prompt = chat_template == "chatml"
|
|
||||||
? format_chatml(body["messages"]) // OpenAI 'messages' to chatml (with <|im_start|>,...)
|
|
||||||
: format_llama2(body["messages"]); // OpenAI 'messages' to llama2 (with [INST],...)
|
|
||||||
|
|
||||||
llama_params["__oaicompat"] = true;
|
llama_params["__oaicompat"] = true;
|
||||||
|
|
||||||
|
@ -34,7 +32,7 @@ inline static json oaicompat_completion_params_parse(
|
||||||
// https://platform.openai.com/docs/api-reference/chat/create
|
// https://platform.openai.com/docs/api-reference/chat/create
|
||||||
llama_sampling_params default_sparams;
|
llama_sampling_params default_sparams;
|
||||||
llama_params["model"] = json_value(body, "model", std::string("unknown"));
|
llama_params["model"] = json_value(body, "model", std::string("unknown"));
|
||||||
llama_params["prompt"] = formatted_prompt;
|
llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
|
||||||
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
|
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
|
||||||
llama_params["temperature"] = json_value(body, "temperature", 0.0);
|
llama_params["temperature"] = json_value(body, "temperature", 0.0);
|
||||||
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
|
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
|
||||||
|
|
|
@ -5,6 +5,7 @@
|
||||||
#include "oai.hpp"
|
#include "oai.hpp"
|
||||||
|
|
||||||
#include "../llava/clip.h"
|
#include "../llava/clip.h"
|
||||||
|
#include "../llava/llava.h"
|
||||||
|
|
||||||
#include "stb_image.h"
|
#include "stb_image.h"
|
||||||
|
|
||||||
|
@ -39,7 +40,7 @@ struct server_params
|
||||||
std::string hostname = "127.0.0.1";
|
std::string hostname = "127.0.0.1";
|
||||||
std::vector<std::string> api_keys;
|
std::vector<std::string> api_keys;
|
||||||
std::string public_path = "examples/server/public";
|
std::string public_path = "examples/server/public";
|
||||||
std::string chat_template = "chatml";
|
std::string chat_template = "";
|
||||||
int32_t port = 8080;
|
int32_t port = 8080;
|
||||||
int32_t read_timeout = 600;
|
int32_t read_timeout = 600;
|
||||||
int32_t write_timeout = 600;
|
int32_t write_timeout = 600;
|
||||||
|
@ -1073,43 +1074,12 @@ struct llama_server_context
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
clip_image_f32_batch img_res_v;
|
|
||||||
img_res_v.size = 0;
|
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
|
||||||
img_res_v.data = nullptr;
|
|
||||||
if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v))
|
|
||||||
{
|
|
||||||
LOG_TEE("Error processing the given image");
|
|
||||||
clip_free(clp_ctx);
|
|
||||||
clip_image_f32_batch_free(img_res_v);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (img_res_v.size == 0)
|
|
||||||
{
|
|
||||||
LOG_TEE("Error processing the given image");
|
LOG_TEE("Error processing the given image");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// note: assumes only one image was returned by clip_image_preprocess
|
|
||||||
clip_image_f32 * img_res = img_res_v.data;
|
|
||||||
|
|
||||||
img.image_tokens = clip_n_patches(clp_ctx);
|
|
||||||
img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx));
|
|
||||||
if (!img.image_embedding)
|
|
||||||
{
|
|
||||||
LOG_TEE("Unable to allocate memory for image embeddings\n");
|
|
||||||
clip_image_f32_batch_free(img_res_v);
|
|
||||||
clip_free(clp_ctx);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
|
|
||||||
if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
|
|
||||||
{
|
|
||||||
LOG_TEE("Unable to encode image\n");
|
|
||||||
clip_image_f32_batch_free(img_res_v);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
clip_image_f32_batch_free(img_res_v);
|
|
||||||
|
|
||||||
img.request_encode_image = false;
|
img.request_encode_image = false;
|
||||||
}
|
}
|
||||||
|
@ -2021,8 +1991,9 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||||
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||||
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
|
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
|
||||||
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
|
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
|
||||||
printf(" --chat-template FORMAT_NAME");
|
printf(" --chat-template JINJA_TEMPLATE\n");
|
||||||
printf(" set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str());
|
printf(" set custom jinja chat template (default: template taken from model's metadata)\n");
|
||||||
|
printf(" Note: only commonly used templates are accepted, since we don't have jinja parser\n");
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2482,13 +2453,13 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
std::string value(argv[i]);
|
if (!verify_custom_template(argv[i])) {
|
||||||
if (value != "chatml" && value != "llama2") {
|
fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
|
||||||
fprintf(stderr, "error: chat template can be \"llama2\" or \"chatml\", but got: %s\n", value.c_str());
|
fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
sparams.chat_template = value;
|
sparams.chat_template = argv[i];
|
||||||
}
|
}
|
||||||
else if (arg == "--override-kv")
|
else if (arg == "--override-kv")
|
||||||
{
|
{
|
||||||
|
@ -2675,14 +2646,10 @@ int main(int argc, char **argv)
|
||||||
res.set_header("Access-Control-Allow-Headers", "*");
|
res.set_header("Access-Control-Allow-Headers", "*");
|
||||||
});
|
});
|
||||||
|
|
||||||
svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) {
|
svr.Get("/health", [&](const httplib::Request& req, httplib::Response& res) {
|
||||||
server_state current_state = state.load();
|
server_state current_state = state.load();
|
||||||
switch(current_state) {
|
switch(current_state) {
|
||||||
case SERVER_STATE_READY:
|
case SERVER_STATE_READY: {
|
||||||
if (llama.all_slots_are_idle) {
|
|
||||||
res.set_content(R"({"status": "ok"})", "application/json");
|
|
||||||
res.status = 200; // HTTP OK
|
|
||||||
} else {
|
|
||||||
int available_slots = 0;
|
int available_slots = 0;
|
||||||
int processing_slots = 0;
|
int processing_slots = 0;
|
||||||
for (llama_client_slot &slot: llama.slots) {
|
for (llama_client_slot &slot: llama.slots) {
|
||||||
|
@ -2705,10 +2672,14 @@ int main(int argc, char **argv)
|
||||||
{"slots_idle", available_slots},
|
{"slots_idle", available_slots},
|
||||||
{"slots_processing", processing_slots}};
|
{"slots_processing", processing_slots}};
|
||||||
res.set_content(health.dump(), "application/json");
|
res.set_content(health.dump(), "application/json");
|
||||||
|
if (req.has_param("fail_on_no_slot")) {
|
||||||
res.status = 503; // HTTP Service Unavailable
|
res.status = 503; // HTTP Service Unavailable
|
||||||
|
} else {
|
||||||
|
res.status = 200; // HTTP OK
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
case SERVER_STATE_LOADING_MODEL:
|
case SERVER_STATE_LOADING_MODEL:
|
||||||
res.set_content(R"({"status": "loading model"})", "application/json");
|
res.set_content(R"({"status": "loading model"})", "application/json");
|
||||||
res.status = 503; // HTTP Service Unavailable
|
res.status = 503; // HTTP Service Unavailable
|
||||||
|
@ -3007,7 +2978,7 @@ int main(int argc, char **argv)
|
||||||
if (!validate_api_key(req, res)) {
|
if (!validate_api_key(req, res)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
json data = oaicompat_completion_params_parse(json::parse(req.body), sparams.chat_template);
|
json data = oaicompat_completion_params_parse(llama.model, json::parse(req.body), sparams.chat_template);
|
||||||
|
|
||||||
const int task_id = llama.queue_tasks.get_new_id();
|
const int task_id = llama.queue_tasks.get_new_id();
|
||||||
llama.queue_results.add_waiting_task_id(task_id);
|
llama.queue_results.add_waiting_task_id(task_id);
|
||||||
|
|
|
@ -170,50 +170,47 @@ static T json_value(const json &body, const std::string &key, const T &default_v
|
||||||
: default_value;
|
: default_value;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline std::string format_llama2(std::vector<json> messages)
|
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
||||||
|
inline bool verify_custom_template(const std::string & tmpl) {
|
||||||
|
llama_chat_message chat[] = {{"user", "test"}};
|
||||||
|
std::vector<char> buf(1);
|
||||||
|
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
|
||||||
|
return res >= 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Format given chat. If tmpl is empty, we take the template from model metadata
|
||||||
|
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages)
|
||||||
{
|
{
|
||||||
std::ostringstream output;
|
size_t alloc_size = 0;
|
||||||
bool is_inside_turn = false;
|
// vector holding all allocated string to be passed to llama_chat_apply_template
|
||||||
|
std::vector<std::string> str(messages.size() * 2);
|
||||||
|
std::vector<llama_chat_message> chat(messages.size());
|
||||||
|
|
||||||
for (auto it = messages.begin(); it != messages.end(); ++it) {
|
for (size_t i = 0; i < messages.size(); ++i) {
|
||||||
if (!is_inside_turn) {
|
auto &curr_msg = messages[i];
|
||||||
output << "[INST] ";
|
str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
|
||||||
}
|
str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
|
||||||
std::string role = json_value(*it, "role", std::string("user"));
|
alloc_size += str[i*2 + 1].length();
|
||||||
std::string content = json_value(*it, "content", std::string(""));
|
chat[i].role = str[i*2 + 0].c_str();
|
||||||
if (role == "system") {
|
chat[i].content = str[i*2 + 1].c_str();
|
||||||
output << "<<SYS>>\n" << content << "\n<<SYS>>\n\n";
|
|
||||||
is_inside_turn = true;
|
|
||||||
} else if (role == "user") {
|
|
||||||
output << content << " [/INST]";
|
|
||||||
is_inside_turn = true;
|
|
||||||
} else {
|
|
||||||
output << " " << content << " </s>";
|
|
||||||
is_inside_turn = false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_VERBOSE("format_llama2", {{"text", output.str()}});
|
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
|
||||||
|
std::vector<char> buf(alloc_size * 2);
|
||||||
|
|
||||||
return output.str();
|
// run the first time to get the total output length
|
||||||
|
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
|
||||||
|
|
||||||
|
// if it turns out that our buffer is too small, we resize it
|
||||||
|
if ((size_t) res > buf.size()) {
|
||||||
|
buf.resize(res);
|
||||||
|
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
inline std::string format_chatml(std::vector<json> messages)
|
std::string formatted_chat(buf.data(), res);
|
||||||
{
|
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
|
||||||
std::ostringstream chatml_msgs;
|
|
||||||
|
|
||||||
for (auto it = messages.begin(); it != messages.end(); ++it) {
|
return formatted_chat;
|
||||||
chatml_msgs << "<|im_start|>"
|
|
||||||
<< json_value(*it, "role", std::string("user")) << '\n';
|
|
||||||
chatml_msgs << json_value(*it, "content", std::string(""))
|
|
||||||
<< "<|im_end|>\n";
|
|
||||||
}
|
|
||||||
|
|
||||||
chatml_msgs << "<|im_start|>assistant" << '\n';
|
|
||||||
|
|
||||||
LOG_VERBOSE("format_chatml", {{"text", chatml_msgs.str()}});
|
|
||||||
|
|
||||||
return chatml_msgs.str();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
|
|
@ -150,6 +150,7 @@
|
||||||
packages =
|
packages =
|
||||||
{
|
{
|
||||||
default = config.legacyPackages.llamaPackages.llama-cpp;
|
default = config.legacyPackages.llamaPackages.llama-cpp;
|
||||||
|
vulkan = config.packages.default.override { useVulkan = true; };
|
||||||
}
|
}
|
||||||
// lib.optionalAttrs pkgs.stdenv.isLinux {
|
// lib.optionalAttrs pkgs.stdenv.isLinux {
|
||||||
opencl = config.packages.default.override { useOpenCL = true; };
|
opencl = config.packages.default.override { useOpenCL = true; };
|
||||||
|
@ -157,7 +158,6 @@
|
||||||
|
|
||||||
mpi-cpu = config.packages.default.override { useMpi = true; };
|
mpi-cpu = config.packages.default.override { useMpi = true; };
|
||||||
mpi-cuda = config.packages.default.override { useMpi = true; };
|
mpi-cuda = config.packages.default.override { useMpi = true; };
|
||||||
vulkan = config.packages.default.override { useVulkan = true; };
|
|
||||||
}
|
}
|
||||||
// lib.optionalAttrs (system == "x86_64-linux") {
|
// lib.optionalAttrs (system == "x86_64-linux") {
|
||||||
rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;
|
rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;
|
||||||
|
|
22
ggml-cuda.cu
22
ggml-cuda.cu
|
@ -54,6 +54,8 @@
|
||||||
#define cudaDeviceProp hipDeviceProp_t
|
#define cudaDeviceProp hipDeviceProp_t
|
||||||
#define cudaDeviceSynchronize hipDeviceSynchronize
|
#define cudaDeviceSynchronize hipDeviceSynchronize
|
||||||
#define cudaError_t hipError_t
|
#define cudaError_t hipError_t
|
||||||
|
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
||||||
|
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
|
||||||
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
||||||
#define cudaEventDisableTiming hipEventDisableTiming
|
#define cudaEventDisableTiming hipEventDisableTiming
|
||||||
#define cudaEventRecord hipEventRecord
|
#define cudaEventRecord hipEventRecord
|
||||||
|
@ -9325,9 +9327,15 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
|
||||||
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
|
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
|
||||||
if (can_access_peer) {
|
if (can_access_peer) {
|
||||||
if (enable_peer_access) {
|
if (enable_peer_access) {
|
||||||
CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
|
cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
|
||||||
|
if (err != cudaErrorPeerAccessAlreadyEnabled) {
|
||||||
|
CUDA_CHECK(err);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
|
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
|
||||||
|
if (err != cudaErrorPeerAccessNotEnabled) {
|
||||||
|
CUDA_CHECK(err);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -10999,10 +11007,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backe
|
||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
// unused at the moment
|
static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
|
||||||
//static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
|
return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
|
||||||
// return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
|
UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
|
||||||
//}
|
}
|
||||||
|
|
||||||
GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||||
|
@ -11390,7 +11398,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
if (node->src[j] != nullptr) {
|
if (node->src[j] != nullptr) {
|
||||||
assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
|
assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
|
||||||
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
|
||||||
assert(node->src[j]->extra != nullptr);
|
assert(node->src[j]->extra != nullptr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
220
ggml-sycl.cpp
220
ggml-sycl.cpp
|
@ -9188,7 +9188,9 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
template <int qk, int qi, typename block_q_t, int vdr,
|
||||||
|
vec_dot_q_sycl_t vec_dot_q_sycl>
|
||||||
|
static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy,
|
||||||
float *dst, const int ncols,
|
float *dst, const int ncols,
|
||||||
const int nrows,
|
const int nrows,
|
||||||
dpct::queue_ptr stream) {
|
dpct::queue_ptr stream) {
|
||||||
|
@ -9197,164 +9199,10 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
||||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||||
stream->parallel_for(
|
stream->parallel_for(
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims), [=
|
||||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||||
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
|
mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
|
||||||
vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows,
|
vx, vy, dst, ncols, nrows, item_ct1);
|
||||||
item_ct1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
|
||||||
float *dst, const int ncols,
|
|
||||||
const int nrows,
|
|
||||||
dpct::queue_ptr stream) {
|
|
||||||
GGML_ASSERT(ncols % QK4_1 == 0);
|
|
||||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
||||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
||||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
||||||
stream->parallel_for(
|
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
||||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
||||||
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
|
|
||||||
vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
|
|
||||||
item_ct1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
|
||||||
float *dst, const int ncols,
|
|
||||||
const int nrows,
|
|
||||||
dpct::queue_ptr stream) {
|
|
||||||
GGML_ASSERT(ncols % QK5_0 == 0);
|
|
||||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
||||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
||||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
||||||
stream->parallel_for(
|
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
||||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
||||||
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
|
|
||||||
vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
|
|
||||||
item_ct1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
|
||||||
float *dst, const int ncols,
|
|
||||||
const int nrows,
|
|
||||||
dpct::queue_ptr stream) {
|
|
||||||
GGML_ASSERT(ncols % QK5_1 == 0);
|
|
||||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
||||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
||||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
||||||
stream->parallel_for(
|
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
||||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
||||||
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
|
|
||||||
vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
|
|
||||||
item_ct1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
|
||||||
float *dst, const int ncols,
|
|
||||||
const int nrows,
|
|
||||||
dpct::queue_ptr stream) {
|
|
||||||
GGML_ASSERT(ncols % QK8_0 == 0);
|
|
||||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
||||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
||||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
||||||
stream->parallel_for(
|
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
||||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
||||||
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
|
|
||||||
vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
|
|
||||||
item_ct1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
|
||||||
float *dst, const int ncols,
|
|
||||||
const int nrows,
|
|
||||||
dpct::queue_ptr stream) {
|
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
|
||||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
||||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
||||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
||||||
stream->parallel_for(
|
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
||||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
||||||
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
|
|
||||||
vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
|
|
||||||
item_ct1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
||||||
float *dst, const int ncols,
|
|
||||||
const int nrows,
|
|
||||||
dpct::queue_ptr stream) {
|
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
|
||||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
||||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
||||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
||||||
stream->parallel_for(
|
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
||||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
||||||
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
|
|
||||||
vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
|
|
||||||
item_ct1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
|
||||||
float *dst, const int ncols,
|
|
||||||
const int nrows,
|
|
||||||
dpct::queue_ptr stream) {
|
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
|
||||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
||||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
||||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
||||||
stream->parallel_for(
|
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
||||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
||||||
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
|
|
||||||
vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
|
|
||||||
item_ct1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
|
||||||
float *dst, const int ncols,
|
|
||||||
const int nrows,
|
|
||||||
dpct::queue_ptr stream) {
|
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
|
||||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
||||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
||||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
||||||
stream->parallel_for(
|
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
||||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
||||||
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
|
|
||||||
vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
|
|
||||||
item_ct1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
|
||||||
float *dst, const int ncols,
|
|
||||||
const int nrows,
|
|
||||||
dpct::queue_ptr stream) {
|
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
|
||||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
||||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
||||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
||||||
stream->parallel_for(
|
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
||||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
||||||
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
|
|
||||||
vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
|
|
||||||
item_ct1);
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -12095,36 +11943,62 @@ inline void ggml_sycl_op_mul_mat_vec_q(
|
||||||
const int64_t ne00 = src0->ne[0];
|
const int64_t ne00 = src0->ne[0];
|
||||||
const int64_t row_diff = row_high - row_low;
|
const int64_t row_diff = row_high - row_low;
|
||||||
|
|
||||||
|
// TODO: support these quantization types
|
||||||
|
GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS ||
|
||||||
|
src0->type == GGML_TYPE_IQ2_XS ||
|
||||||
|
src0->type == GGML_TYPE_IQ3_XXS ||
|
||||||
|
src0->type == GGML_TYPE_IQ1_S));
|
||||||
|
|
||||||
switch (src0->type) {
|
switch (src0->type) {
|
||||||
case GGML_TYPE_Q4_0:
|
case GGML_TYPE_Q4_0:
|
||||||
mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
|
||||||
|
VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
|
||||||
|
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q4_1:
|
case GGML_TYPE_Q4_1:
|
||||||
mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
mul_mat_vec_q_sycl_submitter<QK4_1, QI4_1, block_q4_1,
|
||||||
|
VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
|
||||||
|
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q5_0:
|
case GGML_TYPE_Q5_0:
|
||||||
mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
mul_mat_vec_q_sycl_submitter<QK5_0, QI5_0, block_q5_0,
|
||||||
|
VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
|
||||||
|
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
mul_mat_vec_q_sycl_submitter<QK5_1, QI5_1, block_q5_1,
|
||||||
|
VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
|
||||||
|
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
mul_mat_vec_q_sycl_submitter<QK8_0, QI8_0, block_q8_0,
|
||||||
|
VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
|
||||||
|
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q2_K:
|
case GGML_TYPE_Q2_K:
|
||||||
mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
mul_mat_vec_q_sycl_submitter<QK_K, QI2_K, block_q2_K,
|
||||||
|
VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
|
||||||
|
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
mul_mat_vec_q_sycl_submitter<QK_K, QI3_K, block_q3_K,
|
||||||
|
VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
|
||||||
|
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
mul_mat_vec_q_sycl_submitter<QK_K, QI4_K, block_q4_K,
|
||||||
|
VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
|
||||||
|
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
mul_mat_vec_q_sycl_submitter<QK_K, QI5_K, block_q5_K,
|
||||||
|
VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
|
||||||
|
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
mul_mat_vec_q_sycl_submitter<QK_K, QI6_K, block_q6_K,
|
||||||
|
VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
|
||||||
|
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
|
@ -12145,7 +12019,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
|
||||||
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
||||||
const dpct::queue_ptr &stream) {
|
const dpct::queue_ptr &stream) {
|
||||||
|
|
||||||
GGML_TENSOR_BINARY_OP_LOCALS
|
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||||
|
|
||||||
const int64_t row_diff = row_high - row_low;
|
const int64_t row_diff = row_high - row_low;
|
||||||
|
|
||||||
|
@ -15093,6 +14967,12 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (a->type == GGML_TYPE_IQ1_S) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (a->type == GGML_TYPE_IQ3_XXS) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
if (a->type == GGML_TYPE_IQ2_XXS) {
|
if (a->type == GGML_TYPE_IQ2_XXS) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1091,7 +1091,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_instance_init() {
|
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
|
||||||
|
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
|
||||||
|
|
||||||
|
void ggml_vk_instance_init() {
|
||||||
if (vk_instance_initialized) {
|
if (vk_instance_initialized) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -1100,20 +1103,34 @@ static void ggml_vk_instance_init() {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
|
vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
|
||||||
const std::vector<const char*> layers = {
|
|
||||||
#ifdef GGML_VULKAN_VALIDATE
|
const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
|
||||||
"VK_LAYER_KHRONOS_validation",
|
const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
|
||||||
#endif
|
const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
|
||||||
};
|
|
||||||
const std::vector<const char*> extensions = {
|
std::vector<const char*> layers;
|
||||||
#ifdef GGML_VULKAN_VALIDATE
|
|
||||||
"VK_EXT_validation_features",
|
if (validation_ext) {
|
||||||
#endif
|
layers.push_back("VK_LAYER_KHRONOS_validation");
|
||||||
};
|
}
|
||||||
vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions);
|
std::vector<const char*> extensions;
|
||||||
#ifdef GGML_VULKAN_VALIDATE
|
if (validation_ext) {
|
||||||
const std::vector<vk::ValidationFeatureEnableEXT> features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
|
extensions.push_back("VK_EXT_validation_features");
|
||||||
vk::ValidationFeaturesEXT validation_features = {
|
}
|
||||||
|
if (portability_enumeration_ext) {
|
||||||
|
extensions.push_back("VK_KHR_portability_enumeration");
|
||||||
|
}
|
||||||
|
vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
|
||||||
|
if (portability_enumeration_ext) {
|
||||||
|
instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<vk::ValidationFeatureEnableEXT> features_enable;
|
||||||
|
vk::ValidationFeaturesEXT validation_features;
|
||||||
|
|
||||||
|
if (validation_ext) {
|
||||||
|
features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
|
||||||
|
validation_features = {
|
||||||
features_enable,
|
features_enable,
|
||||||
{},
|
{},
|
||||||
};
|
};
|
||||||
|
@ -1121,7 +1138,7 @@ static void ggml_vk_instance_init() {
|
||||||
instance_create_info.setPNext(&validation_features);
|
instance_create_info.setPNext(&validation_features);
|
||||||
|
|
||||||
std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
|
std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
|
||||||
#endif
|
}
|
||||||
vk_instance.instance = vk::createInstance(instance_create_info);
|
vk_instance.instance = vk::createInstance(instance_create_info);
|
||||||
|
|
||||||
memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);
|
memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);
|
||||||
|
@ -1168,12 +1185,12 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
||||||
vk_instance.devices[idx] = std::make_shared<vk_device>();
|
vk_instance.devices[idx] = std::make_shared<vk_device>();
|
||||||
ctx->device = vk_instance.devices[idx];
|
ctx->device = vk_instance.devices[idx];
|
||||||
ctx->device.lock()->physical_device = devices[dev_num];
|
ctx->device.lock()->physical_device = devices[dev_num];
|
||||||
std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
|
const std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
|
||||||
|
|
||||||
bool maintenance4_support = false;
|
bool maintenance4_support = false;
|
||||||
|
|
||||||
// Check if maintenance4 is supported
|
// Check if maintenance4 is supported
|
||||||
for (auto properties : ext_props) {
|
for (const auto& properties : ext_props) {
|
||||||
if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
|
if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
|
||||||
maintenance4_support = true;
|
maintenance4_support = true;
|
||||||
}
|
}
|
||||||
|
@ -1204,7 +1221,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
||||||
bool fp16_storage = false;
|
bool fp16_storage = false;
|
||||||
bool fp16_compute = false;
|
bool fp16_compute = false;
|
||||||
|
|
||||||
for (auto properties : ext_props) {
|
for (const auto& properties : ext_props) {
|
||||||
if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
|
if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
|
||||||
fp16_storage = true;
|
fp16_storage = true;
|
||||||
} else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
|
} else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
|
||||||
|
@ -5301,6 +5318,42 @@ GGML_CALL int ggml_backend_vk_reg_devices() {
|
||||||
return vk_instance.device_indices.size();
|
return vk_instance.device_indices.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Extension availability
|
||||||
|
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
|
||||||
|
#ifdef GGML_VULKAN_VALIDATE
|
||||||
|
bool portability_enumeration_ext = false;
|
||||||
|
// Check for portability enumeration extension for MoltenVK support
|
||||||
|
for (const auto& properties : instance_extensions) {
|
||||||
|
if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!portability_enumeration_ext) {
|
||||||
|
std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
return false;
|
||||||
|
|
||||||
|
UNUSED(instance_extensions);
|
||||||
|
}
|
||||||
|
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
|
||||||
|
#ifdef __APPLE__
|
||||||
|
bool portability_enumeration_ext = false;
|
||||||
|
// Check for portability enumeration extension for MoltenVK support
|
||||||
|
for (const auto& properties : instance_extensions) {
|
||||||
|
if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!portability_enumeration_ext) {
|
||||||
|
std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
return false;
|
||||||
|
|
||||||
|
UNUSED(instance_extensions);
|
||||||
|
}
|
||||||
|
|
||||||
// checks
|
// checks
|
||||||
|
|
||||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||||
|
|
2
ggml.c
2
ggml.c
|
@ -273,6 +273,8 @@ inline static void * ggml_calloc(size_t num, size_t size) {
|
||||||
#include <Accelerate/Accelerate.h>
|
#include <Accelerate/Accelerate.h>
|
||||||
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
|
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
|
||||||
#include "ggml-opencl.h"
|
#include "ggml-opencl.h"
|
||||||
|
#elif defined(GGML_USE_VULKAN)
|
||||||
|
#include "ggml-vulkan.h"
|
||||||
#endif
|
#endif
|
||||||
#elif defined(GGML_USE_OPENBLAS)
|
#elif defined(GGML_USE_OPENBLAS)
|
||||||
#if defined(GGML_BLAS_USE_MKL)
|
#if defined(GGML_BLAS_USE_MKL)
|
||||||
|
|
|
@ -12602,7 +12602,7 @@ LLAMA_API int32_t llama_chat_apply_template(
|
||||||
// load template from model
|
// load template from model
|
||||||
std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
|
std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
|
||||||
std::string template_key = "tokenizer.chat_template";
|
std::string template_key = "tokenizer.chat_template";
|
||||||
int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), curr_tmpl.size());
|
int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
|
||||||
if (res < 0) {
|
if (res < 0) {
|
||||||
// worst case: there is no information about template, we will use chatml by default
|
// worst case: there is no information about template, we will use chatml by default
|
||||||
curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
|
curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue