From 9d23589d638dc74577d5ff880e6d4248b795f12e Mon Sep 17 00:00:00 2001 From: Erik Scholz Date: Tue, 27 Jun 2023 19:06:33 +0200 Subject: [PATCH 1/8] fix pthreads setaffinity usage on android (#2020) --- ggml.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 92faf03f7..684caaa37 100644 --- a/ggml.c +++ b/ggml.c @@ -16684,7 +16684,8 @@ typedef pthread_t ggml_thread_t; #endif -#ifdef __linux__ +// Android's libc implementation "bionic" does not support setting affinity +#if defined(__linux__) && !defined(__BIONIC__) void set_numa_thread_affinity(int thread_n, int n_threads) { if (!ggml_is_numa()) { return; From cfa0750bc9dbc2d957a91b8ed09ab0035d8f3d4e Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Wed, 28 Jun 2023 23:53:37 +0800 Subject: [PATCH 2/8] llama : support input embeddings directly (#1910) * add interface for float input * fixed inpL shape and type * add examples of input floats * add test example for embd input * fixed sampling * add free for context * fixed add end condition for generating * add examples for llava.py * add READMD for llava.py * add READMD for llava.py * add example of PandaGPT * refactor the interface and fixed the styles * add cmake build for embd-input * add cmake build for embd-input * Add MiniGPT-4 example * change the order of the args of llama_eval_internal * fix ci error --- .gitignore | 3 +- Makefile | 11 +- convert-lora-to-ggml.py | 6 +- examples/CMakeLists.txt | 1 + examples/embd-input/.gitignore | 4 + examples/embd-input/CMakeLists.txt | 15 ++ examples/embd-input/README.md | 63 +++++++ examples/embd-input/embd-input-lib.cpp | 220 ++++++++++++++++++++++++ examples/embd-input/embd-input-test.cpp | 35 ++++ examples/embd-input/embd-input.h | 30 ++++ examples/embd-input/embd_input.py | 71 ++++++++ examples/embd-input/llava.py | 70 ++++++++ examples/embd-input/minigpt4.py | 128 ++++++++++++++ examples/embd-input/panda_gpt.py | 98 +++++++++++ llama.cpp | 70 ++++++-- llama.h | 8 + 16 files changed, 811 insertions(+), 22 deletions(-) create mode 100644 examples/embd-input/.gitignore create mode 100644 examples/embd-input/CMakeLists.txt create mode 100644 examples/embd-input/README.md create mode 100644 examples/embd-input/embd-input-lib.cpp create mode 100644 examples/embd-input/embd-input-test.cpp create mode 100644 examples/embd-input/embd-input.h create mode 100644 examples/embd-input/embd_input.py create mode 100644 examples/embd-input/llava.py create mode 100644 examples/embd-input/minigpt4.py create mode 100644 examples/embd-input/panda_gpt.py diff --git a/.gitignore b/.gitignore index e7bfd52e3..4fccec31b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.o *.a +*.so .DS_Store .build/ .cache/ @@ -39,8 +40,8 @@ models/* /vdot /server /Pipfile +/embd-input-test /libllama.so - build-info.h arm_neon.h compile_commands.json diff --git a/Makefile b/Makefile index bda11791d..03f38bdba 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple libembdinput.so embd-input-test ifdef LLAMA_BUILD_SERVER BUILD_TARGETS += server @@ -272,7 +272,7 @@ libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) clean: - rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server 
vdot train-text-from-scratch build-info.h + rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch embd-input-test build-info.h # # Examples @@ -305,6 +305,13 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml. server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) +libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) + + +embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput + train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py index 9090e8d6d..f43c836f5 100644 --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -113,6 +113,10 @@ with open(output_path, "wb") as fout: write_file_header(fout, params) for k, v in model.items(): + if k.endswith(".default.weight"): + k = k.replace(".default.weight", ".weight") + if k in ["llama_proj.weight", "llama_proj.bias"]: + continue if k.endswith("lora_A.weight"): if v.dtype != torch.float16 and v.dtype != torch.float32: v = v.float() @@ -120,7 +124,7 @@ with open(output_path, "wb") as fout: else: v = v.float() - t = v.numpy() + t = v.detach().numpy() tname = translate_tensor_name(k) print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") write_tensor_header(fout, tname, t.shape, t.dtype) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index cf9c4a223..161960bb8 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -39,6 +39,7 @@ else() add_subdirectory(baby-llama) add_subdirectory(train-text-from-scratch) add_subdirectory(simple) + add_subdirectory(embd-input) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/examples/embd-input/.gitignore b/examples/embd-input/.gitignore new file mode 100644 index 000000000..87ef68771 --- /dev/null +++ b/examples/embd-input/.gitignore @@ -0,0 +1,4 @@ +PandaGPT +MiniGPT-4 +*.pth + diff --git a/examples/embd-input/CMakeLists.txt b/examples/embd-input/CMakeLists.txt new file mode 100644 index 000000000..2b623953e --- /dev/null +++ b/examples/embd-input/CMakeLists.txt @@ -0,0 +1,15 @@ +set(TARGET embdinput) +add_library(${TARGET} embd-input-lib.cpp embd-input.h) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() + +set(TARGET embd-input-test) +add_executable(${TARGET} embd-input-test.cpp) +target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/embd-input/README.md b/examples/embd-input/README.md new file mode 100644 index 000000000..02d028f26 --- /dev/null +++ b/examples/embd-input/README.md @@ -0,0 +1,63 @@ +### 
Examples for input embedding directly
+
+## Requirement
+Build `libembdinput.so`
+by running the following command in the main dir (../../):
+```
+make
+```
+
+## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py)
+
+1. Obtain the LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
+2. Convert it to ggml format.
+3. `llava_projection.pth` is extracted from [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin):
+
+```
+import torch
+
+bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
+pth_path = "./examples/embd_input/llava_projection.pth"
+
+dic = torch.load(bin_path)
+used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
+torch.save({k: dic[k] for k in used_key}, pth_path)
+```
+4. Check the path of the LLaVA model and `llava_projection.pth` in `llava.py`.
+
+
+## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)
+
+1. Obtain the PandaGPT LoRA model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
+The `adapter_config.json` is:
+```
+{
+  "peft_type": "LORA",
+  "fan_in_fan_out": false,
+  "bias": null,
+  "modules_to_save": null,
+  "r": 32,
+  "lora_alpha": 32,
+  "lora_dropout": 0.1,
+  "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
+}
+```
+2. Prepare the `vicuna` v0 model.
+3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
+4. Clone the PandaGPT source.
+```
+git clone https://github.com/yxuansu/PandaGPT
+```
+5. Install the requirements of PandaGPT.
+6. Check the paths of the PandaGPT source, ImageBind model, LoRA model and vicuna model in panda_gpt.py.
+
+## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)
+
+1. Obtain the MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
+2. Clone the MiniGPT-4 source.
+```
+git clone https://github.com/Vision-CAIR/MiniGPT-4/
+```
+3. Install the requirements of MiniGPT-4.
+4. Prepare the `vicuna` v0 model.
+5. Check the paths of the MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.
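All three wrappers above drive the same embedding interface through the `MyModel` helper added in `embd_input.py`. The following is a minimal sketch of that flow, mirroring the `__main__` block of `embd_input.py`; it assumes `libembdinput.so` has been built in the repository root, that `examples/embd-input` is on `PYTHONPATH`, and that the model path and the random 5120-wide embeddings (placeholder for a 13B LLaMA projector output) are illustrative only.
```
import numpy as np
from embd_input import MyModel

# Assumed model path; any converted ggml model with n_embd == 5120 (13B) fits this sketch.
model = MyModel(["main", "--model", "./models/ggml-vic13b-q4_1.bin", "-c", "2048"])
model.eval_string("user: what is the color of the flag of UN?")
# Inject 10 raw embedding positions (random floats stand in for real projector features,
# e.g. LLaVA's mm_projector output); eval_float takes an (n_embd, n_positions) array.
model.eval_float(np.random.random((5120, 10)))
model.eval_string("assistant:")
reply = model.generate_with_print()  # prints tokens as they are sampled, returns the full text
```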
diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp new file mode 100644 index 000000000..37de52ad6 --- /dev/null +++ b/examples/embd-input/embd-input-lib.cpp @@ -0,0 +1,220 @@ +// Defines sigaction on msys: +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "embd-input.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static llama_context ** g_ctx; + +extern "C" { + +struct MyModel* create_mymodel(int argc, char ** argv) { + gpt_params params; + + if (gpt_params_parse(argc, argv, params) == false) { + return nullptr; + } + + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + + if (params.seed < 0) { + params.seed = time(NULL); + } + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + + llama_init_backend(params.numa); + + llama_model * model; + llama_context * ctx; + + g_ctx = &ctx; + + // load the model and apply lora adapter, if any + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == NULL) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + return nullptr; + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + struct MyModel * ret = new MyModel(); + ret->ctx = ctx; + ret->params = params; + ret->n_past = 0; + // printf("ctx: %d\n", ret->ctx); + return ret; +} + +void free_mymodel(struct MyModel * mymodel) { + llama_context * ctx = mymodel->ctx; + llama_print_timings(ctx); + llama_free(ctx); + delete mymodel; +} + + +bool eval_float(void * model, float * input, int N){ + MyModel * mymodel = (MyModel*)model; + llama_context * ctx = mymodel->ctx; + gpt_params params = mymodel->params; + int n_emb = llama_n_embd(ctx); + int n_past = mymodel->n_past; + int n_batch = N; // params.n_batch; + + for (int i = 0; i < (int) N; i += n_batch) { + int n_eval = (int) N - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + if (llama_eval_embd(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + n_past += n_eval; + } + mymodel->n_past = n_past; + return true; +} + +bool eval_tokens(void * model, std::vector tokens) { + MyModel * mymodel = (MyModel* )model; + llama_context * ctx; + ctx = mymodel->ctx; + gpt_params params = mymodel->params; + int n_past = mymodel->n_past; + for (int i = 0; i < (int) tokens.size(); i += params.n_batch) { + int n_eval = (int) tokens.size() - i; + if (n_eval > params.n_batch) { + n_eval = params.n_batch; + } + if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + n_past += n_eval; + } + mymodel->n_past = n_past; + return true; +} + +bool eval_id(struct MyModel* mymodel, int id) { + std::vector tokens; + tokens.push_back(id); + return eval_tokens(mymodel, tokens); +} + +bool eval_string(struct MyModel * mymodel,const char* str){ + llama_context * ctx = mymodel->ctx; + std::string str2 = str; + std::vector embd_inp = ::llama_tokenize(ctx, str2, true); + eval_tokens(mymodel, embd_inp); + return true; +} + +llama_token sampling_id(struct MyModel* mymodel) { + llama_context* ctx = mymodel->ctx; + gpt_params params = mymodel->params; + // int n_ctx = llama_n_ctx(ctx); + + // out of user input, sample next token + const float temp = params.temp; + const int32_t 
top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; + // const float repeat_penalty = params.repeat_penalty; + // const float alpha_presence = params.presence_penalty; + // const float alpha_frequency = params.frequency_penalty; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + // const bool penalize_nl = params.penalize_nl; + + llama_token id = 0; + { + auto logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(ctx); + + // Apply params.logit_bias map + for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { + logits[it->first] += it->second; + } + + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // TODO: Apply penalties + // float nl_logit = logits[llama_token_nl()]; + // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); + // llama_sample_repetition_penalty(ctx, &candidates_p, + // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + // last_n_repeat, repeat_penalty); + // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, + // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + // last_n_repeat, alpha_frequency, alpha_presence); + // if (!penalize_nl) { + // logits[llama_token_nl()] = nl_logit; + // } + + if (temp <= 0) { + // Greedy sampling + id = llama_sample_token_greedy(ctx, &candidates_p); + } else { + if (mirostat == 1) { + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } else if (mirostat == 2) { + static float mirostat_mu = 2.0f * mirostat_tau; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } else { + // Temperature sampling + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); + llama_sample_typical(ctx, &candidates_p, typical_p, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token(ctx, &candidates_p); + } + } + } + + return id; +} + +const char * sampling(struct MyModel * mymodel) { + llama_context * ctx = mymodel->ctx; + int id = sampling_id(mymodel); + std::string ret; + if (id == llama_token_eos()) ret = ""; + else ret = llama_token_to_str(ctx, id); + eval_id(mymodel, id); + return ret.c_str(); +} + +} diff --git a/examples/embd-input/embd-input-test.cpp b/examples/embd-input/embd-input-test.cpp new file mode 100644 index 000000000..e5e040f62 --- /dev/null +++ b/examples/embd-input/embd-input-test.cpp @@ -0,0 +1,35 @@ +#include "embd-input.h" +#include +#include +#include + +int main(int argc, char** argv) { + + auto mymodel = create_mymodel(argc, argv); + int N = 10; + int max_tgt_len = 500; + int n_embd = llama_n_embd(mymodel->ctx); + + // add random float embd to test 
evaluation + float * data = new float[N*n_embd]; + std::default_random_engine e; + std::uniform_real_distribution u(0,1); + for (int i=0;iparams.prompt.c_str()); + const char* tmp; + for (int i=0; i")==0) break; + printf("%s", tmp); + fflush(stdout); + } + printf("\n"); + free_mymodel(mymodel); + return 0; +} diff --git a/examples/embd-input/embd-input.h b/examples/embd-input/embd-input.h new file mode 100644 index 000000000..4fefabd42 --- /dev/null +++ b/examples/embd-input/embd-input.h @@ -0,0 +1,30 @@ +#ifndef _EMBD_INPUT_H_ +#define _EMBD_INPUT_H_ 1 + +#include "common.h" +#include "llama.h" +#include "build-info.h" + + +extern "C" { + +typedef struct MyModel { + llama_context* ctx; + gpt_params params; + int n_past = 0; +} MyModel; + + +struct MyModel* create_mymodel(int argc, char ** argv); + +bool eval_float(void* model, float* input, int N); +bool eval_tokens(void* model, std::vector tokens); +bool eval_id(struct MyModel* mymodel, int id); +bool eval_string(struct MyModel* mymodel, const char* str); +const char* sampling(struct MyModel* mymodel); +llama_token sampling_id(struct MyModel* mymodel); +void free_mymodel(struct MyModel* mymodel); + +} + +#endif diff --git a/examples/embd-input/embd_input.py b/examples/embd-input/embd_input.py new file mode 100644 index 000000000..be2896614 --- /dev/null +++ b/examples/embd-input/embd_input.py @@ -0,0 +1,71 @@ +import ctypes +from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int +import numpy as np +import os + +libc = cdll.LoadLibrary("./libembdinput.so") +libc.sampling.restype=c_char_p +libc.create_mymodel.restype=c_void_p +libc.eval_string.argtypes=[c_void_p, c_char_p] +libc.sampling.argtypes=[c_void_p] +libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int] + + +class MyModel: + def __init__(self, args): + argc = len(args) + c_str = [c_char_p(i.encode()) for i in args] + args_c = (c_char_p * argc)(*c_str) + self.model = c_void_p(libc.create_mymodel(argc, args_c)) + self.max_tgt_len = 512 + self.print_string_eval = True + + def __del__(self): + libc.free_mymodel(self.model) + + def eval_float(self, x): + libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1]) + + def eval_string(self, x): + libc.eval_string(self.model, x.encode()) # c_char_p(x.encode())) + if self.print_string_eval: + print(x) + + def eval_token(self, x): + libc.eval_id(self.model, x) + + def sampling(self): + s = libc.sampling(self.model) + return s + + def stream_generate(self, end=""): + ret = b"" + end = end.encode() + for _ in range(self.max_tgt_len): + tmp = self.sampling() + ret += tmp + yield tmp + if ret.endswith(end): + break + + def generate_with_print(self, end=""): + ret = b"" + for i in self.stream_generate(end=end): + ret += i + print(i.decode(errors="replace"), end="", flush=True) + print("") + return ret.decode(errors="replace") + + + def generate(self, end=""): + text = b"".join(self.stream_generate(end=end)) + return text.decode(errors="replace") + +if __name__ == "__main__": + model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) + model.eval_string("""user: what is the color of the flag of UN?""") + x = np.random.random((5120,10))# , dtype=np.float32) + model.eval_float(x) + model.eval_string("""assistant:""") + for i in model.generate(): + print(i.decode(errors="replace"), end="", flush=True) diff --git a/examples/embd-input/llava.py b/examples/embd-input/llava.py new file mode 100644 index 000000000..2f20cb722 --- /dev/null +++ 
b/examples/embd-input/llava.py @@ -0,0 +1,70 @@ +import sys +import os +sys.path.insert(0, os.path.dirname(__file__)) +from embd_input import MyModel +import numpy as np +from torch import nn +import torch +from transformers import CLIPVisionModel, CLIPImageProcessor +from PIL import Image + +# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1' +vision_tower = "openai/clip-vit-large-patch14" +select_hidden_state_layer = -2 +# (vision_config.image_size // vision_config.patch_size) ** 2 +image_token_len = (224//14)**2 + +class Llava: + def __init__(self, args): + self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower) + self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower) + self.mm_projector = nn.Linear(1024, 5120) + self.model = MyModel(["main", *args]) + + def load_projection(self, path): + state = torch.load(path) + self.mm_projector.load_state_dict({ + "weight": state["model.mm_projector.weight"], + "bias": state["model.mm_projector.bias"]}) + + def chat(self, question): + self.model.eval_string("user: ") + self.model.eval_string(question) + self.model.eval_string("\nassistant: ") + return self.model.generate_with_print() + + def chat_with_image(self, image, question): + with torch.no_grad(): + embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True) + select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer] + image_feature = select_hidden_state[:, 1:] + embd_image = self.mm_projector(image_feature) + embd_image = embd_image.cpu().numpy()[0] + self.model.eval_string("user: ") + self.model.eval_token(32003-2) # im_start + self.model.eval_float(embd_image.T) + for i in range(image_token_len-embd_image.shape[0]): + self.model.eval_token(32003-3) # im_patch + self.model.eval_token(32003-1) # im_end + self.model.eval_string(question) + self.model.eval_string("\nassistant: ") + return self.model.generate_with_print() + + +if __name__=="__main__": + # model form liuhaotian/LLaVA-13b-delta-v1-1 + a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"]) + # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin. + # Also here can use pytorch_model-00003-of-00003.bin directly. 
+ a.load_projection(os.path.join( + os.path.dirname(__file__) , + "llava_projetion.pth")) + respose = a.chat_with_image( + Image.open("./media/llama1-logo.png").convert('RGB'), + "what is the text in the picture?") + respose + a.chat("what is the color of it?") + + + diff --git a/examples/embd-input/minigpt4.py b/examples/embd-input/minigpt4.py new file mode 100644 index 000000000..8e98f8517 --- /dev/null +++ b/examples/embd-input/minigpt4.py @@ -0,0 +1,128 @@ +import sys +import os +sys.path.insert(0, os.path.dirname(__file__)) +from embd_input import MyModel +import numpy as np +from torch import nn +import torch +from PIL import Image + +minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4") +sys.path.insert(0, minigpt4_path) +from minigpt4.models.blip2 import Blip2Base +from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor + + +class MiniGPT4(Blip2Base): + """ + MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4 + """ + def __init__(self, + args, + vit_model="eva_clip_g", + q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth", + img_size=224, + drop_path_rate=0, + use_grad_checkpoint=False, + vit_precision="fp32", + freeze_vit=True, + freeze_qformer=True, + num_query_token=32, + llama_model="", + prompt_path="", + prompt_template="", + max_txt_len=32, + end_sym='\n', + low_resource=False, # use 8 bit and put vit in cpu + device_8bit=0 + ): + super().__init__() + self.img_size = img_size + self.low_resource = low_resource + self.preprocessor = Blip2ImageEvalProcessor(img_size) + + print('Loading VIT') + self.visual_encoder, self.ln_vision = self.init_vision_encoder( + vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision + ) + print('Loading VIT Done') + print('Loading Q-Former') + self.Qformer, self.query_tokens = self.init_Qformer( + num_query_token, self.visual_encoder.num_features + ) + self.Qformer.cls = None + self.Qformer.bert.embeddings.word_embeddings = None + self.Qformer.bert.embeddings.position_embeddings = None + for layer in self.Qformer.bert.encoder.layer: + layer.output = None + layer.intermediate = None + self.load_from_pretrained(url_or_filename=q_former_model) + print('Loading Q-Former Done') + self.llama_proj = nn.Linear( + self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size + ) + self.max_txt_len = max_txt_len + self.end_sym = end_sym + self.model = MyModel(["main", *args]) + # system promt + self.model.eval_string("Give the following image: ImageContent. " + "You will be able to see the image once I provide it to you. Please answer my questions." 
+ "###") + + def encode_img(self, image): + image = self.preprocessor(image) + image = image.unsqueeze(0) + device = image.device + if self.low_resource: + self.vit_to_cpu() + image = image.to("cpu") + + with self.maybe_autocast(): + image_embeds = self.ln_vision(self.visual_encoder(image)).to(device) + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device) + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_output = self.Qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=True, + ) + + inputs_llama = self.llama_proj(query_output.last_hidden_state) + # atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device) + return inputs_llama + + def load_projection(self, path): + state = torch.load(path)["model"] + self.llama_proj.load_state_dict({ + "weight": state["llama_proj.weight"], + "bias": state["llama_proj.bias"]}) + + def chat(self, question): + self.model.eval_string("Human: ") + self.model.eval_string(question) + self.model.eval_string("\n### Assistant:") + return self.model.generate_with_print(end="###") + + def chat_with_image(self, image, question): + with torch.no_grad(): + embd_image = self.encode_img(image) + embd_image = embd_image.cpu().numpy()[0] + self.model.eval_string("Human: ") + self.model.eval_float(embd_image.T) + self.model.eval_string(" ") + self.model.eval_string(question) + self.model.eval_string("\n### Assistant:") + return self.model.generate_with_print(end="###") + + +if __name__=="__main__": + a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"]) + a.load_projection(os.path.join( + os.path.dirname(__file__) , + "pretrained_minigpt4.pth")) + respose = a.chat_with_image( + Image.open("./media/llama1-logo.png").convert('RGB'), + "what is the text in the picture?") + a.chat("what is the color of it?") diff --git a/examples/embd-input/panda_gpt.py b/examples/embd-input/panda_gpt.py new file mode 100644 index 000000000..0cfac5f32 --- /dev/null +++ b/examples/embd-input/panda_gpt.py @@ -0,0 +1,98 @@ +import sys +import os +sys.path.insert(0, os.path.dirname(__file__)) +from embd_input import MyModel +import numpy as np +from torch import nn +import torch + +# use PandaGPT path +panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT") +imagebind_ckpt_path = "./models/panda_gpt/" + +sys.path.insert(0, os.path.join(panda_gpt_path,"code","model")) +from ImageBind.models import imagebind_model +from ImageBind import data + +ModalityType = imagebind_model.ModalityType +max_tgt_len = 400 + +class PandaGPT: + def __init__(self, args): + self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path) + self.visual_encoder.eval() + self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120) + self.max_tgt_len = max_tgt_len + self.model = MyModel(["main", *args]) + self.generated_text = "" + self.device = "cpu" + + def load_projection(self, path): + state = torch.load(path, map_location="cpu") + self.llama_proj.load_state_dict({ + "weight": state["llama_proj.weight"], + "bias": state["llama_proj.bias"]}) + + def eval_inputs(self, inputs): + self.model.eval_string("") + embds = self.extract_multimoal_feature(inputs) + for i in embds: + self.model.eval_float(i.T) + self.model.eval_string(" ") + + def chat(self, question): + return self.chat_with_image(None, question) + + def chat_with_image(self, inputs, question): + if 
self.generated_text == "": + self.model.eval_string("###") + self.model.eval_string(" Human: ") + if inputs: + self.eval_inputs(inputs) + self.model.eval_string(question) + self.model.eval_string("\n### Assistant:") + ret = self.model.generate_with_print(end="###") + self.generated_text += ret + return ret + + def extract_multimoal_feature(self, inputs): + features = [] + for key in ["image", "audio", "video", "thermal"]: + if key + "_paths" in inputs: + embeds = self.encode_data(key, inputs[key+"_paths"]) + features.append(embeds) + return features + + def encode_data(self, data_type, data_paths): + + type_map = { + "image": ModalityType.VISION, + "audio": ModalityType.AUDIO, + "video": ModalityType.VISION, + "thermal": ModalityType.THERMAL, + } + load_map = { + "image": data.load_and_transform_vision_data, + "audio": data.load_and_transform_audio_data, + "video": data.load_and_transform_video_data, + "thermal": data.load_and_transform_thermal_data + } + + load_function = load_map[data_type] + key = type_map[data_type] + + inputs = {key: load_function(data_paths, self.device)} + with torch.no_grad(): + embeddings = self.visual_encoder(inputs) + embeds = embeddings[key] + embeds = self.llama_proj(embeds).cpu().numpy() + return embeds + + +if __name__=="__main__": + a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"]) + a.load_projection("./models/panda_gpt/adapter_model.bin") + a.chat_with_image( + {"image_paths": ["./media/llama1-logo.png"]}, + "what is the text in the picture? 'llama' or 'lambda'?") + a.chat("what is the color of it?") diff --git a/llama.cpp b/llama.cpp index 2482bdd18..5a142aba6 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1369,22 +1369,26 @@ static bool llama_model_load( // evaluate the transformer // -// - lctx: llama context -// - tokens: new batch of tokens to process -// - n_past: the context size so far -// - n_threads: number of threads to use -// - cgraph_fname: filename of the exported computation graph +// - lctx: llama context +// - tokens: new batch of tokens to process +// - embd embeddings input +// - n_tokens number of tokens +// - n_past: the context size so far +// - n_threads: number of threads to use // static bool llama_eval_internal( - llama_context & lctx, - const llama_token * tokens, - const int n_tokens, - const int n_past, - const int n_threads, + llama_context & lctx, + const llama_token * tokens, + const float * embd, + const int n_tokens, + const int n_past, + const int n_threads, const char * cgraph_fname) { + LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); + // enforce that the first token is BOS - if (n_past == 0 && tokens[0] != llama_token_bos()) { + if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) { fprintf(stderr, "%s: first token must be BOS\n", __func__); return false; } @@ -1424,12 +1428,18 @@ static bool llama_eval_internal( ggml_cgraph gf = {}; gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 
1 : n_threads; - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_set_name(embd, "embd"); - memcpy(embd->data, tokens, N*ggml_element_size(embd)); - struct ggml_tensor * cur; - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); + struct ggml_tensor * inpL; + + if (tokens) { + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_set_name(embd, "embd"); + memcpy(embd->data, tokens, N*ggml_element_size(embd)); + inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); + } else { + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); + } const int i_gpu_start = n_layer - n_gpu_layers; (void) i_gpu_start; @@ -2654,6 +2664,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } + + // // interface implementation // @@ -3421,7 +3433,29 @@ int llama_eval( int n_tokens, int n_past, int n_threads) { - if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) { + if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) { + fprintf(stderr, "%s: failed to eval\n", __func__); + return 1; + } + + // get a more accurate load time, upon first eval + // TODO: fix this + if (!ctx->has_evaluated_once) { + ctx->t_load_us = ggml_time_us() - ctx->t_start_us; + ctx->has_evaluated_once = true; + } + + return 0; +} + + +int llama_eval_embd( + struct llama_context * ctx, + const float * embd, + int n_tokens, + int n_past, + int n_threads) { + if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } @@ -3442,7 +3476,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) { const std::vector tmp(n_batch, llama_token_bos()); - if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) { + if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/llama.h b/llama.h index 76239be25..c2f2e5331 100644 --- a/llama.h +++ b/llama.h @@ -226,6 +226,14 @@ extern "C" { int n_past, int n_threads); + // Same as llama_eval, but use float matrix input directly. 
+ LLAMA_API int llama_eval_embd( + struct llama_context * ctx, + const float * embd, + int n_tokens, + int n_past, + int n_threads); + // Export a static computation graph for context of 511 and batch size of 1 // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these // parameters here to keep things simple From 7f9753fa1263c4eded9a3de19778562f0e1093d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 28 Jun 2023 18:35:54 +0200 Subject: [PATCH 3/8] CUDA GPU acceleration for LoRAs + f16 models (#1970) --- examples/common.cpp | 7 ------ ggml-cuda.cu | 53 +++++++++++++++++++++++++++++++++++---------- ggml-cuda.h | 1 + llama.cpp | 36 +++++++++++++++++++++++++++++- 4 files changed, 78 insertions(+), 19 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index 002302734..5addd10a1 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -416,13 +416,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { exit(1); } -#ifdef GGML_USE_CUBLAS - if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) { - fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__); - exit(1); - } -#endif // GGML_USE_CUBLAS - if (escape_prompt) { process_escapes(params.prompt); } diff --git a/ggml-cuda.cu b/ggml-cuda.cu index c34e96abf..be75cb792 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -223,6 +223,15 @@ static __global__ void add_f32(const float * x, const float * y, float * dst, co dst[i] = x[i] + y[i]; } +static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = __hadd(x[i], __float2half(y[i])); +} + static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -1459,6 +1468,11 @@ static void add_f32_cuda(const float * x, const float * y, float * dst, const in add_f32<<>>(x, y, dst, k); } +static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; + add_f16_f32_f16<<>>(x, y, dst, k); +} + static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) { const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE; mul_f32<<>>(x, y, dst, kx, ky); @@ -1941,7 +1955,7 @@ inline void ggml_cuda_op_add( float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, cudaStream_t & cudaStream_main){ - GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr); GGML_ASSERT(src1_ddf_i != nullptr); GGML_ASSERT(dst_ddf_i != nullptr); @@ -1949,7 +1963,13 @@ inline void ggml_cuda_op_add( const int64_t i01_diff = i01_high - i01_low; // compute - add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main); + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { + add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main); + } else { + GGML_ASSERT(false); + } CUDA_CHECK(cudaGetLastError()); (void) src1; @@ 
-2547,8 +2567,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm } void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true); + // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op. + // Due to flatten_rows == true this does in practice not make a difference however. + // Better solution would be nice but right now that would require disproportionate changes. + GGML_ASSERT( + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && + src1->type == GGML_TYPE_F32 && + (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16)); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true); } void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -2801,7 +2827,7 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) { delete extra; } -void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) { +void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) { if (scratch && g_scratch_size == 0) { return; } @@ -2810,11 +2836,11 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) { if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) { const ggml_op src0_op = tensor->src0->op; if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) { - ggml_cuda_assign_buffers_impl(tensor->src0, scratch); + ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace); } } if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) { - ggml_cuda_assign_buffers_impl(tensor->src1, scratch); + ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace); } tensor->backend = GGML_BACKEND_GPU; @@ -2822,11 +2848,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) { memset(extra, 0, sizeof(*extra)); const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) || - tensor->op == GGML_OP_VIEW; + tensor->op == GGML_OP_VIEW || + force_inplace; const size_t size = ggml_nbytes(tensor); CUDA_CHECK(cudaSetDevice(g_main_device)); - if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) { + if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) { struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; size_t offset = 0; @@ -2865,11 +2892,15 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) { } void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, true); + ggml_cuda_assign_buffers_impl(tensor, true, false); } void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, false); + ggml_cuda_assign_buffers_impl(tensor, false, false); +} + +void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, false, true); } void ggml_cuda_set_main_device(int main_device) { diff --git a/ggml-cuda.h b/ggml-cuda.h index d32b44842..7a65a3558 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -29,6 +29,7 @@ void 
ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor); void ggml_cuda_free_data(struct ggml_tensor * tensor); void ggml_cuda_assign_buffers(struct ggml_tensor * tensor); void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor); +void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor); void ggml_cuda_set_main_device(int main_device); void ggml_cuda_set_scratch_size(size_t scratch_size); void ggml_cuda_free_scratch(void); diff --git a/llama.cpp b/llama.cpp index 5a142aba6..5f3761b0e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2976,7 +2976,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const return false; } } - ggml_tensor* lora_tensor; + ggml_tensor * lora_tensor; if (n_dims == 2) { lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]); } @@ -2984,6 +2984,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims); return 1; } + ggml_set_name(lora_tensor, "lora_tensor"); // load tensor data size_t offset = fin.tellg(); @@ -2999,6 +3000,21 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) { ggml_tensor * dest_t = model_tensors[base_name]; + + offload_func_t offload_func = llama_nop; + offload_func_t offload_func_force_inplace = llama_nop; + +#ifdef GGML_USE_CUBLAS + if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { + if (dest_t->type != GGML_TYPE_F16) { + throw std::runtime_error(format( + "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__)); + } + offload_func = ggml_cuda_assign_buffers; + offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace; + } +#endif // GGML_USE_CUBLAS + ggml_tensor * base_t; if (model_loader) { // load from base model @@ -3026,7 +3042,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const } ggml_tensor * loraA = lora_tensors[base_name + ".loraA"]; + GGML_ASSERT(loraA->type == GGML_TYPE_F32); + ggml_set_name(loraA, "loraA"); + ggml_tensor * loraB = lora_tensors[base_name + ".loraB"]; + GGML_ASSERT(loraB->type == GGML_TYPE_F32); + ggml_set_name(loraB, "loraB"); if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" @@ -3036,19 +3057,32 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const // w = w + BA*s ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); + offload_func(BA); + ggml_set_name(BA, "BA"); if (scaling != 1.0f) { ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + ggml_set_name(scale_tensor, "scale_tensor"); + BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); + offload_func(BA); + ggml_set_name(BA, "BA_scaled"); } ggml_tensor * r; if (base_t == dest_t) { r = ggml_add_inplace(lora_ctx, dest_t, BA); + offload_func_force_inplace(r); + ggml_set_name(r, "r_add_inplace"); } else { r = ggml_add(lora_ctx, base_t, BA); + offload_func(r); + ggml_set_name(r, "r_add"); + r = ggml_cpy(lora_ctx, r, dest_t); + offload_func(r); + ggml_set_name(r, "r_cpy"); } struct ggml_cgraph gf = ggml_build_forward(r); From b922bc351b69770cec2d35d2aa50fa052b95ca93 Mon Sep 17 00:00:00 2001 From: Howard Su Date: Wed, 28 Jun 2023 10:13:02 -0700 Subject: [PATCH 4/8] llama : remove shards weight file support (#2000) * Remove multiple 
shards * Remove multiple file loaders * Remove llama_load_tensor_shard class * Simplify load logic * Remove dead code guess_n_parts function * Remove vocab_only from constructor of llama_model_loader * Remove alignment_prevents_mmap which is not more needed. * Remove useless check --- llama.cpp | 233 ++++++++---------------------------------------------- 1 file changed, 35 insertions(+), 198 deletions(-) diff --git a/llama.cpp b/llama.cpp index 5f3761b0e..47e11d03c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -364,96 +364,14 @@ static size_t llama_calc_tensor_size(const std::vector & ne, enum ggml return size / ggml_blck_size(type); } -struct llama_load_tensor_shard { - std::vector ne; - size_t size; - enum ggml_type type; - size_t file_idx; - size_t file_off; - - void calc_size() { - size = llama_calc_tensor_size(ne, type); - } -}; - -enum llama_split_type { - SPLIT_NONE, - SPLIT_BY_COLUMNS, - SPLIT_BY_ROWS -}; - struct llama_load_tensor { - std::vector shards; - std::string name; enum ggml_type type = GGML_TYPE_F32; - llama_split_type split_type = SPLIT_NONE; std::vector ne; + size_t file_off; size_t size; struct ggml_tensor * ggml_tensor = NULL; uint8_t * data; - - llama_load_tensor(const std::string & name) : name(name) {} - - void calc_all() { - calc_type(); - calc_split_type(); - calc_ne(); - calc_size(); - } - - void calc_type() { - const auto & first_shard = shards.at(0); - for (const auto & shard : shards) { - if (shard.type != first_shard.type) { - throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str())); - } - } - type = first_shard.type; - } - - void calc_split_type() { - if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file - shards.size() == 1) { // only one file? - split_type = SPLIT_NONE; - } else if (name.find("tok_embeddings.") == 0 || - name.find(".attention.wo.weight") != std::string::npos || - name.find(".feed_forward.w2.weight") != std::string::npos) { - split_type = SPLIT_BY_COLUMNS; - } else { - split_type = SPLIT_BY_ROWS; - } - } - - void calc_ne() { - const auto & first_shard = shards.at(0); - for (const auto & shard : shards) { - if (shard.ne != first_shard.ne) { - throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s", - name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str())); - } - } - ne = first_shard.ne; - LLAMA_ASSERT(shards.size() <= UINT32_MAX); - uint32_t n_shards = (uint32_t) shards.size(); - switch (split_type) { - case SPLIT_NONE: - ne = first_shard.ne; - break; - case SPLIT_BY_COLUMNS: - ne = {checked_mul(first_shard.ne[0], n_shards), - first_shard.ne[1]}; - break; - case SPLIT_BY_ROWS: - ne = {first_shard.ne[0], - checked_mul(first_shard.ne[1], n_shards)}; - break; - } - } - - void calc_size() { - size = llama_calc_tensor_size(ne, type); - } }; struct llama_load_tensors_map { @@ -476,13 +394,13 @@ struct llama_file_loader { llama_hparams hparams; llama_vocab vocab; - llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map) + llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map) : file(fname, "rb") { fprintf(stderr, "llama.cpp: loading model from %s\n", fname); read_magic(); read_hparams(); read_vocab(); - read_tensor_metadata(file_idx, tensors_map); + read_tensor_metadata(tensors_map); } void read_magic() { uint32_t magic = file.read_u32(); @@ -539,19 +457,19 @@ struct llama_file_loader { tok_score.score = score; } } - void 
read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) { + void read_tensor_metadata(llama_load_tensors_map & tensors_map) { while (file.tell() < file.size) { - llama_load_tensor_shard shard; + llama_load_tensor tensor; uint32_t n_dims = file.read_u32(); uint32_t name_len = file.read_u32(); - shard.type = (enum ggml_type) file.read_u32(); - shard.ne.resize(n_dims); - file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims); + tensor.type = (enum ggml_type) file.read_u32(); + tensor.ne.resize(n_dims); + file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims); std::string name = file.read_string(name_len); if (n_dims < 1 || n_dims > 2) { throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims)); } - switch (shard.type) { + switch (tensor.type) { case GGML_TYPE_F32: case GGML_TYPE_F16: case GGML_TYPE_Q4_0: @@ -566,30 +484,20 @@ struct llama_file_loader { case GGML_TYPE_Q6_K: break; default: { - throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type)); + throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type)); } } - if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) { - // skip to the next multiple of 32 bytes - file.seek(-static_cast(file.tell()) & 31, SEEK_CUR); - } - shard.file_idx = file_idx; - shard.file_off = file.tell(); + // skip to the next multiple of 32 bytes + file.seek(-static_cast(file.tell()) & 31, SEEK_CUR); - shard.calc_size(); - file.seek(shard.size, SEEK_CUR); + tensor.file_off = file.tell(); + tensor.name = name; + tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type); + file.seek(tensor.size, SEEK_CUR); - auto it = tensors_map.name_to_idx.find(name); - size_t idx; - if (it != tensors_map.name_to_idx.end()) { - idx = it->second; - } else { - tensors_map.tensors.emplace_back(name); - idx = tensors_map.tensors.size() - 1; - tensors_map.name_to_idx.emplace(name, idx); - } - tensors_map.tensors.at(idx).shards.push_back(shard); + tensors_map.tensors.push_back(tensor); + tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1; } } }; @@ -659,56 +567,19 @@ struct llama_file_saver { }; struct llama_model_loader { - std::vector> file_loaders; + std::unique_ptr file_loader; llama_load_tensors_map tensors_map; bool use_mmap; size_t num_ggml_tensors_created = 0; struct ggml_context * ggml_ctx = NULL; std::unique_ptr mapping; - llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) { - auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map); - file_loaders.emplace_back(first_file); - uint32_t n_parts = vocab_only ? 1 : guess_n_parts(); - for (uint32_t i = 1; i < n_parts; i++) { - std::string fname = fname_base + "." 
+ std::to_string(i); - auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map); - file_loaders.emplace_back(ith_file); - if (ith_file->hparams != first_file->hparams) { - throw std::runtime_error(format("llama.cpp: hparams inconsistent between files")); - } - } + llama_model_loader(const std::string & fname_base, bool use_mmap) { + file_loader = std::unique_ptr(new llama_file_loader(fname_base.c_str(), tensors_map)); if (!llama_mmap::SUPPORTED) { use_mmap = false; } - if (use_mmap && alignment_prevents_mmap()) { - fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n"); - use_mmap = false; - } this->use_mmap = use_mmap; - for (llama_load_tensor & lt : tensors_map.tensors) { - lt.calc_all(); - } - } - - bool alignment_prevents_mmap() { - for (const llama_load_tensor & lt : tensors_map.tensors) { - for (const llama_load_tensor_shard & shard : lt.shards) { - if (shard.file_off & 3) { - return true; - } - } - } - return false; - } - - uint32_t guess_n_parts() const { - auto it = tensors_map.name_to_idx.find("tok_embeddings.weight"); - if (it == tensors_map.name_to_idx.end()) { - throw std::runtime_error(std::string("missing tok_embeddings.weight")); - } - const llama_load_tensor & lt = tensors_map.tensors.at(it->second); - return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0); } void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const { @@ -774,7 +645,7 @@ struct llama_model_loader { } if (use_mmap) { - mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa())); + mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa())); if (lmlock) { lmlock->init(mapping->addr); } @@ -830,45 +701,13 @@ struct llama_model_loader { void load_data_for(llama_load_tensor & lt) { if (use_mmap) { - LLAMA_ASSERT(lt.shards.size() == 1); - lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off; - } else if (lt.split_type == SPLIT_NONE) { - llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file; - file.seek(lt.shards.at(0).file_off, SEEK_SET); + lt.data = (uint8_t *) mapping->addr + lt.file_off; + } else { + llama_file & file = file_loader->file; + file.seek(lt.file_off, SEEK_SET); file.read_raw(lt.data, lt.size); - } else if (lt.split_type == SPLIT_BY_ROWS) { - size_t offset = 0; - for (llama_load_tensor_shard & shard : lt.shards) { - llama_file & file = file_loaders.at(shard.file_idx)->file; - file.seek(shard.file_off, SEEK_SET); - file.read_raw(lt.data + offset, shard.size); - offset += shard.size; - } - LLAMA_ASSERT(offset == lt.size); - } else if (lt.split_type == SPLIT_BY_COLUMNS) { - // Let's load the data into temporary buffers to ensure the OS performs large loads. - std::vector tmp_bufs(lt.shards.size()); - for (size_t i = 0; i < lt.shards.size(); i++) { - llama_load_tensor_shard & shard = lt.shards.at(i); - llama_file & file = file_loaders.at(shard.file_idx)->file; - file.seek(shard.file_off, SEEK_SET); - tmp_bufs.at(i).resize(shard.size); - file.read_raw(tmp_bufs.at(i).addr, shard.size); - } - // Then reshape. 
- size_t num_rows = lt.ne.at(1); - size_t per_shard_row_size = lt.shards.at(0).size / num_rows; - size_t out_offset = 0; - for (size_t row = 0; row < num_rows; row++) { - for (llama_buffer & tmp_buf : tmp_bufs) { - memcpy(lt.data + out_offset, - tmp_buf.addr + row * per_shard_row_size, - per_shard_row_size); - out_offset += per_shard_row_size; - } - } - LLAMA_ASSERT(out_offset == lt.size); } + if (0) { print_checksum(lt); } @@ -1067,12 +906,12 @@ static void llama_model_load_internal( model.t_start_us = ggml_time_us(); - std::unique_ptr ml(new llama_model_loader(fname, use_mmap, vocab_only)); + std::unique_ptr ml(new llama_model_loader(fname, use_mmap)); - vocab = std::move(ml->file_loaders.at(0)->vocab); - model.hparams = ml->file_loaders.at(0)->hparams; + vocab = std::move(ml->file_loader->vocab); + model.hparams = ml->file_loader->hparams; model.n_gpu_layers = n_gpu_layers; - llama_file_version file_version = ml->file_loaders.at(0)->file_version; + llama_file_version file_version = ml->file_loader->file_version; auto & hparams = model.hparams; { @@ -1106,7 +945,6 @@ static void llama_model_load_internal( fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); - fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size()); fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } @@ -2461,9 +2299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s nthread = std::thread::hardware_concurrency(); } - std::unique_ptr model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false, - /*vocab_only*/ false)); - llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype); + std::unique_ptr model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false)); + llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype); #ifdef GGML_USE_K_QUANTS int n_attention_wv = 0; @@ -2897,7 +2734,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const llama_buffer base_buf; if (path_base_model) { fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model); - model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false)); + model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true)); size_t ctx_size; size_t mmapped_size; @@ -2915,7 +2752,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const // maybe this should in llama_model_loader if (model_loader->use_mmap) { - model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa())); + model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa())); } } From 6432aabb6dc887436e4d57414b63116189c3b13b Mon Sep 17 00:00:00 2001 From: "Salvador E. 
Tropea" Date: Wed, 28 Jun 2023 14:26:26 -0300 Subject: [PATCH 5/8] cuda : fix missing const qualifier in casts (#2027) --- ggml-cuda.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index be75cb792..5f05d9181 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1244,7 +1244,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, } static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) { - const half * x = (half *) vx; + const half * x = (const half *) vx; const int row_x = blockDim.y*blockIdx.y + threadIdx.y; const int channel = blockDim.z*blockIdx.z + threadIdx.z; @@ -1294,7 +1294,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x, const int nchannels_x, const int channel_stride_x) { - const half * x = (half *) vx; + const half * x = (const half *) vx; const int row_x = blockDim.y*blockIdx.y + threadIdx.y; const int channel = blockDim.z*blockIdx.z + threadIdx.z; @@ -1337,14 +1337,14 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous } static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) { - const float * xi = (float *) cxi; + const float * xi = (const float *) cxi; float * dsti = (float *) cdsti; *dsti = *xi; } static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) { - const float * xi = (float *) cxi; + const float * xi = (const float *) cxi; half * dsti = (half *) cdsti; *dsti = __float2half(*xi); From 5b351e94d041742cd50ffcf2d44718d63bab398a Mon Sep 17 00:00:00 2001 From: "Salvador E. Tropea" Date: Wed, 28 Jun 2023 14:27:31 -0300 Subject: [PATCH 6/8] cuda : remove nchannels_x argument from mul_mat_vec_nc_f16_f32 (#2028) - Not used --- ggml-cuda.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 5f05d9181..4e0d3dbde 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1292,7 +1292,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, - const int row_stride_x, const int nchannels_x, const int channel_stride_x) { + const int row_stride_x, const int channel_stride_x) { const half * x = (const half *) vx; @@ -1698,7 +1698,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda( const dim3 block_nums(1, nrows_x, nchannels_x); const dim3 block_dims(WARP_SIZE, 1, 1); mul_mat_vec_nc_f16_f32<<>> - (vx, y, dst, ncols_x, nrows_x, row_stride_x, nchannels_x, channel_stride_x); + (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x); } static void ggml_cpy_f32_f32_cuda( From d3494bb86bf7ad5b0b60aae0220ea576f273b5c0 Mon Sep 17 00:00:00 2001 From: m3ndax Date: Wed, 28 Jun 2023 20:39:08 +0200 Subject: [PATCH 7/8] llama : replacing auto &kv with const auto &kv (#2041) * Replacing auto &kv with const auto &kv * Create codacy.yml * Delete codacy.yml --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 47e11d03c..ef80b4e8b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2723,7 +2723,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const // create a name -> tensor map of the model to accelerate lookups std::unordered_map 
model_tensors; - for (auto & kv: model.tensors_by_name) { + for (const auto & kv: model.tensors_by_name) { model_tensors.insert(kv); } From 96a712ca1b7f427e3bd7ffc0c70b2105cfc7fbf1 Mon Sep 17 00:00:00 2001 From: LostRuins <39025047+LostRuins@users.noreply.github.com> Date: Thu, 29 Jun 2023 11:56:43 +0800 Subject: [PATCH 8/8] Porting the improved K-Quant CUDA kernels to OpenCL (#1966) * Added broken new q4k quant * xx + ib0 * Fix q2_k fast kernel * Use preprocessor for QK_K * Add q6_k fast matmul kernel * ported q3k speedup successfully * ported q2k and q5k speedups * remove old dot kernels and template * fixed global const struct types * fixing address spaces * fixed string too long CI issue --------- Co-authored-by: 0cc4m --- ggml-opencl.cpp | 545 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 361 insertions(+), 184 deletions(-) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 95f4cec6d..fed4ffb0c 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -21,11 +21,19 @@ #define CL_DMMV_BLOCK_SIZE 32 +#ifndef K_QUANTS_PER_ITERATION +#define K_QUANTS_PER_ITERATION 1 +#else +static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); +#endif + #define MULTILINE_QUOTE(...) #__VA_ARGS__ static std::string program_source = MULTILINE_QUOTE( typedef char int8_t; typedef uchar uint8_t; +typedef short int16_t; +typedef ushort uint16_t; typedef int int32_t; typedef uint uint32_t; @@ -175,7 +183,9 @@ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float *v0 = vload_half(0, &x[ib + 0]); *v1 = vload_half(0, &x[ib + 1]); } +); +static std::string k_quants_source = MULTILINE_QUOTE( inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m) { if (j < 4) @@ -199,7 +209,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa const int is = 8 * n + l / 16; const uint8_t q = x[i].qs[32 * n + l]; - __global float *y = yy + i * 256 + 128 * n; + __global float *y = yy + i * QK_K + 128 * n; const float dall = vload_half(0, &x[i].d); const float dmin = vload_half(0, &x[i].dmin); @@ -231,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa float d_all = vload_half(0, &x[i].d); float dl = d_all * (us - 32); - __global float *y = yy + i * 256 + 128 * n + 32 * j; + __global float *y = yy + i * QK_K + 128 * n + 32 * j; const __global uint8_t *q = x[i].qs + 32 * n; const __global uint8_t *hm = x[i].hmask; @@ -248,7 +258,7 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa const int is = 2 * il; const int n = 4; - __global float *y = yy + i * 256 + 64 * il + n * ir; + __global float *y = yy + i * QK_K + 64 * il + n * ir; const float dall = vload_half(0, &x[i].d); const float dmin = vload_half(0, &x[i].dmin); @@ -277,7 +287,7 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa const int ir = tid % 16; const int is = 2 * il; - __global float *y = yy + i * 256 + 64 * il + 2 * ir; + __global float *y = yy + i * QK_K + 64 * il + 2 * ir; const float dall = vload_half(0, &x[i].d); const float dmin = vload_half(0, &x[i].dmin); @@ -309,7 +319,7 @@ __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __globa const int il = tid - 32 * ip; const int is = 8 * ip + il / 16; - __global float *y = yy + i * 256 + 128 * ip + il; + __global float *y = yy + i * QK_K + 128 * ip + il; const float d = vload_half(0, &x[i].d); @@ -323,161 +333,383 @@ __kernel void 
dequantize_block_q6_K(__global const struct block_q6_K *x, __globa y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); } +__kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { -void vec_dot_q2_K(__global const struct block_q2_K* x, const int ib, const int iqs, const __global float *yy, float *result) { + const int row = get_group_id(0); - int n = iqs / 128; - int r = iqs - 128 * n; - int l = r / 8; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; - __global const float *y = yy + 128 * n + l; - __global const uint8_t *q = x[ib].qs + 32 * n + l; - __global const uint8_t *s = x[ib].scales + 8 * n; + __global const struct block_q2_K * x = xx + ib0; - const float dall = vload_half(0, &x[ib].d); - const float dmin = vload_half(0, &x[ib].dmin); + const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 + const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1 - float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4)) - + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4)) - + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4)) - + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4)) - + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4)) - + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4)) - + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4)) - + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4)); + const int step = 16/K_QUANTS_PER_ITERATION; - *result = sum; -} + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
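+    // work layout: each work-group accumulates one row of dst. im selects the low or
+    // high 128-value half of the current 256-value super-block, l0 = K_QUANTS_PER_ITERATION*in
+    // is this thread's starting offset inside that half, and ix staggers the lanes across
+    // consecutive super-blocks of the row; the per-thread partial sums gathered in the
+    // __local tmp[] buffer are tree-reduced at the end of the kernel and tmp[0] is
+    // written to dst[row]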
+ const int in = tid - step*im; // 0...15 or 0...7 -void vec_dot_q3_K(__global const struct block_q3_K* x, const int ib, const int iqs, const __global float *yy, float *result) { + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int s_offset = 8*im; + const int y_offset = 128*im + l0; - const uint32_t kmask1 = 0x03030303; - const uint32_t kmask2 = 0x0f0f0f0f; + tmp[16 * ix + tid] = 0; - uint32_t aux[3]; - uint32_t utmp[4]; + uint32_t aux[4]; + const uint8_t * d = (const uint8_t *)aux; + const uint8_t * m = (const uint8_t *)(aux + 2); - int n = iqs/128; - int r = iqs - 128*n; - int l = r/8; + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - __global const float * y = yy + 128*n + l; - __global const uint8_t * q = x[ib].qs + 32*n + l; - __global const uint8_t * hm = x[ib].hmask + l; - const int8_t * s = (const int8_t *)utmp + 8*n; + __global const float * y = yy + i * QK_K + y_offset; + __global const uint8_t * q = x[i].qs + q_offset; - aux[0] = x[ib].scales[0] | x[ib].scales[1] << 8 | x[ib].scales[2] << 16 | x[ib].scales[3] << 24; - aux[1] = x[ib].scales[4] | x[ib].scales[5] << 8 | x[ib].scales[6] << 16 | x[ib].scales[7] << 24; - aux[2] = x[ib].scales[8] | x[ib].scales[9] << 8 | x[ib].scales[10] << 16 | x[ib].scales[11] << 24; + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); - utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); - utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); - utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); - utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); + __global const uint32_t * a = (__global const uint32_t *)(x[i].scales + s_offset); + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = a[1] & 0x0f0f0f0f; + aux[2] = (a[0] >> 4) & 0x0f0f0f0f; + aux[3] = (a[1] >> 4) & 0x0f0f0f0f; - const float dall = vload_half(0, &x[ib].d); - const uint8_t m = 1 << (4*n); + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) + + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) + + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) + + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) + + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) + +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); + sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] + + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; - float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4)) - + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4)) - + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4)) - + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4)) - + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4)) - + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4)) - + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4)) - + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 
0 : 4)); + } + tmp[16 * ix + tid] += dall * sum1 - dmin * sum2; - *result = sum * dall; - -} - -void vec_dot_q4_K(__global const struct block_q4_K* x, const int ib, const int iqs, const __global float *yy, float *result) { - - const int j = iqs / 64; // j is in 0...3 - const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4 - const int is = 2*j; // is is in 0...6 in steps of 2 - - __global const float * y = yy + 64*j + ir; - __global const uint8_t * q = x[ib].qs + 32*j + ir; - - const float dall = vload_half(0, &x[ib].d); - const float dmin = vload_half(0, &x[ib].dmin); - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[ib].scales, &sc, &m); - const float d1 = dall * sc; - const float m1 = dmin * m; - get_scale_min_k4(is + 1, x[ib].scales, &sc, &m); - const float d2 = dall * sc; - const float m2 = dmin * m; - - float sum = 0; - for (int k = 0; k < 4; ++k) { - sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1); - sum += y[k + 32] * (d2 * (q[k] >> 4) - m2); } - *result = sum; + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } } -void vec_dot_q5_K(__global const struct block_q5_K* x, const int ib, const int iqs, const __global float *yy, float *result) { +__kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; - const int j = iqs / 64; - const int ir = (iqs - 64*j)/2; - const int is = 2*j; + const int row = get_group_id(0); - __global const float * y = yy + 64*j + ir; - __global const uint8_t * ql = x[ib].qs + 32*j + ir; - __global const uint8_t * qh = x[ib].qh + ir; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; - const float dall = vload_half(0, &x[ib].d); - const float dmin = vload_half(0, &x[ib].dmin); + __global const struct block_q3_K * x = xx + ib0; - uint8_t sc, m; - get_scale_min_k4(is + 0, x[ib].scales, &sc, &m); - const float d1 = dall * sc; - const float m1 = dmin * m; - get_scale_min_k4(is + 1, x[ib].scales, &sc, &m); - const float d2 = dall * sc; - const float m2 = dmin * m; + const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop + const int step = 16/K_QUANTS_PER_ITERATION; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
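+    // same work split as dequantize_mul_mat_vec_q2_K above; for q3_K the eight 6-bit
+    // scales of the selected 128-value half are unpacked through the uint16 view `a`
+    // into utmp (read back as s[0..7], biased by 32), and the hmask bits selected via m
+    // supply the implicit high bit: 4 is subtracted from the 2-bit low part of each
+    // quant when the corresponding hmask bit is clear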
+ const int in = tid - step*im; // 0....15 or 0...7 + + const uint8_t m = 1 << (4*im); + + const int l0 = n*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + + uint16_t utmp[4]; + const int8_t * s = (const int8_t *)utmp; + + const uint16_t s_shift = 4*im; + + tmp[16 * ix + tid] = 0; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + __global const float * y = yy + i * QK_K + y_offset; + __global const uint8_t * q = x[i].qs + q_offset; + __global const uint8_t * h = x[i].hmask + l0; + + __global const uint16_t * a = (__global const uint16_t *)x[i].scales; + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + const float d = vload_half(0, &x[i].d); + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp[16 * ix + tid] += d * sum; - uint8_t hm = 1 << is; - float sum = 0; - for (int k = 0; k < 4; ++k) { - sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1); } - hm <<= 1; - for (int k = 0; k < 4; ++k) { - sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 
16 : 0)) - m2); - } - *result = sum; + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } } -void vec_dot_q6_K(__global const struct block_q6_K* x, const int ib, const int iqs, const __global float *yy, float *result) { +__kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { + //to rename it later, just to test now + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; - const int ip = iqs / 128; // 0 or 1 - const int il = (iqs - 128*ip)/8; // 0...15 - const int is = 8*ip; + const int row = get_group_id(0); + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; - __global const float * y = yy + 128*ip + il; + const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15 + const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; - const float d = vload_half(0, &x[ib].d); + const int step = 8/K_QUANTS_PER_ITERATION; - __global const uint8_t * ql = x[ib].ql + 64*ip + il; - __global const uint8_t * qh = x[ib].qh + 32*ip + il; - __global const int8_t * sc = x[ib].scales + is; + const int il = tid/step; // 0...3 + const int ir = tid - step*il;// 0...3 + const int n = 2*K_QUANTS_PER_ITERATION; - *result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32) - + y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32) - + y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32) - + y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32) - + y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32) - + y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32) - + y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32) - + y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32); + const int im = il/2; // 0 or 1. 
0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + __global const struct block_q4_K * x = xx + ib0; + + tmp[16 * ix + tid] = 0; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + __global const uint8_t * q1 = x[i].qs + q_offset; + __global const uint8_t * q2 = q1 + 64; + __global const float * y1 = yy + i*QK_K + y_offset; + __global const float * y2 = y1 + 128; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint16_t * a = (__global const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + float4 s = (float4)(0.f); + float smin = 0; + for (int l = 0; l < n; ++l) { + s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4); + s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4); + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp[16 * ix + tid] += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin; + + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +__kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { + + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int row = get_group_id(0); + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const int tid = get_local_id(0)/2; // 0...15 + const int ix = get_local_id(0)%2; + + const int il = tid/4; // 0...3 + const int ir = tid - 4*il;// 0...3 + const int n = 2; + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1 << (2*im); + const uint8_t hm2 = hm1 << 4; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + __global const struct block_q5_K * x = xx + ib0; + + tmp[16 * ix + tid] = 0; + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + __global const uint8_t * ql1 = x[i].qs + q_offset; + __global const uint8_t * ql2 = ql1 + 64; + __global const uint8_t * qh = x[i].qh + l0; + __global const float * y1 = yy + i*QK_K + y_offset; + __global const float * y2 = y1 + 128; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint16_t * a = (__global const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + float4 sum = (float4)(0.f); + float smin = 0; + for (int l = 0; l < n; ++l) { + sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) + + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0)); + sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 
16 : 0)) + + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0)); + sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0)) + + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0)); + sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0)) + + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0)); + smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; + } + tmp[16 * ix + tid] += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; + + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +__kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) { + + const int row = get_group_id(0); + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + __global const struct block_q6_K * x = xx + ib0; + + const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + + const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...15 or 0...7 + +#if K_QUANTS_PER_ITERATION == 1 + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 + const int is = 0; +#else + const int l0 = 4 * in; // 0, 4, 8, ..., 28 + const int is = in / 4; +#endif + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + tmp[16 * ix + tid] = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + __global const float * y = yy + i * QK_K + y_offset; + __global const uint8_t * ql = x[i].ql + ql_offset; + __global const uint8_t * qh = x[i].qh + qh_offset; + __global const int8_t * s = x[i].scales + s_offset; + + const float d = vload_half(0, &x[i].d); + +#if K_QUANTS_PER_ITERATION == 1 + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp[16 * ix + tid] += sum; +#else + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp[16 * ix + tid] += sum; +#endif + + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int 
s=16; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } } ); @@ -549,44 +781,6 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float } ); -std::string dequant_mul_mat_vec_k_template = MULTILINE_QUOTE( -__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { - const int block_size = get_local_size(0); - const int row = get_group_id(0); - const int tid = get_local_id(0); - - const int iter_stride = 256; - const int vals_per_iter = iter_stride / block_size; - const int num_blocks_per_row = ncols / 256; - const int ib0 = row*num_blocks_per_row; - - tmp[tid] = 0; - - for (int i = 0; i < ncols; i += iter_stride) { - const int col = i + vals_per_iter*tid; - const int ib = ib0 + col/256; // x block index - const int iqs = col%256; // x quant index - const int iybs = col - col%256; // y block start index - - // dequantize - float v; - DOT_KERNEL(x, ib, iqs, y + iybs, &v); - tmp[tid] += v; - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=block_size/2; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} -); std::string mul_template = MULTILINE_QUOTE( __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) { @@ -649,18 +843,6 @@ std::array mul_str_values = { "mul_f32", "float" }; -std::array dmmv_k_str_keys = { - "KERNEL_NAME", "X_TYPE", "DOT_KERNEL" -}; - -std::array dmmv_k_str_values = { - "dequantize_mul_mat_vec_q2_K", "struct block_q2_K", "vec_dot_q2_K", - "dequantize_mul_mat_vec_q3_K", "struct block_q3_K", "vec_dot_q3_K", - "dequantize_mul_mat_vec_q4_K", "struct block_q4_K", "vec_dot_q4_K", - "dequantize_mul_mat_vec_q5_K", "struct block_q5_K", "vec_dot_q5_K", - "dequantize_mul_mat_vec_q6_K", "struct block_q6_K", "vec_dot_q6_K", -}; - std::string& replace(std::string& s, const std::string& from, const std::string& to) { size_t pos = 0; while ((pos = s.find(from, pos)) != std::string::npos) { @@ -673,6 +855,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string& std::string generate_kernels() { std::stringstream src; src << program_source << '\n'; + src << k_quants_source << '\n'; for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) { std::string dequant_kernel = dequant_template; std::string dmmv_kernel = dequant_mul_mat_vec_template; @@ -690,13 +873,6 @@ std::string generate_kernels() { } src << mul_kernel << '\n'; } - for (size_t i = 0; i < dmmv_k_str_values.size(); i += dmmv_k_str_keys.size()) { - std::string dmmv_k_kernel = dequant_mul_mat_vec_k_template; - for (size_t j = 0; j < dmmv_k_str_keys.size(); j++) { - replace(dmmv_k_kernel, dmmv_k_str_keys[j], dmmv_k_str_values[i + j]); - } - src << dmmv_k_kernel << '\n'; - } return src.str(); } @@ -729,10 +905,11 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co exit(1); } - const char* compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math " - "-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1"; + std::string compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math " + "-DQK4_0=32 
-DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1 " + "-DQK_K=256 -DK_QUANTS_PER_ITERATION=" + std::to_string(K_QUANTS_PER_ITERATION); - err = clBuildProgram(p, 0, NULL, compile_opts, NULL, NULL); + err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL); if(err < 0) { clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);