Merge branch 'master' into xsn/lora_server_hotswap

commit 21cb13384c

68 changed files with 1619 additions and 508 deletions

@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev curl
+    apt-get install -y build-essential git libcurl4-openssl-dev

WORKDIR /app

@@ -16,7 +16,7 @@ RUN make -j$(nproc) llama-server
FROM ubuntu:$UBUNTU_VERSION AS runtime

RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl

COPY --from=build /app/llama-server /llama-server

@@ -95,8 +95,16 @@ Typically finetunes of the base models below are supported as well.
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo)
- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
- [x] [Smaug](https://huggingface.co/models?search=Smaug)
- [x] [Poro 34B](https://huggingface.co/LumiOpen/Poro-34B)
- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)

(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

@@ -145,6 +153,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [Faraday](https://faraday.dev/) (proprietary)
- [LMStudio](https://lmstudio.ai/) (proprietary)
- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
+- [ramalama](https://github.com/containers/ramalama) (MIT)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)

@@ -2052,8 +2052,8 @@ std::string fs_get_cache_file(const std::string & filename) {
//
// Model utils
//

-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+    llama_init_result iparams;
    auto mparams = llama_model_params_from_gpt_params(params);

    llama_model * model = nullptr;

@@ -2068,7 +2068,7 @@ llama_init_from_gpt_params
    if (model == NULL) {
        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
    }

    auto cparams = llama_context_params_from_gpt_params(params);

@@ -2077,7 +2077,7 @@ llama_init_from_gpt_params
    if (lctx == NULL) {
        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
        llama_free_model(model);
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
    }

    if (!params.control_vectors.empty()) {

@@ -2088,7 +2088,7 @@ llama_init_from_gpt_params
        if (cvec.n_embd == -1) {
            llama_free(lctx);
            llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
        }

        int err = llama_control_vector_apply(lctx,

@@ -2100,7 +2100,7 @@ llama_init_from_gpt_params
        if (err) {
            llama_free(lctx);
            llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
        }
    }

@@ -2111,7 +2111,7 @@ llama_init_from_gpt_params
            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
            llama_free(lctx);
            llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
        }
    }
    if (!params.lora_init_without_apply) {

@@ -2149,7 +2149,9 @@ llama_init_from_gpt_params
        llama_reset_timings(lctx);
    }

-    return std::make_tuple(model, lctx);
+    iparams.model   = model;
+    iparams.context = lctx;
+    return iparams;
}

void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {

@@ -314,8 +314,12 @@ std::string fs_get_cache_file(const std::string & filename);
// Model utils
//

-// TODO: avoid tuplue, use struct
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+struct llama_init_result {
+    struct llama_model   * model   = nullptr;
+    struct llama_context * context = nullptr;
+};
+
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params);

struct llama_model_params   llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

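For callers migrating to the new return type, a minimal sketch of before/after usage, assuming only a populated gpt_params (both pointers stay nullptr on failure, thanks to the struct's default member initializers):

    // before: tuple-based API
    //   llama_model * model;
    //   llama_context * ctx;
    //   std::tie(model, ctx) = llama_init_from_gpt_params(params);

    // after: struct-based API
    llama_init_result init = llama_init_from_gpt_params(params);
    llama_model   * model = init.model;
    llama_context * ctx   = init.context;
    if (model == nullptr || ctx == nullptr) {
        fprintf(stderr, "failed to init\n");
        return 1;
    }
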
@@ -414,9 +414,10 @@ int main(int argc, char ** argv) {
    llama_numa_init(params.numa);

    // load the model to get hparams
-    llama_model * model;
-    llama_context * ctx;
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;

    // int n_ctx = llama_n_ctx(ctx);
    int n_layers = llama_n_layer(model);

@@ -79,11 +79,11 @@ int main(int argc, char ** argv) {
    llama_backend_init();
    llama_numa_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
-
    // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;

@@ -163,9 +163,10 @@ int main(int argc, char ** argv) {
    params.warmup = false;

    // init
-    llama_model * model;
-    llama_context * ctx;
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
        fprintf(stderr, "%s : failed to init\n", __func__);
        return 1;

@@ -611,10 +611,10 @@ int main(int argc, char ** argv) {
    params.warmup = false;

    // init
-    llama_model * model;
-    llama_context * ctx;
-
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
        fprintf(stderr, "%s : failed to init\n", __func__);
        return 1;

@@ -179,7 +179,10 @@ int main(int argc, char ** argv) {

    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    model = llama_init.model;
+    ctx = llama_init.context;

    if (model == NULL) {
        LOG_TEE("%s: error: unable to load model\n", __func__);

@@ -58,11 +58,11 @@ int main(int argc, char ** argv) {
    llama_backend_init();
    llama_numa_init(params.numa);

-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
    // load the target model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;

    // Tokenize the prompt
    std::vector<llama_token> inp;

@@ -22,11 +22,11 @@ int main(int argc, char ** argv){
    llama_backend_init();
    llama_numa_init(params.numa);

-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
    // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
    GGML_ASSERT(model != nullptr);

    // tokenize the prompt

@@ -26,11 +26,11 @@ int main(int argc, char ** argv){
    llama_backend_init();
    llama_numa_init(params.numa);

-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
    // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;

    // tokenize the prompt
    std::vector<llama_token> inp;

@@ -34,11 +34,11 @@ int main(int argc, char ** argv){
    llama_backend_init();
    llama_numa_init(params.numa);

-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
    // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;

    // tokenize the prompt
    std::vector<llama_token> inp;

@@ -207,7 +207,10 @@ int main(int argc, char ** argv) {

    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    model = llama_init.model;
+    ctx = llama_init.context;
    if (sparams.cfg_scale > 1.f) {
        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
        ctx_guidance = llama_new_context_with_model(model, lparams);

@@ -129,11 +129,11 @@ int main(int argc, char ** argv) {
    llama_backend_init();
    llama_numa_init(params.numa);

-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
    // load the target model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;

    // load the prompts from an external file if there are any
    if (params.prompt.empty()) {

@@ -2018,11 +2018,11 @@ int main(int argc, char ** argv) {
    llama_backend_init();
    llama_numa_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
-
    // load the model and apply lora adapter, if any
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;

@@ -148,11 +148,12 @@ int main(int argc, char ** argv) {
    llama_backend_init();
    llama_numa_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
-
    // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;

    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;

@@ -28,10 +28,11 @@ int main(int argc, char ** argv) {
    std::string result2;

    // init
-    llama_model * model;
-    llama_context * ctx;
-
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
+
    if (model == nullptr || ctx == nullptr) {
        fprintf(stderr, "%s : failed to init\n", __func__);
        return 1;

@@ -678,7 +678,10 @@ struct server_context {
        // dedicate one sequence to the system prompt
        params.n_parallel += 1;

-        std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+        model = llama_init.model;
+        ctx = llama_init.context;
        params.n_parallel -= 1; // but be sneaky about it
        if (model == nullptr) {
            LOG_ERROR("unable to load model", {{"model", params.model}});

@@ -901,7 +904,7 @@ struct server_context {
        slot.params.stream           = json_value(data, "stream", false);
        slot.params.cache_prompt     = json_value(data, "cache_prompt", false);
-        slot.params.n_predict        = json_value(data, "n_predict", default_params.n_predict);
+        slot.params.n_predict        = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
        slot.sparams.top_k           = json_value(data, "top_k", default_sparams.top_k);
        slot.sparams.top_p           = json_value(data, "top_p", default_sparams.top_p);
        slot.sparams.min_p           = json_value(data, "min_p", default_sparams.min_p);

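The nested call gives n_predict a two-level fallback: the native key wins, then the OpenAI-style max_tokens, then the server default. A sketch of the equivalent unrolled logic, ignoring the null checks that the json_value helper also performs:

    // equivalent to:
    // json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict))
    int32_t n_predict = default_params.n_predict;                            // server default
    if (data.contains("max_tokens")) { n_predict = data.at("max_tokens"); }  // OpenAI-compatible key
    if (data.contains("n_predict"))  { n_predict = data.at("n_predict");  }  // native key takes precedence
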
@@ -355,24 +355,6 @@ static json oaicompat_completion_params_parse(

    llama_params["__oaicompat"] = true;

-    // Map OpenAI parameters to llama.cpp parameters
-    //
-    // For parameters that are defined by the OpenAI documentation (e.g.
-    // temperature), we explicitly specify OpenAI's intended default; we
-    // need to do that because sometimes OpenAI disagrees with llama.cpp
-    //
-    // https://platform.openai.com/docs/api-reference/chat/create
-    llama_sampling_params default_sparams;
-    llama_params["model"]             = json_value(body, "model", std::string("unknown"));
-    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
-    llama_params["logit_bias"]        = json_value(body, "logit_bias", json::object());
-    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
-    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
-    llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
-    llama_params["stream"]            = json_value(body, "stream", false);
-    llama_params["temperature"]       = json_value(body, "temperature", 1.0);
-    llama_params["top_p"]             = json_value(body, "top_p", 1.0);

    // Apply chat template to the list of messages
    llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));

@@ -66,7 +66,9 @@ int main(int argc, char ** argv) {
    llama_context * ctx_dft = NULL;

    // load the target model
-    std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init_tgt = llama_init_from_gpt_params(params);
+    model_tgt = llama_init_tgt.model;
+    ctx_tgt = llama_init_tgt.context;

    // load the draft model
    params.model = params.model_draft;

@@ -75,7 +77,9 @@ int main(int argc, char ** argv) {
        params.n_threads = params.n_threads_draft;
    }
    params.n_threads_batch = params.n_threads_batch_draft;
-    std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
+    model_dft = llama_init_dft.model;
+    ctx_dft = llama_init_dft.context;

    const bool vocab_type_tgt = llama_vocab_type(model_tgt);
    LOG("vocab_type tgt: %d\n", vocab_type_tgt);

@@ -1456,7 +1456,6 @@ extern "C" {
    // if mode & 2 == 1, GPT-NeoX style
    //
    // b is an int32 vector with size a->ne[2], it contains the positions
-    // c is freq factors (e.g. phi3-128k), (optional)
    GGML_API struct ggml_tensor * ggml_rope(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,

@@ -1473,6 +1472,7 @@ extern "C" {
            int                   mode);

    // custom RoPE
+    // c is freq factors (e.g. phi3-128k), (optional)
    GGML_API struct ggml_tensor * ggml_rope_ext(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,

@@ -627,7 +627,6 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base(
GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
                                                       const void* src,
                                                       void* dst) {
-    GGML_ASSERT(tensor->op == GGML_OP_NONE);

    int64_t n_elems = ggml_nelements(tensor);
    int64_t groups = n_elems / QK4_0;

@@ -679,7 +678,6 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
 */
GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
    const ggml_tensor* tensor, void* src, void* dst) {
-    GGML_ASSERT(tensor->op == GGML_OP_NONE);

    int64_t n_elems = ggml_nelements(tensor);
    int64_t groups = n_elems / QK4_0;

@@ -1666,10 +1664,13 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
        }
        case GGML_OP_MUL_MAT: {
            switch (op->src[0]->type) {
-                // case GGML_TYPE_Q4_0:
                case GGML_TYPE_F16:
                case GGML_TYPE_F32:
                case GGML_TYPE_Q8_0:
+                // TODO: fix me
+                // Current groupsize should not be greater than k-1 in
+                // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
+                case GGML_TYPE_Q4_0:
                    return true;
                default:
                    return false;

@@ -1694,6 +1695,7 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
                case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q4_0:
                    return true;
                default:
                    return false;

@@ -37,6 +37,10 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
            return ACL_INT16;
        case GGML_TYPE_I32:
            return ACL_INT32;
+        case GGML_TYPE_Q4_0:
+            return ACL_INT4;
+        case GGML_TYPE_Q8_0:
+            return ACL_INT8;
        default:
            return ACL_DT_UNDEFINED;
    }

@@ -89,33 +93,6 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
    return false;
}

-aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
-                                   size_t type_size, int64_t* ne, size_t* nb,
-                                   int64_t dims, aclFormat format,
-                                   size_t offset) {
-    int64_t tmp_ne[GGML_MAX_DIMS * 2];
-    int64_t tmp_stride[GGML_MAX_DIMS * 2];
-
-    memcpy(tmp_ne, ne, dims * sizeof(int64_t));
-    for (int i = 0; i < dims; i++) {
-        tmp_stride[i] = nb[i] / type_size;
-    }
-
-    std::reverse(tmp_ne, tmp_ne + dims);
-    std::reverse(tmp_stride, tmp_stride + dims);
-
-    int64_t acl_storage_len = 0;
-    for (int i = 0; i < dims; i++) {
-        acl_storage_len += (ne[i] - 1) * nb[i];
-    }
-
-    aclTensor* acl_tensor =
-        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
-                        format, &acl_storage_len, 1, data_ptr);
-
-    return acl_tensor;
-}
-
int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
                                  const ggml_tensor* src1,
                                  int64_t* bcast_src0_ne,

@@ -23,6 +23,9 @@
#ifndef CANN_ACL_TENSOR_H
#define CANN_ACL_TENSOR_H

+#include <algorithm>
+#include <cstring>
+
#include <aclnn/aclnn_base.h>
#include "common.h"

@@ -65,7 +68,8 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
                                   size_t offset = 0);

/**
- * @brief   Creates an ACL tensor from provided parameters.
+ * @brief   Template for creating an ACL tensor from provided parameters. typename TYPE
+ *          should be size_t or float.
 *
 * @details This function creates an ACL tensor using the provided data pointer,
 *          data type, dimensions, strides, format, offset, and additional parameters.

|
|||
* @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
|
||||
* @return Pointer to the created ACL tensor.
|
||||
*/
|
||||
template<typename TYPE>
|
||||
aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
|
||||
size_t type_size, int64_t* ne, size_t* nb,
|
||||
int64_t dims, aclFormat format = ACL_FORMAT_ND,
|
||||
size_t offset = 0);
|
||||
TYPE type_size, int64_t* ne, TYPE* nb,
|
||||
int64_t dims,
|
||||
aclFormat format = ACL_FORMAT_ND,
|
||||
size_t offset = 0) {
|
||||
int64_t tmp_ne[GGML_MAX_DIMS * 2];
|
||||
int64_t tmp_stride[GGML_MAX_DIMS * 2];
|
||||
|
||||
memcpy(tmp_ne, ne, dims * sizeof(int64_t));
|
||||
for (int i = 0; i < dims; i++) {
|
||||
tmp_stride[i] = nb[i] / type_size;
|
||||
}
|
||||
|
||||
std::reverse(tmp_ne, tmp_ne + dims);
|
||||
std::reverse(tmp_stride, tmp_stride + dims);
|
||||
|
||||
int64_t acl_storage_len = 0;
|
||||
for (int i = 0; i < dims; i++) {
|
||||
acl_storage_len += (ne[i] - 1) * nb[i];
|
||||
}
|
||||
|
||||
aclTensor* acl_tensor =
|
||||
aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
|
||||
format, &acl_storage_len, 1, data_ptr);
|
||||
|
||||
return acl_tensor;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Checks if tensors require broadcasting based on their shapes.
|
||||
|
|
|
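Templating type_size and nb on TYPE is what lets callers describe Q4_0 tensors, whose elements occupy half a byte each: a size_t cannot hold the 0.5-byte element size, but a float can, and nb[i] / type_size still yields whole element strides. A small standalone sketch of that arithmetic (values are illustrative assumptions, not from the source):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const float   type_size = 0.5f;                        // bytes per Q4_0 element
        const int64_t ne[2]     = {4096, 4096};                // assumed tensor extents
        const float   nb[2]     = {type_size, type_size * ne[0]};  // byte strides, ggml order

        for (int i = 0; i < 2; i++) {
            // same division the template performs when building tmp_stride
            std::printf("stride[%d] = %lld elements\n", i, (long long)(nb[i] / type_size));
        }
        return 0;
    }
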
@@ -910,6 +910,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                ((ggml_tensor*)dst->extra)->ne);
            return;
        }
+        if (dst->type == GGML_TYPE_Q4_0) {
+            aclrtlaunch_ascendc_quantize_f16_to_q4_0(
+                24, ctx.stream(), src->data, dst->data,
+                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
+                ((ggml_tensor*)dst->extra)->ne);
+            return;
+        }
        if (dst->type == GGML_TYPE_F16) {
            if (ggml_are_same_shape(src, dst)) {
                cann_copy(ctx, acl_src, acl_dst);

@@ -971,6 +978,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                ((ggml_tensor*)dst->extra)->ne);
            return;
        }
+        if (dst->type == GGML_TYPE_Q4_0) {
+            aclrtlaunch_ascendc_quantize_f32_to_q4_0(
+                24, ctx.stream(), src->data, dst->data,
+                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
+                ((ggml_tensor*)dst->extra)->ne);
+            return;
+        }
        if (dst->type == GGML_TYPE_F32) {
            if (ggml_are_same_shape(src, dst)) {
                cann_copy(ctx, acl_src, acl_dst);

@@ -2463,21 +2477,33 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
 * @param dst The destination tensor where the result of the matrix
 * multiplication will be stored.
 */
-static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
-                                   ggml_tensor* dst) {
+static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
+                                    ggml_tensor* dst,
+                                    const enum ggml_type type) {
    ggml_tensor* src0 = dst->src[0];  // weight
    ggml_tensor* src1 = dst->src[1];  // input

    // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
    // is regarded as batch. weight need transpose.
    int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
-    size_t weight_elem_size = sizeof(uint8_t);
-    size_t weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
+    float weight_elem_size;
+    if (type == GGML_TYPE_Q4_0) {
+        weight_elem_size = float(sizeof(uint8_t)) / 2;
+    }
+    else if (type == GGML_TYPE_Q8_0) {
+        weight_elem_size = float(sizeof(uint8_t));
+    }
+    else {
+        GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
+    }
+    float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};

    // size of one matrix is element_size * height * width.
    size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
    size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];

    // scale stored at the end of weight. Also need transpose.
+    GGML_ASSERT(QK4_0 == QK8_0);
    int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
    size_t scale_elem_size = sizeof(uint16_t);
    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,

@@ -2541,8 +2567,9 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
                (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
                input_elem_size, input_ne, input_nb, 2);
            aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
-                (char*)src0->data + batch0 * weight_stride, ACL_INT8,
-                weight_elem_size, weight_ne, weight_nb, 2);
+                (char*)src0->data + batch0 * weight_stride,
+                ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
+                weight_nb, 2);
            aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
                scale_offset + batch0 * scale_stride, ACL_FLOAT16,
                scale_elem_size, scale_ne, scale_nb, 2);

@@ -2596,11 +2623,9 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
        case GGML_TYPE_F16:
            ggml_cann_mat_mul_fp(ctx, dst);
            break;
-        // case GGML_TYPE_Q4_0:
-        //     ggml_cann_mul_mat_q4_0(ctx, dst);
-        //     break;
+        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q8_0:
-            ggml_cann_mul_mat_q8_0(ctx, dst);
+            ggml_cann_mul_mat_quant(ctx, dst, type);
            break;
        default:
            GGML_ABORT("fatal error");

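To make the sizes concrete: for Q4_0 the quantized payload is packed at 0.5 bytes per element, and the fp16 scales (one per 32-element group, since QK4_0 == QK8_0 == 32 in ggml) are stored after the data, which is what weight_stride and scale_offset account for above. A small standalone check under those assumptions (the shape is hypothetical):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t ne0 = 4096, ne1 = 4096;     // assumed weight matrix shape
        const int     QK  = 32;                   // group size (QK4_0 == QK8_0)

        const double elem_size   = 0.5;                          // bytes per Q4_0 element
        const double data_bytes  = elem_size * ne0 * ne1;        // quantized payload
        const double scale_bytes = double(ne0 / QK) * ne1 * 2;   // one fp16 scale per group

        std::printf("data: %.0f bytes, scales: %.0f bytes\n", data_bytes, scale_bytes);
        return 0;
    }
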
@@ -9,6 +9,7 @@ file(GLOB SRC_FILES
    get_row_q8_0.cpp
    quantize_f32_q8_0.cpp
    quantize_f16_q8_0.cpp
+    quantize_float_to_q4_0.cpp
    dup.cpp
)

@@ -29,4 +30,4 @@ ascendc_library(ascendc_kernels STATIC
    ${SRC_FILES}
)

-#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
+# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)

@@ -8,6 +8,8 @@

#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
+#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
+#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"

#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"

ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp (new file, 278 lines)

@@ -0,0 +1,278 @@
#include "kernel_operator.h"

using namespace AscendC;

#define BUFFER_NUM 2
#define Group_Size 32

template <typename SRC_T>
class QUANTIZE_FLOAT_TO_Q4_0 {
   public:
    __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
                                int64_t *input_ne_ub, size_t *input_nb_ub,
                                int64_t *output_ne_ub) {
        // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
        // permute=[0,0,0,0]):
        // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
        int64_t op_block_num = GetBlockNum();
        int64_t op_block_idx = GetBlockIdx();

        // input stride of data elements
        for (int i = 0; i < 4; i++) {
            input_ne[i] = input_ne_ub[i];
            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
            output_ne[i] = output_ne_ub[i];
        }

        // output stride of data elements
        output_stride[0] = 1;
        for (int i = 1; i < 4; i++) {
            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
        }

        // scale saved one by one after data:. [group1_scale, group2_scale, ...]
        scale_ne = input_ne;
        scale_stride[0] = 1;
        scale_stride[1] = input_ne[0] / Group_Size;
        for (int i = 2; i < 4; i++) {
            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
        }

        // split input tensor by rows.
        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
        dr = nr / op_block_num;

        uint64_t tails = nr % op_block_num;
        if (op_block_idx < tails) {
            dr += 1;
            ir = dr * op_block_idx;
        } else {
            ir = dr * op_block_idx + tails;
        }

        group_size_in_row = scale_stride[1];
        int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
                               output_ne[3] * sizeof(uint8_t) / 2;

        input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
        scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
                                                 group_size_in_row *
                                                 sizeof(half)));

        pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
        pipe.InitBuffer(output_queue, BUFFER_NUM,
                        Group_Size * sizeof(int8_t) / 2);
        pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float));
        pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
        pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
        pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
        pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
        pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
        pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
    }

    __aicore__ inline void copy_in(uint32_t offset) {
        LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
        DataCopy(input_local, input_gm[offset], Group_Size);
        input_queue.EnQue(input_local);
    }

    __aicore__ inline void copy_out(uint32_t offset) {
        // reinterpretcast Group_Size(32) * int4b_t to Group_Size / 2 * int8_t,
        // and using DataCopyPad to avoid 32 bits align.
        LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
        LocalTensor<int8_t> output_int8_local =
            output_local.ReinterpretCast<int8_t>();

        DataCopyExtParams dataCopyParams;
        dataCopyParams.blockCount = 1;
        dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t);
        DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);

        output_queue.FreeTensor(output_local);
    }

    __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
                                         LocalTensor<float> input_local) {
        DataCopy(cast_local, input_local, Group_Size);
    }

    __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
                                         LocalTensor<half> input_local) {
        Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
    }

    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
        const int64_t i1 =
            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];

        const int64_t input_offset = i1 * input_stride[1] +
                                     i2 * input_stride[2] +
                                     i3 * input_stride[3] + Group_Size * group;

        // output_offset is stride for output_gm which datatype is int8_t and
        // divided by 2 is needed for int4b_t.
        const int64_t output_offset = (i1 * output_stride[1] +
                                       i2 * output_stride[2] +
                                       i3 * output_stride[3] +
                                       Group_Size * group) / 2;
        copy_in(input_offset);

        LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
        LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
        LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
        LocalTensor<float> work_local = work_queue.AllocTensor<float>();
        LocalTensor<float> max_local = max_queue.AllocTensor<float>();
        LocalTensor<float> min_local = min_queue.AllocTensor<float>();
        LocalTensor<int8_t> int8_local = int8_queue.AllocTensor<int8_t>();
        LocalTensor<half> half_local = half_queue.AllocTensor<half>();

        input_to_cast(cast_local, input_local);

        ReduceMax(max_local, cast_local, work_local, Group_Size);
        ReduceMin(min_local, cast_local, work_local, Group_Size);
        const float max_value = max_local.GetValue(0);
        const float min_value = min_local.GetValue(0);
        float d = max_value;
        if (min_value < 0 && (-1 * min_value) > max_value) {
            d = min_value;
        }

        d = d / (-8);
        if (d != 0) {
            Muls(cast_local, cast_local, 1.0f / d, Group_Size);
        }

        // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
        float scalar = 8.5f;
        Adds(cast_local, cast_local, scalar, Group_Size);
        Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
        scalar = 15.0f;
        Mins(cast_local, cast_local, scalar, Group_Size);
        scalar = -8.0f;
        Adds(cast_local, cast_local, scalar, Group_Size);

        // float->half->int4b
        Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
        Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);

        output_queue.EnQue(output_local);
        copy_out(output_offset);

        input_queue.FreeTensor(input_local);
        work_queue.FreeTensor(work_local);
        max_queue.FreeTensor(max_local);
        min_queue.FreeTensor(min_local);
        int8_queue.FreeTensor(int8_local);
        half_queue.FreeTensor(half_local);
        cast_queue.FreeTensor(cast_local);
        return (half)d;
    }

    __aicore__ inline void calculate() {
        LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
        uint32_t scale_local_offset = 0;
        uint32_t scale_global_offset = 0;
        for (int64_t i = ir; i < ir + dr; i++) {
            for (int64_t j = 0; j < group_size_in_row; j++) {
                half scale = calculate_group(i, j);
                scale_local.SetValue(scale_local_offset++, scale);
                // Copy Group_Size/2 length data each time.
                if (scale_local_offset == Group_Size / 2) {
                    scale_local_offset = 0;
                    // TODO: OPTIMIZE ME
                    pipe_barrier(PIPE_ALL);
                    DataCopy(scale_gm[scale_global_offset], scale_local,
                             Group_Size / 2);
                    pipe_barrier(PIPE_ALL);
                    scale_global_offset += Group_Size / 2;
                }
            }
        }

        if (scale_local_offset != 0) {
            pipe_barrier(PIPE_ALL);
            DataCopyExtParams dataCopyParams;
            dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = scale_local_offset * sizeof(half);
            DataCopyPad(scale_gm[scale_global_offset], scale_local,
                        dataCopyParams);
            pipe_barrier(PIPE_ALL);
        }
        scale_queue.FreeTensor(scale_local);
    }

   private:
    int64_t input_ne[4];
    size_t input_stride[4];

    int64_t *scale_ne;
    size_t scale_stride[4];

    int64_t output_ne[4];
    size_t output_stride[4];

    int64_t group_size_in_row;

    int64_t ir;
    int64_t dr;

    TPipe pipe;
    GlobalTensor<SRC_T> input_gm;
    GlobalTensor<half> scale_gm;
    GlobalTensor<int8_t> output_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
    TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> int8_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
};

template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
    auto gm_ptr = (__gm__ uint8_t *)gm;
    auto ub_ptr = (uint8_t *)(ub);
    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
        *ub_ptr = *gm_ptr;
    }
}

extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
    int64_t input_ne_ub[4];
    size_t input_nb_ub[4];
    int64_t output_ne_ub[4];

    copy_to_ub(input_ne_gm, input_ne_ub, 32);
    copy_to_ub(input_nb_gm, input_nb_ub, 32);
    copy_to_ub(output_ne_gm, output_ne_ub, 32);

    QUANTIZE_FLOAT_TO_Q4_0<half> op;
    op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
    op.calculate();
}

extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
    int64_t input_ne_ub[4];
    size_t input_nb_ub[4];
    int64_t output_ne_ub[4];

    copy_to_ub(input_ne_gm, input_ne_ub, 32);
    copy_to_ub(input_nb_gm, input_nb_ub, 32);
    copy_to_ub(output_ne_gm, output_ne_ub, 32);

    QUANTIZE_FLOAT_TO_Q4_0<float> op;
    op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
    op.calculate();
}

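The kernel's range mapping ([-8,8] -> [0.5,16.5] -> floor -> clamp to [0,15] -> [-8,7]) mirrors ggml's CPU-side Q4_0 quantization. A scalar C++ sketch of one 32-element group, written to match the comment in the kernel above (illustrative, not the Ascend C code path):

    #include <cmath>
    #include <cstdint>
    #include <vector>

    // quantize one group of 32 floats to q4_0-style codes in [-8, 7] plus a scale
    static float quantize_group_q4_0(const std::vector<float> & x, std::vector<int8_t> & q) {
        float amax = 0.0f, maxv = 0.0f;
        for (float v : x) {                        // value with the largest magnitude
            if (std::fabs(v) > amax) { amax = std::fabs(v); maxv = v; }
        }
        const float d  = maxv / -8.0f;             // scale; sign chosen so maxv maps to -8
        const float id = d != 0.0f ? 1.0f / d : 0.0f;
        q.resize(x.size());
        for (size_t i = 0; i < x.size(); ++i) {
            float t = x[i] * id + 8.5f;            // [-8,8] -> [0.5,16.5]
            t = std::floor(t);                     // -> [0,16]
            t = std::fmin(t, 15.0f);               // -> [0,15]
            q[i] = int8_t(t - 8.0f);               // -> [-8,7]
        }
        return d;                                  // stored as fp16 in the real layout
    }
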
(one file diff suppressed because it is too large)

@@ -2312,7 +2312,7 @@ inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) {
inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
-inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
+inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }

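The ELU change is purely about numerical accuracy: expm1f(x) computes e^x - 1 with full precision for small |x|, while expf(x) - 1.f loses significant bits to cancellation when expf(x) is close to 1. A minimal illustration:

    #include <cmath>
    #include <cstdio>

    int main() {
        const float x = 1e-7f;
        // naive form cancels: expf(x) is 1.0f plus a tiny increment, so subtracting 1
        // keeps only a few significant bits of the result
        std::printf("expf(x)-1 = %.9g\n", expf(x) - 1.0f);
        // expm1f evaluates the same quantity without the cancellation
        std::printf("expm1f(x) = %.9g\n", expm1f(x));
        return 0;
    }
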
@@ -4,9 +4,11 @@
#include "generic_binary_head.comp"

void main() {
-    if (gl_GlobalInvocationID.x >= p.ne) {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
        return;
    }

-    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) + FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
+    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)]));
}

@@ -4,10 +4,12 @@
#include "generic_unary_head.comp"

void main() {
-    if (gl_GlobalInvocationID.x >= p.ne) {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
        return;
    }

-    const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
-    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
+    const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
+    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
}

ggml/src/vulkan-shaders/concat.comp (new file, 35 lines)

@@ -0,0 +1,35 @@
#version 450

#include "types.comp"
#include "generic_binary_head.comp"

void main() {
    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    const int dim = p.param3;

    if (idx >= p.ne) {
        return;
    }

    const uint i3 = idx / (p.ne22*p.ne21*p.ne20);
    const uint i3_offset = i3 * p.ne22*p.ne21*p.ne20;
    const uint i2 = (idx - i3_offset) / (p.ne21*p.ne20);
    const uint i2_offset = i2*p.ne21*p.ne20;
    const uint i1 = (idx - i3_offset - i2_offset) / p.ne20;
    const uint i0 = idx - i3_offset - i2_offset - i1*p.ne20;

    uint o[4] = {0, 0, 0, 0};
    o[dim] = dim == 0 ? p.ne00 : (dim == 1 ? p.ne01 : (dim == 2 ? p.ne02 : p.ne03));

    const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
    const uint src1_idx = (i3 - o[3])*p.nb13 + (i2 - o[2])*p.nb12 + (i1 - o[1])*p.nb11 + (i0 - o[0])*p.nb10;
    const uint dst_idx = i3*p.nb23 + i2*p.nb22 + i1*p.nb21 + i0*p.nb20;

    const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;

#ifndef OPTIMIZATION_ERROR_WORKAROUND
    data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
#else
    data_d[p.d_offset + dst_idx] = is_src0 ? data_a[src0_idx] : data_b[src1_idx];
#endif
}

@@ -4,13 +4,15 @@
#include "generic_unary_head.comp"

void main() {
-    if (gl_GlobalInvocationID.x >= p.ne) {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
        return;
    }

#ifndef OPTIMIZATION_ERROR_WORKAROUND
-    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
+    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]);
#else
-    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = data_a[src0_idx(gl_GlobalInvocationID.x)];
+    data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)];
#endif
}

@@ -4,9 +4,11 @@
#include "generic_binary_head.comp"

void main() {
-    if (gl_GlobalInvocationID.x >= p.ne) {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
        return;
    }

-    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) / FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
+    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) / FLOAT_TYPE(data_b[src1_idx(idx)]));
}

|
@ -13,7 +13,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
|||
void main() {
|
||||
const float GELU_COEF_A = 0.044715f;
|
||||
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||
const uint i = gl_GlobalInvocationID.x;
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
|
|
ggml/src/vulkan-shaders/gelu_quick.comp (new file, 23 lines)

@@ -0,0 +1,23 @@
#version 450

#include "generic_head.comp"
#include "types.comp"

#extension GL_EXT_control_flow_attributes : enable

layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

void main() {
    const float GELU_QUICK_COEF = -1.702f;
    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

    if (i >= p.KX) {
        return;
    }

    const float x = float(data_a[i]);
    data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x))));
}

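GELU-quick approximates GELU as x * sigmoid(1.702 * x); the shader's 1.0f / (1.0f + exp(-1.702f * x)) term is exactly that sigmoid. An equivalent scalar reference, a sketch matching the shader's math rather than exact GELU:

    #include <cmath>

    // x * sigmoid(1.702 * x), the same expression the shader evaluates per element
    inline float gelu_quick(float x) {
        const float GELU_QUICK_COEF = -1.702f;
        return x * (1.0f / (1.0f + std::exp(GELU_QUICK_COEF * x)));
    }
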
@@ -7,7 +7,7 @@ layout (push_constant) uniform parameter
    uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
    uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
    uint d_offset;
-    float param1; float param2;
+    float param1; float param2; int param3;
} p;

layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

@@ -16,6 +16,10 @@ layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};

+uint get_idx() {
+    return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+}
+
uint src0_idx(uint idx) {
    const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
    const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;

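The constants follow from the workgroup size: with local_size_x = 512 and Vulkan's per-dimension dispatch limits, a flat element index is rebuilt as z * (512 * 512) + y * 512 + x, since 262144 = 512 * 512. A hedged host-side sketch of how a flat element count could be split into matching invocation-space extents (the function name and exact thresholds are illustrative assumptions, not taken from the source):

    #include <array>
    #include <cstdint>

    // Choose invocation-space extents {x, y, z} so a shader with local_size_x = 512
    // can rebuild a flat index as z*262144 + y*512 + x without collisions.
    std::array<uint64_t, 3> split_elements(uint64_t ne) {
        if (ne > 262144) {                        // needs the z dimension
            return {512, 512, (ne + 262143) / 262144};
        } else if (ne > 512) {                    // needs the y dimension
            return {512, (ne + 511) / 512, 1};
        }
        return {ne, 1, 1};                        // fits in one workgroup row
    }
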
@@ -14,6 +14,10 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

+uint get_idx() {
+    return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+}
+
uint src0_idx(uint idx) {
    const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
    const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;

ggml/src/vulkan-shaders/group_norm.comp (new file, 66 lines)

@@ -0,0 +1,66 @@
#version 450

#include "generic_head.comp"
#include "types.comp"

#extension GL_EXT_control_flow_attributes : enable
#define BLOCK_SIZE 512

layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

shared float tmp[BLOCK_SIZE];

void main() {
    const uint group_size = p.KX;
    const float eps = p.param1;

    const uint tid = gl_LocalInvocationID.x;
    const uint start = gl_WorkGroupID.x * group_size + tid;
    const uint end = start + group_size;

    tmp[tid] = 0.0f;

    // Calculate mean
    [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
        tmp[tid] += float(data_a[col]);
    }

    // tmp up partial tmps and write back result
    barrier();
    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        barrier();
    }

    const float mean = tmp[0] / group_size;
    barrier();
    tmp[tid] = 0.0f;

    // Calculate variance
    [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
        const float xi = float(data_a[col]) - mean;
        data_d[col] = D_TYPE(xi);
        tmp[tid] += xi * xi;
    }

    // sum up partial sums and write back result
    barrier();
    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        barrier();
    }

    const float variance = tmp[0] / group_size;
    const float scale = inversesqrt(variance + eps);

    [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
        data_d[col] *= D_TYPE(scale);
    }
}

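For reference, the shader's two reduction passes (mean, then variance over values it has already centered and written back) compute the same thing as this scalar sketch of group normalization over one contiguous group (illustrative C++, not the Vulkan code path):

    #include <cmath>
    #include <vector>

    // Normalize one group of contiguous values in place; mirrors the shader's
    // mean pass, centering write-back, variance pass, and final scaling.
    void group_norm_ref(std::vector<float> & v, float eps) {
        float mean = 0.0f;
        for (float x : v) mean += x;
        mean /= float(v.size());

        float variance = 0.0f;
        for (float & x : v) {
            x -= mean;                 // the shader stores the centered value
            variance += x * x;
        }
        variance /= float(v.size());

        const float scale = 1.0f / std::sqrt(variance + eps);
        for (float & x : v) x *= scale;
    }
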
ggml/src/vulkan-shaders/im2col.comp (new file, 57 lines)

@@ -0,0 +1,57 @@
#version 450

#extension GL_EXT_shader_16bit_storage : require

layout (push_constant) uniform parameter
{
    uint batch_offset; uint offset_delta;
    uint IC;
    uint IW; uint IH;
    uint OW; uint OH;
    uint KW; uint KH;
    uint pelements;
    uint CHW;
    int s0; int s1;
    int p0; int p1;
    int d0; int d1;
} p;

#include "types.comp"

#define BLOCK_SIZE 256

layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

void main() {
    const uint i = gl_GlobalInvocationID.x;
    if (i >= p.pelements) {
        return;
    }

    const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
    const uint kx = i / ksize;
    const uint kd = kx * ksize;
    const uint ky = (i - kd) / p.OW;
    const uint ix = i % p.OW;

    const uint oh = gl_GlobalInvocationID.y;
    const uint batch = gl_GlobalInvocationID.z / p.IC;
    const uint ic = gl_GlobalInvocationID.z % p.IC;

    const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
    const uint iih = oh * p.s1 + ky * p.d1 - p.p1;

    const uint offset_dst =
        ((batch * p.OH + oh) * p.OW + ix) * p.CHW +
        (ic * (p.KW * p.KH) + ky * p.KW + kx);

    if (iih < 0 || iih >= p.IH || iiw < 0 || iiw >= p.IW) {
        data_d[offset_dst] = D_TYPE(0.0f);
    } else {
        const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
        data_d[offset_dst] = D_TYPE(data_a[offset_src + iih * p.IW + iiw]);
    }
}

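The index arithmetic implements the standard im2col layout: each output position gathers one KW x KH patch per input channel so convolution becomes a matrix multiply. A scalar sketch of the same mapping for a single channel (hedged; parameter names follow the push constants above):

    #include <vector>

    // im2col for one channel: dst holds OH*OW patch rows of KH*KW elements,
    // using stride (s0,s1), padding (p0,p1), dilation (d0,d1); positions
    // outside the input stay 0, matching the shader's bounds check.
    void im2col_ref(const std::vector<float> & src, std::vector<float> & dst,
                    int IW, int IH, int OW, int OH, int KW, int KH,
                    int s0, int s1, int p0, int p1, int d0, int d1) {
        dst.assign(size_t(OH) * OW * KH * KW, 0.0f);
        for (int oh = 0; oh < OH; ++oh)
        for (int ow = 0; ow < OW; ++ow)
        for (int ky = 0; ky < KH; ++ky)
        for (int kx = 0; kx < KW; ++kx) {
            const int iw = ow * s0 + kx * d0 - p0;
            const int ih = oh * s1 + ky * d1 - p1;
            if (iw >= 0 && iw < IW && ih >= 0 && ih < IH) {
                dst[((size_t(oh) * OW + ow) * KH + ky) * KW + kx] = src[size_t(ih) * IW + iw];
            }
        }
    }
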
ggml/src/vulkan-shaders/leaky_relu.comp (new file, 22 lines)

@@ -0,0 +1,22 @@
#version 450

#include "generic_head.comp"
#include "types.comp"

#extension GL_EXT_control_flow_attributes : enable

layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

void main() {
    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

    if (i >= p.KX) {
        return;
    }

    const float val = float(data_a[i]);
    data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1);
}

@@ -4,9 +4,11 @@
#include "generic_binary_head.comp"

void main() {
-    if (gl_GlobalInvocationID.x >= p.ne) {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
        return;
    }

-    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
+    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(data_b[src1_idx(idx)]));
}

@@ -16,6 +16,13 @@ void main() {
    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
    const uint tid = gl_LocalInvocationID.x;

+    // There are not enough cols to use all threads
+    if (tid >= p.ncols) {
+        return;
+    }
+
+    const uint block_size = min(p.ncols, BLOCK_SIZE);
+
    uint a_offset, b_offset, d_offset;
    get_offsets(a_offset, b_offset, d_offset);

@@ -23,8 +30,8 @@ void main() {

    tmp[tid] = FLOAT_TYPE(0.0f);

-    [[unroll]] for (uint i = 0; i < p.ncols/BLOCK_SIZE; i += 2) {
-        const uint col = i*BLOCK_SIZE + 2*tid;
+    [[unroll]] for (uint i = 0; i < p.ncols/block_size; i += 2) {
+        const uint col = i*block_size + 2*tid;
        const uint ib = (row*p.ncols + col)/QUANT_K; // block index
        const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
        const uint iybs = col - col%QUANT_K; // y block start index

@@ -38,7 +45,7 @@ void main() {

    // sum up partial sums and write back result
    barrier();
-    [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
+    [[unroll]] for (uint s = block_size/2; s > 0; s >>= 1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }

@@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
shared vec2 sum[BLOCK_SIZE];

void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
    const uint tid = gl_LocalInvocationID.x;

    sum[tid] = vec2(0.0f, 0.0f);

ggml/src/vulkan-shaders/pad.comp (new file, 26 lines)

@@ -0,0 +1,26 @@
#version 450

#include "types.comp"
#include "generic_unary_head.comp"

void main() {
    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

    if (idx >= p.ne) {
        return;
    }

    const uint i3 = idx / (p.ne12*p.ne11*p.ne10);
    const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10;
    const uint i2 = (idx - i3_offset) / (p.ne11*p.ne10);
    const uint i2_offset = i2*p.ne11*p.ne10;
    const uint i1 = (idx - i3_offset - i2_offset) / p.ne10;
    const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10;

    const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
    const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10;

    const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;

    data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f);
}

@@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

void main() {
-    const uint i = gl_GlobalInvocationID.x;
+    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

    if (i >= p.KX) {
        return;

@@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
shared FLOAT_TYPE sum[BLOCK_SIZE];

void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
    const uint tid = gl_LocalInvocationID.x;

    sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp

@@ -4,9 +4,11 @@
#include "generic_unary_head.comp"

void main() {
-    if (gl_GlobalInvocationID.x >= p.ne) {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
        return;
    }

-    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(p.param1));
+    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1));
}

@@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

 void main() {
-    const uint i = gl_GlobalInvocationID.x;
+    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

     if (i >= p.KX) {
         return;
@@ -28,7 +28,7 @@ shared FLOAT_TYPE vals[BLOCK_SIZE];

 void main() {
     const uint tid = gl_LocalInvocationID.x;
-    const uint rowx = gl_WorkGroupID.x;
+    const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
     const uint rowy = rowx % p.KY;

     float slope = 1.0f;
@@ -4,10 +4,12 @@
 #include "generic_unary_head.comp"

 void main() {
-    if (gl_GlobalInvocationID.x >= p.ne) {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
         return;
     }

-    const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
-    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val * val);
+    const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
+    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val);
 }
@@ -14,7 +14,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32;
 shared FLOAT_TYPE tmp[BLOCK_SIZE];

 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
     const uint col = gl_LocalInvocationID.x;

     tmp[col] = FLOAT_TYPE(0.0f);
21  ggml/src/vulkan-shaders/tanh.comp  Normal file
@@ -0,0 +1,21 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+    if (i >= p.KX) {
+        return;
+    }
+
+    data_d[i] = D_TYPE(tanh(data_a[i]));
+}
41  ggml/src/vulkan-shaders/timestep_embedding.comp  Normal file
@@ -0,0 +1,41 @@
+#version 450
+
+#extension GL_EXT_shader_16bit_storage : require
+
+layout (push_constant) uniform parameter
+{
+    uint nb1;
+    uint dim;
+    uint max_period;
+} p;
+
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+#define BLOCK_SIZE 256
+
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const uint i = gl_WorkGroupID.y;
+    const uint j = gl_GlobalInvocationID.x;
+    const uint d_offset = i * p.nb1;
+
+    if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) {
+        data_d[d_offset + p.dim] = 0.f;
+    }
+
+    const uint half_dim = p.dim / 2;
+    if (j >= half_dim) {
+        return;
+    }
+
+    const float timestep = float(data_a[i]);
+    const float freq = float(exp(-log(p.max_period) * j / half_dim));
+    const float arg = timestep * freq;
+    data_d[d_offset + j] = D_TYPE(cos(arg));
+    data_d[d_offset + j + half_dim] = D_TYPE(sin(arg));
+}
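For a timestep t, this kernel fills the first half of the output row with cos(t * freq_j) and the second half with sin(t * freq_j), where freq_j decays geometrically from 1 down to 1/max_period; an odd dim leaves the final element zeroed. A host-side sketch for cross-checking the same arithmetic — the dims and timestep are illustrative values, not anything from the shader's callers:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Sinusoidal timestep embedding, mirroring timestep_embedding.comp:
// out[j] = cos(t * freq_j), out[j + half_dim] = sin(t * freq_j).
std::vector<float> timestep_embedding(float timestep, int dim, int max_period) {
    std::vector<float> out(dim, 0.0f); // odd dim leaves the last slot zero
    const int half_dim = dim / 2;
    for (int j = 0; j < half_dim; ++j) {
        const float freq = std::exp(-std::log((float) max_period) * j / half_dim);
        const float arg  = timestep * freq;
        out[j]            = std::cos(arg);
        out[j + half_dim] = std::sin(arg);
    }
    return out;
}

int main() {
    const auto e = timestep_embedding(25.0f, 8, 10000);
    for (float v : e) std::printf("%.4f ", v);
    std::printf("\n");
}
```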
@@ -6,7 +6,7 @@
 #define QUANT_K 1
 #define QUANT_R 1

-#ifndef LOAD_VEC_A
+#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
 #define A_TYPE float
 #elif LOAD_VEC_A == 4
 #define A_TYPE vec4
@@ -19,7 +19,7 @@
 #define QUANT_K 1
 #define QUANT_R 1

-#ifndef LOAD_VEC_A
+#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
 #define A_TYPE float16_t
 #elif LOAD_VEC_A == 4
 #define A_TYPE f16vec4
36  ggml/src/vulkan-shaders/upscale.comp  Normal file
@@ -0,0 +1,36 @@
+#version 450
+
+layout (push_constant) uniform parameter
+{
+    uint ne; uint d_offset;
+    uint nb00; uint nb01; uint nb02; uint nb03;
+    uint ne10; uint ne11; uint ne12; uint ne13;
+    float sf0; float sf1; float sf2; float sf3;
+} p;
+
+#include "types.comp"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    const uint i10 = idx % p.ne10;
+    const uint i11 = (idx / p.ne10) % p.ne11;
+    const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12;
+    const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13;
+
+    const uint i00 = uint(i10 / p.sf0);
+    const uint i01 = uint(i11 / p.sf1);
+    const uint i02 = uint(i12 / p.sf2);
+    const uint i03 = uint(i13 / p.sf3);
+
+    data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
+}
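The upscale kernel is nearest-neighbor: each destination coordinate is divided by its per-axis scale factor (sf0..sf3) and truncated to pick the source element. A one-axis sketch of that mapping, with a made-up scale factor:

```cpp
#include <cstdio>

// Nearest-neighbor pick along one axis, as upscale.comp does per dimension:
// src index = trunc(dst index / scale factor).
int main() {
    const float sf = 2.0f; // hypothetical scale factor (dst is 2x larger)
    for (unsigned i1 = 0; i1 < 8; ++i1) {
        const unsigned i0 = (unsigned)(i1 / sf);
        std::printf("dst %u <- src %u\n", i1, i0);
    }
}
```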
@@ -30,20 +30,6 @@

 #define ASYNCIO_CONCURRENCY 64

-// define prototypes
-void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str);
-bool directory_exists(const std::string& path);
-bool create_directory(const std::string& path);
-std::string to_uppercase(const std::string& input);
-bool string_ends_with(const std::string& str, const std::string& suffix);
-std::string join_paths(const std::string& path1, const std::string& path2);
-std::string basename(const std::string &path);
-void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16);
-std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b);
-void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmul_id);
-void process_shaders(std::vector<std::future<void>>& tasks);
-void write_output_files();
-
 std::mutex lock;
 std::vector<std::pair<std::string, std::string>> shader_fnames;

@@ -52,7 +38,7 @@ std::string input_dir = "vulkan-shaders";
 std::string output_dir = "/tmp";
 std::string target_hpp = "ggml-vulkan-shaders.hpp";
 std::string target_cpp = "ggml-vulkan-shaders.cpp";
-bool clean = true;
+bool no_clean = false;

 const std::vector<std::string> type_names = {
     "f32",
@@ -193,11 +179,7 @@ bool string_ends_with(const std::string& str, const std::string& suffix) {
     return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
 }

-#ifdef _WIN32
-    static const char path_separator = '\\';
-#else
-    static const char path_separator = '/';
-#endif
+static const char path_separator = '/';

 std::string join_paths(const std::string& path1, const std::string& path2) {
     return path1 + path_separator + path2;
@@ -212,7 +194,11 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const
     std::string out_fname = join_paths(output_dir, name + ".spv");
     std::string in_path = join_paths(input_dir, in_fname);

-    std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname};
+    #ifdef _WIN32
+        std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""};
+    #else
+        std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname};
+    #endif
     for (const auto& define : defines) {
         cmd.push_back("-D" + define.first + "=" + define.second);
     }
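The Windows branch wraps both paths in literal quotes before the command is assembled; without them a path containing spaces would be split into several arguments when the command line is flattened for the shell. A standalone illustration of that failure mode and its fix — this is not the tool's actual command assembly:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Flatten an argument vector into one command string, quoting any
// argument that contains a space so it survives shell word-splitting.
int main() {
    std::vector<std::string> cmd = {"glslc", "-o", "C:\\build dir\\out.spv"};
    std::string flat;
    for (const auto& arg : cmd) {
        flat += (arg.find(' ') != std::string::npos ? "\"" + arg + "\"" : arg) + " ";
    }
    std::printf("%s\n", flat.c_str());
}
```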
@@ -283,9 +269,12 @@ void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmu

     for (const auto& tname : type_names) {
         std::string data_a_key = "DATA_A_" + to_uppercase(tname);
+        // For unaligned, load one at a time for f32/f16, or two at a time for quants
+        std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2";
+        // For aligned matmul loads
         std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2";
         tasks.push_back(std::async(std::launch::async, [=] {
-            string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
+            string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
         }));
         tasks.push_back(std::async(std::launch::async, [=] {
             string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16);
@@ -354,6 +343,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
     tasks.push_back(std::async(std::launch::async, [=] {
         string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     }));
+    tasks.push_back(std::async(std::launch::async, [=] {
+        string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    }));
     tasks.push_back(std::async(std::launch::async, [=] {
         string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     }));
@@ -371,6 +363,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
     tasks.push_back(std::async(std::launch::async, [] {
         string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
     }));
+    tasks.push_back(std::async(std::launch::async, [] {
+        string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
+    }));

     tasks.push_back(std::async(std::launch::async, [] {
         string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
@@ -396,15 +391,42 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
         string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
     }));

+    tasks.push_back(std::async(std::launch::async, [] {
+        string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    }));
+
+    tasks.push_back(std::async(std::launch::async, [] {
+        string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+    }));
+    tasks.push_back(std::async(std::launch::async, [] {
+        string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
+    }));
+    tasks.push_back(std::async(std::launch::async, [] {
+        string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
+    }));
+
+    tasks.push_back(std::async(std::launch::async, [] {
+        string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+    }));
+
     tasks.push_back(std::async(std::launch::async, [] {
         string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     }));
+    tasks.push_back(std::async(std::launch::async, [] {
+        string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    }));
     tasks.push_back(std::async(std::launch::async, [] {
         string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     }));
     tasks.push_back(std::async(std::launch::async, [] {
         string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     }));
+    tasks.push_back(std::async(std::launch::async, [] {
+        string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    }));
+    tasks.push_back(std::async(std::launch::async, [] {
+        string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    }));

     tasks.push_back(std::async(std::launch::async, [] {
         string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
@@ -438,6 +460,17 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
     tasks.push_back(std::async(std::launch::async, [=] {
         string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     }));
+
+    tasks.push_back(std::async(std::launch::async, [=] {
+        string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    }));
+    tasks.push_back(std::async(std::launch::async, [=] {
+        string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
+    }));
+
+    tasks.push_back(std::async(std::launch::async, [=] {
+        string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    }));
 }

 void write_output_files() {
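process_shaders fans each glslc invocation out as its own std::async task; the caller later joins the futures (and the real tool throttles parallelism with ASYNCIO_CONCURRENCY). A stripped-down sketch of that fan-out/join pattern, with compile_one standing in for string_to_spv:

```cpp
#include <cstdio>
#include <future>
#include <string>
#include <vector>

// Placeholder for string_to_spv, which shells out to glslc.
static void compile_one(const std::string& name) {
    std::printf("compiling %s\n", name.c_str());
}

int main() {
    std::vector<std::future<void>> tasks;
    for (const std::string name : {"pad_f32", "tanh_f32", "upscale_f32"}) {
        tasks.push_back(std::async(std::launch::async, [name] { compile_one(name); }));
    }
    for (auto& t : tasks) {
        t.get(); // join; rethrows any exception raised inside the task
    }
}
```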
@@ -449,10 +482,16 @@ void write_output_files() {

     for (const auto& pair : shader_fnames) {
         const std::string& name = pair.first;
-        const std::string& path = pair.second;
+#ifdef _WIN32
+        std::string path = pair.second;
+        std::replace(path.begin(), path.end(), '/', '\\' );
+#else
+        const std::string& path = pair.second;
+#endif

         FILE* spv = fopen(path.c_str(), "rb");
         if (!spv) {
-            std::cerr << "Error opening SPIR-V file: " << path << "\n";
+            std::cerr << "Error opening SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
             continue;
         }
@@ -464,7 +503,7 @@ void write_output_files() {
         size_t read_size = fread(data.data(), 1, size, spv);
         fclose(spv);
         if (read_size != size) {
-            std::cerr << "Error reading SPIR-V file: " << path << "\n";
+            std::cerr << "Error reading SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
             continue;
         }
@@ -478,9 +517,8 @@ void write_output_files() {
         }
         fprintf(src, "\n};\n\n");

-        if (clean) {
+        if (!no_clean) {
             std::remove(path.c_str());
-            // fprintf(stderr, "Removed: %s\n", path.c_str());
         }
     }
@@ -496,18 +534,6 @@ int main(int argc, char** argv) {
         }
     }

-    if (argc <= 1 || args.find("--help") != args.end()) {
-        std::cout << "Usage:\n"
-                     "\tvulkan-shaders-gen [options]\n\n"
-                     "Options:\n"
-                     "\t--glslc <path>      Path to glslc executable (default: /usr/bin/glslc)\n"
-                     "\t--input-dir         Directory containing shader sources (required)\n"
-                     "\t--output-dir        Output directory for generated SPIR-V files and optional C++ headers\n"
-                     "\t--target-hpp <path> Path to generate a header file with shader declarations in C++ format\n"
-                     "\t--target-cpp <path> Path to generate a source code file implementing the declared shaders (optional)\n"
-                     "\t--no-clean          Keep temporary SPIR-V files after build (default: remove them)\n";
-        return EXIT_SUCCESS;
-    }
     if (args.find("--glslc") != args.end()) {
         GLSLC = args["--glslc"]; // Path to glslc
     }
@@ -524,7 +550,7 @@ int main(int argc, char** argv) {
         target_cpp = args["--target-cpp"]; // Path to generated cpp file
     }
     if (args.find("--no-clean") != args.end()) {
-        clean = false; // Keep temporary SPIR-V files in output-dir after build
+        no_clean = true; // Keep temporary SPIR-V files in output-dir after build
     }

     if (!directory_exists(input_dir)) {
@@ -161,6 +161,7 @@ class Keys:
         SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
         MIDDLE_ID = "tokenizer.ggml.middle_token_id"
         EOT_ID    = "tokenizer.ggml.eot_token_id"
+        EOM_ID    = "tokenizer.ggml.eom_token_id"

     class Adapter:
         TYPE = "adapter.type"
@@ -1327,3 +1328,4 @@ KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID
 KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
 KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
 KEY_TOKENIZER_EOT_ID    = Keys.Tokenizer.EOT_ID
+KEY_TOKENIZER_EOM_ID    = Keys.Tokenizer.EOM_ID
@@ -828,6 +828,9 @@ class GGUFWriter:
     def add_eot_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.EOT_ID, id)

+    def add_eom_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.EOM_ID, id)
+
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
         pack_prefix = ''
         if not skip_pack_prefix:
@@ -284,20 +284,67 @@ class Metadata:
         ########################
         if model_card is not None:

-            if "model_name" in model_card and metadata.name is None:
-                # Not part of huggingface model card standard but notice some model creator using it
-                # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
-                metadata.name = model_card.get("model_name")
-
-            if "model_creator" in model_card and metadata.author is None:
-                # Not part of huggingface model card standard but notice some model creator using it
-                # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
-                metadata.author = model_card.get("model_creator")
-
-            if "model_type" in model_card and metadata.basename is None:
-                # Not part of huggingface model card standard but notice some model creator using it
-                # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
-                metadata.basename = model_card.get("model_type")
+            def use_model_card_metadata(metadata_key: str, model_card_key: str):
+                if model_card_key in model_card and getattr(metadata, metadata_key, None) is None:
+                    setattr(metadata, metadata_key, model_card.get(model_card_key))
+
+            def use_array_model_card_metadata(metadata_key: str, model_card_key: str):
+                # Note: Will append rather than replace if already exist
+                tags_value = model_card.get(model_card_key, None)
+                if tags_value is None:
+                    return
+
+                current_value = getattr(metadata, metadata_key, None)
+                if current_value is None:
+                    current_value = []
+
+                if isinstance(tags_value, str):
+                    current_value.append(tags_value)
+                elif isinstance(tags_value, list):
+                    current_value.extend(tags_value)
+
+                setattr(metadata, metadata_key, current_value)
+
+            # LLAMA.cpp's direct internal convention
+            # (Definitely not part of hugging face formal/informal standard)
+            #########################################
+            use_model_card_metadata("name", "name")
+            use_model_card_metadata("author", "author")
+            use_model_card_metadata("version", "version")
+            use_model_card_metadata("organization", "organization")
+            use_model_card_metadata("description", "description")
+            use_model_card_metadata("finetune", "finetune")
+            use_model_card_metadata("basename", "basename")
+            use_model_card_metadata("size_label", "size_label")
+            use_model_card_metadata("source_url", "url")
+            use_model_card_metadata("source_doi", "doi")
+            use_model_card_metadata("source_uuid", "uuid")
+            use_model_card_metadata("source_repo_url", "repo_url")
+
+            # LLAMA.cpp's huggingface style convention
+            # (Definitely not part of hugging face formal/informal standard... but with model_ appended to match their style)
+            ###########################################
+            use_model_card_metadata("name", "model_name")
+            use_model_card_metadata("author", "model_author")
+            use_model_card_metadata("version", "model_version")
+            use_model_card_metadata("organization", "model_organization")
+            use_model_card_metadata("description", "model_description")
+            use_model_card_metadata("finetune", "model_finetune")
+            use_model_card_metadata("basename", "model_basename")
+            use_model_card_metadata("size_label", "model_size_label")
+            use_model_card_metadata("source_url", "model_url")
+            use_model_card_metadata("source_doi", "model_doi")
+            use_model_card_metadata("source_uuid", "model_uuid")
+            use_model_card_metadata("source_repo_url", "model_repo_url")
+
+            # Hugging Face Direct Convention
+            #################################
+
+            # Not part of huggingface model card standard but notice some model creator using it
+            # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
+            use_model_card_metadata("name", "model_name")
+            use_model_card_metadata("author", "model_creator")
+            use_model_card_metadata("basename", "model_type")

             if "base_model" in model_card:
                 # This represents the parent models that this is based on
@@ -329,58 +376,18 @@ class Metadata:
                     base_model["repo_url"] = f"https://huggingface.co/{org_component}/{model_full_name_component}"
                 metadata.base_models.append(base_model)

-            if "license" in model_card and metadata.license is None:
-                metadata.license = model_card.get("license")
-
-            if "license_name" in model_card and metadata.license_name is None:
-                metadata.license_name = model_card.get("license_name")
-
-            if "license_link" in model_card and metadata.license_link is None:
-                metadata.license_link = model_card.get("license_link")
-
-            tags_value = model_card.get("tags", None)
-            if tags_value is not None:
-
-                if metadata.tags is None:
-                    metadata.tags = []
-
-                if isinstance(tags_value, str):
-                    metadata.tags.append(tags_value)
-                elif isinstance(tags_value, list):
-                    metadata.tags.extend(tags_value)
-
-            pipeline_tags_value = model_card.get("pipeline_tag", None)
-            if pipeline_tags_value is not None:
-
-                if metadata.tags is None:
-                    metadata.tags = []
-
-                if isinstance(pipeline_tags_value, str):
-                    metadata.tags.append(pipeline_tags_value)
-                elif isinstance(pipeline_tags_value, list):
-                    metadata.tags.extend(pipeline_tags_value)
-
-            language_value = model_card.get("languages", model_card.get("language", None))
-            if language_value is not None:
-
-                if metadata.languages is None:
-                    metadata.languages = []
-
-                if isinstance(language_value, str):
-                    metadata.languages.append(language_value)
-                elif isinstance(language_value, list):
-                    metadata.languages.extend(language_value)
-
-            dataset_value = model_card.get("datasets", model_card.get("dataset", None))
-            if dataset_value is not None:
-
-                if metadata.datasets is None:
-                    metadata.datasets = []
-
-                if isinstance(dataset_value, str):
-                    metadata.datasets.append(dataset_value)
-                elif isinstance(dataset_value, list):
-                    metadata.datasets.extend(dataset_value)
+            use_model_card_metadata("license", "license")
+            use_model_card_metadata("license_name", "license_name")
+            use_model_card_metadata("license_link", "license_link")
+
+            use_array_model_card_metadata("tags", "tags")
+            use_array_model_card_metadata("tags", "pipeline_tag")
+
+            use_array_model_card_metadata("languages", "languages")
+            use_array_model_card_metadata("languages", "language")
+
+            use_array_model_card_metadata("datasets", "datasets")
+            use_array_model_card_metadata("datasets", "dataset")

         # Hugging Face Parameter Heuristics
         ####################################
@@ -64,6 +64,7 @@ while read c; do
         src/ggml*.cu \
         src/ggml-cuda/* \
         src/ggml-sycl/* \
+        src/vulkan-shaders/* \
         include/ggml*.h \
         tests/test-opt.cpp \
         tests/test-grad0.cpp \
@@ -1 +1 @@
-31d544f87835a55602883fe09156bb85a4c163d8
+18703ad600cc68dbdb04d57434c876989a841d12
@@ -1444,7 +1444,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
     return token != -1 && (
         token == llama_token_eos_impl(vocab) ||
-        token == llama_token_eot_impl(vocab)
+        token == llama_token_eot_impl(vocab) ||
+        token == llama_token_eom_impl(vocab)
     );
 }

@@ -1500,6 +1501,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
     return vocab.special_eot_id;
 }

+llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eom_id;
+}
+
 int32_t llama_tokenize_impl(
     const struct llama_vocab & vocab,
     const char * text,
@@ -45,6 +45,7 @@ struct llama_vocab {
     id special_suffix_id = -1;
     id special_middle_id = -1;
     id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
+    id special_eom_id    = -1;

     // tokenizer flags
     bool tokenizer_add_space_prefix = false;
@@ -101,6 +102,7 @@ llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
 llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eot_impl   (const struct llama_vocab & vocab);
+llama_token llama_token_eom_impl   (const struct llama_vocab & vocab);

 int32_t llama_tokenize_impl(
         const struct llama_vocab & vocab,
@@ -122,17 +122,14 @@ static std::string trim(const std::string & str) {
 }

 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
 }

 static bool is_float_close(float a, float b, float abs_tol) {
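The rewritten replace_all guards against an empty needle: std::string::find("") matches at every position, so an unguarded scan can spin forever, and advancing by replace.length() after each hit keeps the loop from re-matching inside freshly inserted text. A standalone copy to illustrate that behavior (same body as the new version above):

```cpp
#include <cstdio>
#include <string>

static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return; // Avoid infinite loop if 'search' is an empty string
    }
    size_t pos = 0;
    while ((pos = s.find(search, pos)) != std::string::npos) {
        s.replace(pos, search.length(), replace);
        pos += replace.length(); // skip past the inserted text
    }
}

int main() {
    std::string s = "a-b-c";
    replace_all(s, "-", "--"); // replacement contains the needle; no re-match
    std::printf("%s\n", s.c_str()); // prints: a--b--c
}
```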
@@ -362,6 +359,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_SUFFIX_ID,
     LLM_KV_TOKENIZER_MIDDLE_ID,
     LLM_KV_TOKENIZER_EOT_ID,
+    LLM_KV_TOKENIZER_EOM_ID,

     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
@@ -459,6 +457,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
     { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
     { LLM_KV_TOKENIZER_EOT_ID,    "tokenizer.ggml.eot_token_id"    },
+    { LLM_KV_TOKENIZER_EOM_ID,    "tokenizer.ggml.eom_token_id"    },

     { LLM_KV_ADAPTER_TYPE,       "adapter.type"       },
     { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
@@ -5586,6 +5585,7 @@ static void llm_load_vocab(
         { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
         { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
         { LLM_KV_TOKENIZER_EOT_ID,    vocab.special_eot_id    },
+        { LLM_KV_TOKENIZER_EOM_ID,    vocab.special_eom_id    },
     };

     for (const auto & it : special_token_types) {
@@ -5638,6 +5638,17 @@ static void llm_load_vocab(
             }
         }
     }
+
+    // find EOM token: "<|eom_id|>"
+    //
+    // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
+    //       for now, we apply this workaround to find the EOM token based on its text
+    if (vocab.special_eom_id == -1) {
+        const auto & t = vocab.token_to_id.find("<|eom_id|>");
+        if (t != vocab.token_to_id.end()) {
+            vocab.special_eom_id = t->second;
+        }
+    }
 }

 // build special tokens cache
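The fallback above is a plain text lookup in the vocabulary map, used only when the GGUF metadata did not carry tokenizer.ggml.eom_token_id. The same pattern in isolation — the token IDs are hypothetical and std::map stands in for the vocab's token_to_id container:

```cpp
#include <cstdio>
#include <map>
#include <string>

int main() {
    // hypothetical vocab fragment; real code uses vocab.token_to_id
    std::map<std::string, int> token_to_id = {
        {"<|eot_id|>", 42}, // made-up IDs for illustration
        {"<|eom_id|>", 43},
    };

    int special_eom_id = -1; // i.e. not provided by the GGUF metadata
    if (special_eom_id == -1) {
        const auto t = token_to_id.find("<|eom_id|>");
        if (t != token_to_id.end()) {
            special_eom_id = t->second; // recovered from the token text
        }
    }
    std::printf("special_eom_id = %d\n", special_eom_id);
}
```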
@@ -2271,9 +2271,10 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op

     for (ggml_type type_a : other_types) {
         for (ggml_type type_b : {GGML_TYPE_F32}) {
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), { 1, 1}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
+            if (ggml_blck_size(type_a) != 256) {
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1, 1}, {1, 1}));
+            }
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 1}));
         }
     }
