Merge branch 'master' of https://github.com/ggerganov/llama.cpp into ntkv2
This commit is contained in:
commit
4d5ed8349d
37 changed files with 816 additions and 274 deletions
|
@ -1,8 +1,7 @@
|
|||
---
|
||||
name: Issue and enhancement template
|
||||
about: Used to report issues and request enhancements for llama.cpp
|
||||
title: "[User] Insert summary of your issue or enhancement.."
|
||||
labels: ''
|
||||
name: Bug template
|
||||
about: Used to report bugs in llama.cpp
|
||||
labels: ["bug"]
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
@ -46,7 +45,7 @@ $ g++ --version
|
|||
|
||||
# Failure Information (for bugs)
|
||||
|
||||
Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template.
|
||||
Please help provide information about the failure / bug.
|
||||
|
||||
# Steps to Reproduce
|
||||
|
28
.github/ISSUE_TEMPLATE/enhancement.md
vendored
Normal file
28
.github/ISSUE_TEMPLATE/enhancement.md
vendored
Normal file
|
@ -0,0 +1,28 @@
|
|||
---
|
||||
name: Enhancement template
|
||||
about: Used to request enhancements for llama.cpp
|
||||
labels: ["enhancement"]
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
# Prerequisites
|
||||
|
||||
Please answer the following questions for yourself before submitting an issue.
|
||||
|
||||
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
|
||||
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
|
||||
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
|
||||
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
|
||||
|
||||
# Feature Description
|
||||
|
||||
Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
|
||||
|
||||
# Motivation
|
||||
|
||||
Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
|
||||
|
||||
# Possible Implementation
|
||||
|
||||
If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
|
|
@ -331,6 +331,7 @@ if (LLAMA_CUBLAS)
|
|||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
|
||||
#set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
|
|
5
Makefile
5
Makefile
|
@ -391,12 +391,9 @@ else
|
|||
endif #LLAMA_CUDA_NVCC
|
||||
ifdef CUDA_DOCKER_ARCH
|
||||
NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
|
||||
endif # CUDA_DOCKER_ARCH
|
||||
ifdef CUDA_NATIVE_ARCH
|
||||
NVCCFLAGS += -arch=$(CUDA_NATIVE_ARCH)
|
||||
else
|
||||
NVCCFLAGS += -arch=native
|
||||
endif # CUDA_NATIVE_ARCH
|
||||
endif # CUDA_DOCKER_ARCH
|
||||
ifdef LLAMA_CUDA_FORCE_DMMV
|
||||
NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
|
||||
endif # LLAMA_CUDA_FORCE_DMMV
|
||||
|
|
|
@ -933,13 +933,13 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
|||
}
|
||||
|
||||
if (params.ignore_eos) {
|
||||
params.sparams.logit_bias[llama_token_eos(lctx)] = -INFINITY;
|
||||
params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
|
||||
}
|
||||
|
||||
{
|
||||
LOG("warming up the model with an empty run\n");
|
||||
|
||||
std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
|
||||
std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
|
||||
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
|
||||
llama_kv_cache_tokens_rm(lctx, -1, -1);
|
||||
llama_reset_timings(lctx);
|
||||
|
@ -994,7 +994,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
|
|||
}
|
||||
|
||||
std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
|
||||
const llama_token bos_id = llama_token_bos(ctx);
|
||||
const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
|
||||
|
||||
std::string piece;
|
||||
std::string result;
|
||||
|
@ -1239,7 +1239,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
|||
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
||||
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
||||
|
||||
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
|
||||
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
|
||||
const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
||||
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
||||
|
||||
|
|
|
@ -147,7 +147,7 @@ llama_token llama_sampling_sample(
|
|||
|
||||
// apply penalties
|
||||
if (!prev.empty()) {
|
||||
const float nl_logit = logits[llama_token_nl(ctx_main)];
|
||||
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
|
||||
|
||||
llama_sample_repetition_penalties(ctx_main, &cur_p,
|
||||
prev.data() + prev.size() - penalty_last_n,
|
||||
|
@ -155,7 +155,7 @@ llama_token llama_sampling_sample(
|
|||
|
||||
if (!penalize_nl) {
|
||||
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
||||
if (cur_p.data[idx].id == llama_token_nl(ctx_main)) {
|
||||
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
|
||||
cur_p.data[idx].logit = nl_logit;
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -236,8 +236,8 @@ int64_t get_example_targets_batch(
|
|||
int64_t used_samples = 0;
|
||||
|
||||
ggml_set_f32(target_probs, 0.0f);
|
||||
llama_token bos = llama_token_bos(lctx);
|
||||
llama_token eos = llama_token_eos(lctx);
|
||||
llama_token bos = llama_token_bos(llama_get_model(lctx));
|
||||
llama_token eos = llama_token_eos(llama_get_model(lctx));
|
||||
// printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
|
||||
for (int k=0; k<n_batch; ++k) {
|
||||
// printf("%s: batch %d\n", __func__, k);
|
||||
|
@ -924,7 +924,7 @@ size_t tokenize_file(
|
|||
for (llama_token token=0; token < n_vocab; ++token) {
|
||||
max_token_text_size = std::max(
|
||||
max_token_text_size,
|
||||
strlen(llama_token_get_text(lctx, token)));
|
||||
strlen(llama_token_get_text(llama_get_model(lctx), token)));
|
||||
}
|
||||
|
||||
// upper bound of context byte length.
|
||||
|
|
|
@ -118,15 +118,24 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
|||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
added_vocab = tokenizer.get_added_vocab()
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
if tokenizer.added_tokens_decoder[i].special:
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))
|
||||
|
|
|
@ -123,15 +123,24 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
|||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
added_vocab = tokenizer.get_added_vocab()
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
if tokenizer.added_tokens_decoder[i].special:
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
|
||||
|
|
|
@ -136,9 +136,11 @@ for i in range(vocab_size):
|
|||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
# NOTE: wouldn't we like to distinguish CONTROL tokens here?
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
if tokenizer.added_tokens_decoder[i].special:
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
|
|
@ -139,15 +139,24 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
|||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
added_vocab = tokenizer.get_added_vocab()
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
if tokenizer.added_tokens_decoder[i].special:
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))
|
||||
|
|
|
@ -111,17 +111,25 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
|||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
added_vocab = tokenizer.get_added_vocab()
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
if tokenizer.added_tokens_decoder[i].special:
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ int main(int argc, char ** argv) {
|
|||
gpt_params params;
|
||||
|
||||
if (argc == 1 || argv[1][0] == '-') {
|
||||
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN]\n" , argv[0]);
|
||||
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
|
||||
return 1 ;
|
||||
}
|
||||
|
||||
|
@ -21,6 +21,9 @@ int main(int argc, char ** argv) {
|
|||
// total length of the sequences including the prompt
|
||||
int n_len = 32;
|
||||
|
||||
// number of layers to offload to the GPU
|
||||
int n_gpu_layers = 0;
|
||||
|
||||
if (argc >= 2) {
|
||||
params.model = argv[1];
|
||||
}
|
||||
|
@ -37,6 +40,10 @@ int main(int argc, char ** argv) {
|
|||
n_len = std::atoi(argv[4]);
|
||||
}
|
||||
|
||||
if (argc >= 6) {
|
||||
n_gpu_layers = std::atoi(argv[5]);
|
||||
}
|
||||
|
||||
if (params.prompt.empty()) {
|
||||
params.prompt = "Hello my name is";
|
||||
}
|
||||
|
@ -49,7 +56,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
// model_params.n_gpu_layers = 99; // offload all layers to the GPU
|
||||
model_params.n_gpu_layers = n_gpu_layers;
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||
|
||||
|
@ -180,7 +187,7 @@ int main(int argc, char ** argv) {
|
|||
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
|
||||
// is it an end of stream? -> mark the stream as finished
|
||||
if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
|
||||
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
|
||||
i_batch[i] = -1;
|
||||
LOG_TEE("\n");
|
||||
if (n_parallel > 1) {
|
||||
|
|
|
@ -47,7 +47,7 @@ struct beam_search_callback_data {
|
|||
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
|
||||
// For example, eob can be flagged due to maximum token length, stop words, etc.
|
||||
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
|
||||
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
|
||||
return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
|
||||
}
|
||||
|
||||
// Function matching type llama_beam_search_callback_fn_t.
|
||||
|
|
|
@ -246,14 +246,14 @@ int main(int argc, char ** argv) {
|
|||
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
|
||||
inp_sfx.erase(inp_sfx.begin());
|
||||
}
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
|
||||
if (add_bos) {
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
|
||||
}
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
|
||||
embd_inp = inp_pfx;
|
||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||
embd_inp.push_back(llama_token_middle(ctx));
|
||||
embd_inp.push_back(llama_token_middle(model));
|
||||
|
||||
LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
|
||||
LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
|
||||
|
@ -261,7 +261,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
embd_inp.push_back(llama_token_bos(ctx));
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
}
|
||||
|
||||
|
@ -577,10 +577,10 @@ int main(int argc, char ** argv) {
|
|||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
|
||||
// deal with eot token in infill mode
|
||||
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(ctx) || is_interacting) && params.interactive){
|
||||
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
|
||||
if(is_interacting && !params.interactive_first) {
|
||||
// print an eot token
|
||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
|
||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n");
|
||||
|
@ -627,14 +627,14 @@ int main(int argc, char ** argv) {
|
|||
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
|
||||
inp_sfx.erase(inp_sfx.begin());
|
||||
}
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
|
||||
if (add_bos) {
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
|
||||
}
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
|
||||
embd_inp = inp_pfx;
|
||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||
embd_inp.push_back(llama_token_middle(ctx));
|
||||
embd_inp.push_back(llama_token_middle(model));
|
||||
embd.clear();
|
||||
embd_guidance.clear();
|
||||
n_remain = params.n_predict;
|
||||
|
@ -644,7 +644,7 @@ int main(int argc, char ** argv) {
|
|||
is_interacting = false;
|
||||
}
|
||||
// deal with end of text token in interactive mode
|
||||
else if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) {
|
||||
else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
|
||||
LOG("found EOS token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
|
@ -661,7 +661,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG("adding input prefix BOS token\n");
|
||||
embd_inp.push_back(llama_token_bos(ctx));
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
|
@ -724,7 +724,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// end of text token
|
||||
if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !params.interactive) {
|
||||
if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -736,7 +736,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
if (!params.interactive && n_remain <= 0) {
|
||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
|
||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
|
|
|
@ -933,7 +933,7 @@ struct sql_printer : public printer {
|
|||
};
|
||||
|
||||
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
|
||||
std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
|
||||
std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
|
||||
int n_processed = 0;
|
||||
|
||||
llama_set_n_threads(ctx, n_threads, n_threads);
|
||||
|
@ -946,7 +946,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
|
|||
}
|
||||
|
||||
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
|
||||
llama_token token = llama_token_bos(ctx);
|
||||
llama_token token = llama_token_bos(llama_get_model(ctx));
|
||||
|
||||
llama_set_n_threads(ctx, n_threads, n_threads);
|
||||
|
||||
|
|
|
@ -137,7 +137,7 @@ inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
|
|||
inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
|
||||
int id = sample_id(ctx_llama, params);
|
||||
static std::string ret;
|
||||
if (id == llama_token_eos(ctx_llama)) {
|
||||
if (id == llama_token_eos(llama_get_model(ctx_llama))) {
|
||||
ret = "</s>";
|
||||
} else {
|
||||
ret = llama_token_to_piece(ctx_llama, id);
|
||||
|
|
|
@ -16,6 +16,8 @@ add_library(common OBJECT
|
|||
${_common_path}/console.cpp
|
||||
${_common_path}/grammar-parser.h
|
||||
${_common_path}/grammar-parser.cpp
|
||||
${_common_path}/sampling.h
|
||||
${_common_path}/sampling.cpp
|
||||
)
|
||||
|
||||
# WARNING: because build-info.h is auto-generated, it will only
|
||||
|
|
|
@ -248,7 +248,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
embd_inp.push_back(llama_token_bos(ctx));
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
}
|
||||
|
||||
|
@ -693,7 +693,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// deal with end of text token in interactive mode
|
||||
if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) {
|
||||
if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
|
||||
LOG("found EOS token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
|
@ -720,7 +720,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG("adding input prefix BOS token\n");
|
||||
embd_inp.push_back(llama_token_bos(ctx));
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
|
@ -804,7 +804,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// end of text token
|
||||
if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
|
||||
if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) {
|
||||
LOG_TEE(" [end of text]\n");
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -347,7 +347,7 @@ int main(int argc, char ** argv) {
|
|||
// client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
|
||||
|
||||
if (client.n_decoded > 2 &&
|
||||
(id == llama_token_eos(ctx) ||
|
||||
(id == llama_token_eos(model) ||
|
||||
(params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
|
||||
client.response.find("User:") != std::string::npos ||
|
||||
client.response.find('\n') != std::string::npos)) {
|
||||
|
|
|
@ -227,7 +227,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
|||
|
||||
// add BOS token for the first batch of each chunk
|
||||
if (add_bos && j == 0) {
|
||||
tokens[batch_start] = llama_token_bos(ctx);
|
||||
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
|
||||
}
|
||||
|
||||
const auto batch_logits = llama_get_logits(ctx);
|
||||
|
@ -350,7 +350,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
|||
|
||||
// add BOS token for the first batch of each chunk
|
||||
if (add_bos && j == 0) {
|
||||
tokens[batch_start] = llama_token_bos(ctx);
|
||||
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
|
||||
|
|
|
@ -454,7 +454,7 @@ struct llama_client_slot
|
|||
}
|
||||
|
||||
void release() {
|
||||
if (state == PROCESSING)
|
||||
if (state == IDLE || state == PROCESSING)
|
||||
{
|
||||
t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
|
||||
command = RELEASE;
|
||||
|
@ -726,7 +726,7 @@ struct llama_server_context
|
|||
|
||||
if (json_value(data, "ignore_eos", false))
|
||||
{
|
||||
slot->sparams.logit_bias[llama_token_eos(ctx)] = -INFINITY;
|
||||
slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
|
||||
}
|
||||
|
||||
const auto &logit_bias = data.find("logit_bias");
|
||||
|
@ -754,6 +754,7 @@ struct llama_server_context
|
|||
}
|
||||
|
||||
slot->params.antiprompt.clear();
|
||||
|
||||
const auto &stop = data.find("stop");
|
||||
if (stop != data.end() && stop->is_array())
|
||||
{
|
||||
|
@ -867,7 +868,7 @@ struct llama_server_context
|
|||
|
||||
kv_cache_clear();
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i)
|
||||
for (int i = 0; i < (int) system_tokens.size(); ++i)
|
||||
{
|
||||
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
|
||||
}
|
||||
|
@ -894,16 +895,8 @@ struct llama_server_context
|
|||
{
|
||||
slot.release();
|
||||
}
|
||||
wait_all_are_idle();
|
||||
all_slots_are_idle = true;
|
||||
|
||||
// wait until system prompt load
|
||||
system_need_update = true;
|
||||
while (system_need_update)
|
||||
{
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(5));
|
||||
}
|
||||
// system prompt loaded, continue
|
||||
}
|
||||
|
||||
void process_system_prompt_data(const json &sys_props) {
|
||||
|
@ -915,26 +908,6 @@ struct llama_server_context
|
|||
{
|
||||
notify_system_prompt_changed();
|
||||
}
|
||||
else
|
||||
{
|
||||
system_need_update = true;
|
||||
}
|
||||
}
|
||||
|
||||
void wait_all_are_idle() {
|
||||
bool wait = true;
|
||||
while (wait)
|
||||
{
|
||||
wait = false;
|
||||
for (auto &slot : slots)
|
||||
{
|
||||
if (!slot.available())
|
||||
{
|
||||
wait = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
|
||||
|
@ -965,7 +938,6 @@ struct llama_server_context
|
|||
slot.has_next_token = false;
|
||||
}
|
||||
stop_pos = pos;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1056,7 +1028,7 @@ struct llama_server_context
|
|||
slot.has_next_token = false;
|
||||
}
|
||||
|
||||
if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(ctx))
|
||||
if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
|
||||
{
|
||||
slot.stopped_eos = true;
|
||||
slot.has_next_token = false;
|
||||
|
@ -1130,7 +1102,7 @@ struct llama_server_context
|
|||
|
||||
json get_formated_generation(llama_client_slot &slot)
|
||||
{
|
||||
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(ctx));
|
||||
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
|
||||
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
|
||||
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
|
||||
return json {
|
||||
|
@ -1444,7 +1416,7 @@ struct llama_server_context
|
|||
process_tasks();
|
||||
|
||||
// update the system prompt wait until all slots are idle state
|
||||
if (system_need_update)
|
||||
if (system_need_update && all_slots_are_idle)
|
||||
{
|
||||
LOG_TEE("updating system prompt\n");
|
||||
update_system_prompt();
|
||||
|
@ -1498,7 +1470,7 @@ struct llama_server_context
|
|||
for (auto & slot : slots)
|
||||
{
|
||||
// release the slot
|
||||
if (slot.state == PROCESSING && slot.command == RELEASE)
|
||||
if (slot.command == RELEASE)
|
||||
{
|
||||
slot.state = IDLE;
|
||||
slot.command = NONE;
|
||||
|
@ -1509,7 +1481,7 @@ struct llama_server_context
|
|||
continue;
|
||||
}
|
||||
|
||||
if (slot.state == IDLE || slot.command == RELEASE)
|
||||
if (slot.state == IDLE)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
@ -1530,6 +1502,17 @@ struct llama_server_context
|
|||
{
|
||||
for (auto & slot : slots)
|
||||
{
|
||||
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
|
||||
|
||||
// empty prompt passed -> release the slot and send empty response
|
||||
if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
|
||||
{
|
||||
slot.release();
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
continue;
|
||||
}
|
||||
|
||||
// need process the prompt
|
||||
if (slot.state == IDLE && slot.command == LOAD_PROMPT)
|
||||
{
|
||||
|
@ -1555,11 +1538,11 @@ struct llama_server_context
|
|||
suffix_tokens.erase(suffix_tokens.begin());
|
||||
}
|
||||
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
|
||||
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
|
||||
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
|
||||
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
|
||||
prefix_tokens.push_back(llama_token_middle(ctx));
|
||||
prefix_tokens.push_back(llama_token_middle(model));
|
||||
prompt_tokens = prefix_tokens;
|
||||
}
|
||||
else
|
||||
|
@ -1749,8 +1732,8 @@ struct llama_server_context
|
|||
if (!process_token(result, slot))
|
||||
{
|
||||
slot.release();
|
||||
send_final_response(slot);
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
}
|
||||
|
||||
slot.i_batch = -1;
|
||||
|
@ -1766,21 +1749,22 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|||
printf("usage: %s [options]\n", argv0);
|
||||
printf("\n");
|
||||
printf("options:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" --rope-scaling {none,linear,yarn}\n");
|
||||
printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
|
||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
|
||||
printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
|
||||
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
|
||||
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
|
||||
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
|
||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
|
||||
printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
|
||||
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
|
||||
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
|
||||
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
if (llama_mlock_supported())
|
||||
{
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
|
@ -1975,6 +1959,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
}
|
||||
params.n_threads = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "--threads-batch" || arg == "-tb")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads_batch = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "-b" || arg == "--batch-size")
|
||||
{
|
||||
if (++i >= argc)
|
||||
|
@ -2336,7 +2329,7 @@ int main(int argc, char **argv)
|
|||
if (!json_value(data, "stream", false)) {
|
||||
std::string completion_text;
|
||||
task_result result = llama.next_result(task_id);
|
||||
if(!result.error && result.stop) {
|
||||
if (!result.error && result.stop) {
|
||||
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
|
||||
}
|
||||
else
|
||||
|
@ -2363,7 +2356,7 @@ int main(int argc, char **argv)
|
|||
{
|
||||
return false;
|
||||
}
|
||||
if(result.stop) {
|
||||
if (result.stop) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
|
|
|
@ -138,7 +138,7 @@ int main(int argc, char ** argv) {
|
|||
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
|
||||
// is it an end of stream?
|
||||
if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
|
||||
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
|
||||
LOG_TEE("\n");
|
||||
|
||||
break;
|
||||
|
|
|
@ -163,7 +163,7 @@ int main(int argc, char ** argv) {
|
|||
printf("%s", token_str.c_str());
|
||||
fflush(stdout);
|
||||
|
||||
if (id == llama_token_eos(ctx_tgt)) {
|
||||
if (id == llama_token_eos(model_tgt)) {
|
||||
has_eos = true;
|
||||
}
|
||||
|
||||
|
|
200
ggml-cuda.cu
200
ggml-cuda.cu
|
@ -29,6 +29,8 @@
|
|||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
#define cublasCreate hipblasCreate
|
||||
#define cublasGemmEx hipblasGemmEx
|
||||
#define cublasGemmBatchedEx hipblasGemmBatchedEx
|
||||
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
|
||||
#define cublasHandle_t hipblasHandle_t
|
||||
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
||||
#define cublasSetStream hipblasSetStream
|
||||
|
@ -4326,13 +4328,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
|||
|
||||
const half * x = (const half *) vx;
|
||||
|
||||
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
||||
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
||||
const int channel_x = channel / channel_x_divisor;
|
||||
|
||||
const int nrows_y = ncols_x;
|
||||
const int nrows_y = ncols_x;
|
||||
const int nrows_dst = nrows_x;
|
||||
const int row_dst = row_x;
|
||||
const int row_dst = row_x;
|
||||
|
||||
const int idst = channel*nrows_dst + row_dst;
|
||||
|
||||
|
@ -4345,13 +4347,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
|||
break;
|
||||
}
|
||||
|
||||
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
|
||||
const float xi = __half2float(x[ix]);
|
||||
|
||||
const int row_y = col_x;
|
||||
|
||||
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
|
||||
const int iy = channel*nrows_y + row_y;
|
||||
|
||||
const float xi = __half2float(x[ix]);
|
||||
|
||||
tmp += xi * y[iy];
|
||||
}
|
||||
|
||||
|
@ -5698,10 +5700,10 @@ void ggml_init_cublas() {
|
|||
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
|
||||
int64_t total_vram = 0;
|
||||
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
||||
fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
|
||||
fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
|
||||
|
||||
g_tensor_split[id] = total_vram;
|
||||
total_vram += prop.totalGlobalMem;
|
||||
|
@ -5711,15 +5713,15 @@ void ggml_init_cublas() {
|
|||
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
g_tensor_split[id] /= total_vram;
|
||||
}
|
||||
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
CUDA_CHECK(ggml_cuda_set_device(id));
|
||||
|
||||
// create cuda streams
|
||||
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
||||
for (int is = 0; is < MAX_STREAMS; ++is) {
|
||||
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
|
||||
}
|
||||
|
||||
|
@ -7063,7 +7065,8 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
|
|||
}
|
||||
|
||||
static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
||||
GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
GGML_ASSERT(!ggml_is_permuted(src0));
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
|
@ -7073,11 +7076,11 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
|||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
|
||||
const int64_t nb01 = src0->nb[1];
|
||||
const int64_t nb02 = src0->nb[2];
|
||||
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
|
||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
||||
|
||||
|
@ -7096,6 +7099,159 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
|||
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[3];
|
||||
|
||||
const int64_t nb01 = src0->nb[1];
|
||||
const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
|
||||
const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
const int64_t ne13 = src1->ne[3];
|
||||
|
||||
const int64_t nb11 = src1->nb[1];
|
||||
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
|
||||
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
|
||||
|
||||
const int64_t ne1 = ggml_nelements(src1);
|
||||
const int64_t ne = ggml_nelements(dst);
|
||||
|
||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
||||
|
||||
int id;
|
||||
CUDA_CHECK(cudaGetDevice(&id));
|
||||
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
|
||||
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
void * src0_ddq = src0_extra->data_device[g_main_device];
|
||||
half * src0_as_f16 = (half *) src0_ddq;
|
||||
|
||||
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
||||
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
||||
|
||||
// convert src1 to fp16
|
||||
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
||||
GGML_ASSERT(to_fp16_cuda != nullptr);
|
||||
|
||||
size_t src1_as = 0;
|
||||
half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
|
||||
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
|
||||
|
||||
size_t dst_as = 0;
|
||||
half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
|
||||
|
||||
GGML_ASSERT(ne12 % ne02 == 0);
|
||||
GGML_ASSERT(ne13 % ne03 == 0);
|
||||
|
||||
// broadcast factors
|
||||
const int64_t r2 = ne12/ne02;
|
||||
const int64_t r3 = ne13/ne03;
|
||||
|
||||
const half alpha_f16 = 1.0f;
|
||||
const half beta_f16 = 0.0f;
|
||||
|
||||
#if 0
|
||||
// use cublasGemmEx
|
||||
{
|
||||
for (int i13 = 0; i13 < ne13; ++i13) {
|
||||
for (int i12 = 0; i12 < ne12; ++i12) {
|
||||
int i03 = i13 / r3;
|
||||
int i02 = i12 / r2;
|
||||
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
&alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
|
||||
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
|
||||
&beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
|
||||
CUBLAS_COMPUTE_16F,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
|
||||
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
||||
// use cublasGemmStridedBatchedEx
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
&alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
|
||||
(const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
|
||||
&beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC
|
||||
ne12*ne13,
|
||||
CUBLAS_COMPUTE_16F,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
} else {
|
||||
// use cublasGemmBatchedEx
|
||||
// TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
|
||||
const int ne23 = ne12*ne13;
|
||||
|
||||
// TODO: avoid this alloc
|
||||
void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));
|
||||
|
||||
for (int i13 = 0; i13 < ne13; ++i13) {
|
||||
for (int i12 = 0; i12 < ne12; ++i12) {
|
||||
int i03 = i13 / r3;
|
||||
int i02 = i12 / r2;
|
||||
|
||||
ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3];
|
||||
ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
|
||||
ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
|
||||
}
|
||||
}
|
||||
|
||||
// allocate device memory for pointers
|
||||
void ** ptrs_as = nullptr;
|
||||
CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
|
||||
|
||||
// TODO: this does not work for some reason -- not sure why?
|
||||
//size_t ptrs_s = 0;
|
||||
//ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
|
||||
|
||||
// copy pointers to device
|
||||
CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice));
|
||||
|
||||
free(ptrs);
|
||||
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
&alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
|
||||
(const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
|
||||
&beta_f16, ( void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
|
||||
ne23,
|
||||
CUBLAS_COMPUTE_16F,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
|
||||
// free device memory for pointers
|
||||
CUDA_CHECK(cudaFree(ptrs_as));
|
||||
//ggml_cuda_pool_free(ptrs_as, ptrs_s);
|
||||
}
|
||||
#endif
|
||||
|
||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
||||
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
|
||||
|
||||
ggml_cuda_pool_free(src1_as_f16, src1_as);
|
||||
ggml_cuda_pool_free(dst_f16, dst_as);
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
|
||||
src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
|
||||
|
@ -7108,10 +7264,22 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|||
}
|
||||
}
|
||||
|
||||
// debug helpers
|
||||
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
||||
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
|
||||
//printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
|
||||
//printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
|
||||
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
||||
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
||||
|
||||
if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||
// KQ
|
||||
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
||||
} else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
|
||||
} else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||
// KQV
|
||||
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
||||
} else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
||||
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
||||
} else if (src0->type == GGML_TYPE_F32) {
|
||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
||||
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
||||
|
|
18
ggml-metal.m
18
ggml-metal.m
|
@ -62,6 +62,7 @@ struct ggml_metal_context {
|
|||
GGML_METAL_DECL_KERNEL(mul);
|
||||
GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
|
||||
GGML_METAL_DECL_KERNEL(scale);
|
||||
GGML_METAL_DECL_KERNEL(scale_4);
|
||||
GGML_METAL_DECL_KERNEL(silu);
|
||||
GGML_METAL_DECL_KERNEL(relu);
|
||||
GGML_METAL_DECL_KERNEL(gelu);
|
||||
|
@ -249,6 +250,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
GGML_METAL_ADD_KERNEL(mul);
|
||||
GGML_METAL_ADD_KERNEL(mul_row);
|
||||
GGML_METAL_ADD_KERNEL(scale);
|
||||
GGML_METAL_ADD_KERNEL(scale_4);
|
||||
GGML_METAL_ADD_KERNEL(silu);
|
||||
GGML_METAL_ADD_KERNEL(relu);
|
||||
GGML_METAL_ADD_KERNEL(gelu);
|
||||
|
@ -347,6 +349,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|||
GGML_METAL_DEL_KERNEL(mul);
|
||||
GGML_METAL_DEL_KERNEL(mul_row);
|
||||
GGML_METAL_DEL_KERNEL(scale);
|
||||
GGML_METAL_DEL_KERNEL(scale_4);
|
||||
GGML_METAL_DEL_KERNEL(silu);
|
||||
GGML_METAL_DEL_KERNEL(relu);
|
||||
GGML_METAL_DEL_KERNEL(gelu);
|
||||
|
@ -923,15 +926,20 @@ void ggml_metal_graph_compute(
|
|||
|
||||
const float scale = *(const float *) src1->data;
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_scale];
|
||||
int64_t n = ggml_nelements(dst);
|
||||
|
||||
if (n % 4 == 0) {
|
||||
n /= 4;
|
||||
[encoder setComputePipelineState:ctx->pipeline_scale_4];
|
||||
} else {
|
||||
[encoder setComputePipelineState:ctx->pipeline_scale];
|
||||
}
|
||||
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
GGML_ASSERT(n % 4 == 0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(gf->nodes[i])) {
|
||||
|
|
|
@ -125,9 +125,17 @@ kernel void kernel_mul_row(
|
|||
}
|
||||
|
||||
kernel void kernel_scale(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
constant float & scale,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] * scale;
|
||||
}
|
||||
|
||||
kernel void kernel_scale_4(
|
||||
device const float4 * src0,
|
||||
device float4 * dst,
|
||||
constant float & scale,
|
||||
constant float & scale,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] * scale;
|
||||
}
|
||||
|
|
442
ggml.c
442
ggml.c
|
@ -572,7 +572,6 @@ int64_t ggml_cycles_per_ms(void) {
|
|||
#define ggml_perf_cycles_per_ms() 0
|
||||
#endif
|
||||
|
||||
|
||||
//
|
||||
// cache line
|
||||
//
|
||||
|
@ -1829,7 +1828,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
|
|||
return type_traits[type];
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// simd mappings
|
||||
//
|
||||
|
@ -4058,16 +4056,17 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|||
"ALIBI",
|
||||
"CLAMP",
|
||||
"CONV_1D",
|
||||
"CONV_1D_STAGE_0",
|
||||
"CONV_1D_STAGE_1",
|
||||
"CONV_TRANSPOSE_1D",
|
||||
"CONV_2D",
|
||||
"CONV_2D_STAGE_0",
|
||||
"CONV_2D_STAGE_1",
|
||||
"CONV_TRANSPOSE_2D",
|
||||
"POOL_1D",
|
||||
"POOL_2D",
|
||||
"UPSCALE",
|
||||
|
||||
"CONV_1D_STAGE_0",
|
||||
"CONV_1D_STAGE_1",
|
||||
|
||||
"FLASH_ATTN",
|
||||
"FLASH_FF",
|
||||
"FLASH_ATTN_BACK",
|
||||
|
@ -4093,7 +4092,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|||
"CROSS_ENTROPY_LOSS_BACK",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71");
|
||||
static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
|
||||
|
||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"none",
|
||||
|
@ -4144,16 +4143,17 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|||
"alibi(x)",
|
||||
"clamp(x)",
|
||||
"conv_1d(x)",
|
||||
"conv_1d_stage_0(x)",
|
||||
"conv_1d_stage_1(x)",
|
||||
"conv_transpose_1d(x)",
|
||||
"conv_2d(x)",
|
||||
"conv_2d_stage_0(x)",
|
||||
"conv_2d_stage_1(x)",
|
||||
"conv_transpose_2d(x)",
|
||||
"pool_1d(x)",
|
||||
"pool_2d(x)",
|
||||
"upscale(x)",
|
||||
|
||||
"conv_1d_stage_0(x)",
|
||||
"conv_1d_stage_1(x)",
|
||||
|
||||
"flash_attn(x)",
|
||||
"flash_ff(x)",
|
||||
"flash_attn_back(x)",
|
||||
|
@ -4179,7 +4179,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|||
"cross_entropy_loss_back(x,y)",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71");
|
||||
static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
|
||||
|
||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||
|
||||
|
@ -4210,8 +4210,10 @@ static void ggml_setup_op_has_task_pass(void) {
|
|||
p[GGML_OP_CONV_1D ] = true;
|
||||
p[GGML_OP_CONV_1D_STAGE_0 ] = true;
|
||||
p[GGML_OP_CONV_1D_STAGE_1 ] = true;
|
||||
p[GGML_OP_CONV_2D ] = true;
|
||||
p[GGML_OP_CONV_TRANSPOSE_1D ] = true;
|
||||
p[GGML_OP_CONV_2D ] = true;
|
||||
p[GGML_OP_CONV_2D_STAGE_0 ] = true;
|
||||
p[GGML_OP_CONV_2D_STAGE_1 ] = true;
|
||||
p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
|
||||
p[GGML_OP_FLASH_ATTN_BACK ] = true;
|
||||
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
|
||||
|
@ -5955,7 +5957,6 @@ struct ggml_tensor * ggml_sqrt_inplace(
|
|||
return ggml_sqrt_impl(ctx, a, true);
|
||||
}
|
||||
|
||||
|
||||
// ggml_log
|
||||
|
||||
static struct ggml_tensor * ggml_log_impl(
|
||||
|
@ -6009,7 +6010,6 @@ struct ggml_tensor * ggml_sum(
|
|||
return result;
|
||||
}
|
||||
|
||||
|
||||
// ggml_sum_rows
|
||||
|
||||
struct ggml_tensor * ggml_sum_rows(
|
||||
|
@ -6641,7 +6641,6 @@ struct ggml_tensor * ggml_set_2d_inplace(
|
|||
return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
|
||||
}
|
||||
|
||||
|
||||
// ggml_cpy
|
||||
|
||||
static struct ggml_tensor * ggml_cpy_impl(
|
||||
|
@ -6721,7 +6720,6 @@ struct ggml_tensor * ggml_cont_inplace(
|
|||
return ggml_cont_impl(ctx, a, true);
|
||||
}
|
||||
|
||||
|
||||
// make contiguous, with new shape
|
||||
GGML_API struct ggml_tensor * ggml_cont_1d(
|
||||
struct ggml_context * ctx,
|
||||
|
@ -7174,7 +7172,6 @@ struct ggml_tensor * ggml_diag(
|
|||
return result;
|
||||
}
|
||||
|
||||
|
||||
// ggml_diag_mask_inf
|
||||
|
||||
static struct ggml_tensor * ggml_diag_mask_inf_impl(
|
||||
|
@ -7286,7 +7283,6 @@ struct ggml_tensor * ggml_soft_max_inplace(
|
|||
return ggml_soft_max_impl(ctx, a, true);
|
||||
}
|
||||
|
||||
|
||||
// ggml_soft_max_back
|
||||
|
||||
static struct ggml_tensor * ggml_soft_max_back_impl(
|
||||
|
@ -7732,7 +7728,11 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
|
|||
|
||||
// ggml_conv_2d
|
||||
|
||||
struct ggml_tensor * ggml_conv_2d(
|
||||
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
|
||||
// a: [OC,IC, KH, KW]
|
||||
// b: [N, IC, IH, IW]
|
||||
// result: [N, OH, OW, IC*KH*KW]
|
||||
static struct ggml_tensor * ggml_conv_2d_stage_0(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
|
@ -7751,17 +7751,21 @@ struct ggml_tensor * ggml_conv_2d(
|
|||
is_node = true;
|
||||
}
|
||||
|
||||
const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
|
||||
const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
|
||||
|
||||
const int64_t ne[4] = {
|
||||
ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
|
||||
ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
|
||||
a->ne[3], b->ne[3],
|
||||
a->ne[2] * a->ne[1] * a->ne[0],
|
||||
OW,
|
||||
OH,
|
||||
b->ne[3],
|
||||
};
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
|
||||
|
||||
int32_t params[] = { s0, s1, p0, p1, d0, d1 };
|
||||
ggml_set_op_params(result, params, sizeof(params));
|
||||
|
||||
result->op = GGML_OP_CONV_2D;
|
||||
result->op = GGML_OP_CONV_2D_STAGE_0;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = b;
|
||||
|
@ -7770,8 +7774,61 @@ struct ggml_tensor * ggml_conv_2d(
|
|||
|
||||
}
|
||||
|
||||
// ggml_conv_2d_sk_p0
|
||||
// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
|
||||
// a: [OC, IC, KH, KW]
|
||||
// b: [N, OH, OW, IC * KH * KW]
|
||||
// result: [N, OC, OH, OW]
|
||||
static struct ggml_tensor * ggml_conv_2d_stage_1(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b) {
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
if (a->grad || b->grad) {
|
||||
GGML_ASSERT(false); // TODO: implement backward
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
const int64_t ne[4] = {
|
||||
b->ne[1],
|
||||
b->ne[2],
|
||||
a->ne[3],
|
||||
b->ne[3],
|
||||
};
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
||||
|
||||
result->op = GGML_OP_CONV_2D_STAGE_1;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = b;
|
||||
|
||||
return result;
|
||||
|
||||
}
|
||||
|
||||
// a: [OC,IC, KH, KW]
|
||||
// b: [N, IC, IH, IW]
|
||||
// result: [N, OC, OH, OW]
|
||||
struct ggml_tensor * ggml_conv_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1,
|
||||
int d0,
|
||||
int d1) {
|
||||
|
||||
struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW]
|
||||
result = ggml_conv_2d_stage_1(ctx, a, result);
|
||||
|
||||
return result;
|
||||
|
||||
}
|
||||
|
||||
// ggml_conv_2d_sk_p0
|
||||
struct ggml_tensor * ggml_conv_2d_sk_p0(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
@ -8210,7 +8267,6 @@ static struct ggml_tensor * ggml_add_rel_pos_impl(
|
|||
return result;
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * ggml_add_rel_pos(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
@ -8655,8 +8711,6 @@ struct ggml_tensor * ggml_map_custom3_inplace(
|
|||
return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// ggml_cross_entropy_loss
|
||||
|
||||
struct ggml_tensor * ggml_cross_entropy_loss(
|
||||
|
@ -9858,7 +9912,6 @@ static void ggml_compute_forward_add1(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// ggml_compute_forward_acc
|
||||
|
||||
static void ggml_compute_forward_acc_f32(
|
||||
|
@ -9998,7 +10051,6 @@ static void ggml_compute_forward_sub_f32(
|
|||
const int i2 = (ir - i3*ne2*ne1)/ne1;
|
||||
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
|
||||
|
||||
|
||||
#ifdef GGML_USE_ACCELERATE
|
||||
vDSP_vsub(
|
||||
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
|
||||
|
@ -10179,7 +10231,6 @@ static void ggml_compute_forward_div_f32(
|
|||
const int i2 = (ir - i3*ne2*ne1)/ne1;
|
||||
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
|
||||
|
||||
|
||||
#ifdef GGML_USE_ACCELERATE
|
||||
UNUSED(ggml_vec_div_f32);
|
||||
|
||||
|
@ -10317,7 +10368,6 @@ static void ggml_compute_forward_sqrt(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// ggml_compute_forward_log
|
||||
|
||||
static void ggml_compute_forward_log_f32(
|
||||
|
@ -12150,7 +12200,6 @@ static void ggml_compute_forward_out_prod_f32(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
//int64_t t1 = ggml_perf_time_us();
|
||||
//static int64_t acc = 0;
|
||||
//acc += t1 - t0;
|
||||
|
@ -12346,7 +12395,6 @@ static void ggml_compute_forward_scale_f32(
|
|||
|
||||
const size_t nb1 = dst->nb[1];
|
||||
|
||||
|
||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
if (dst->data != src0->data) {
|
||||
// src0 is same shape as dst => same indices
|
||||
|
@ -12744,7 +12792,6 @@ static void ggml_compute_forward_get_rows_back_f32(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
static void ggml_compute_forward_get_rows_back(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
|
@ -14087,6 +14134,7 @@ static void ggml_compute_forward_conv_1d_f32(
|
|||
}
|
||||
}
|
||||
|
||||
// TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1
|
||||
static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k,
|
||||
ggml_fp16_t * A,
|
||||
ggml_fp16_t * B,
|
||||
|
@ -14388,6 +14436,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
|
|||
}
|
||||
}
|
||||
|
||||
// need to zero dst since we are accumulating into it
|
||||
memset(dst->data, 0, ggml_nbytes(dst));
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -14460,7 +14511,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
|
|||
const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
|
||||
float * dst_data = wdata + i01*ne00*ne02;
|
||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||
dst_data[i01*ne00*ne02 + i00*ne02 + i02] = src[i00];
|
||||
dst_data[i00*ne02 + i02] = src[i00];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -14479,6 +14530,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
|
|||
}
|
||||
}
|
||||
|
||||
// need to zero dst since we are accumulating into it
|
||||
memset(dst->data, 0, ggml_nbytes(dst));
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -14540,6 +14594,144 @@ static void ggml_compute_forward_conv_transpose_1d(
|
|||
|
||||
// ggml_compute_forward_conv_2d
|
||||
|
||||
// src0: kernel [OC, IC, KH, KW]
|
||||
// src1: image [N, IC, IH, IW]
|
||||
// dst: result [N, OH, OW, IC*KH*KW]
|
||||
static void ggml_compute_forward_conv_2d_stage_0_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F16);
|
||||
|
||||
int64_t t0 = ggml_perf_time_us();
|
||||
UNUSED(t0);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
|
||||
const int64_t N = ne13;
|
||||
const int64_t IC = ne12;
|
||||
const int64_t IH = ne11;
|
||||
const int64_t IW = ne10;
|
||||
|
||||
// const int64_t OC = ne03;
|
||||
// const int64_t IC = ne02;
|
||||
const int64_t KH = ne01;
|
||||
const int64_t KW = ne00;
|
||||
|
||||
const int64_t OH = ne2;
|
||||
const int64_t OW = ne1;
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
|
||||
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
|
||||
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
|
||||
const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
|
||||
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
|
||||
const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
memset(dst->data, 0, ggml_nbytes(dst));
|
||||
return;
|
||||
}
|
||||
|
||||
if (params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
|
||||
{
|
||||
ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
|
||||
|
||||
for (int64_t in = 0; in < N; in++) {
|
||||
for (int64_t ioh = 0; ioh < OH; ioh++) {
|
||||
for (int64_t iow = 0; iow < OW; iow++) {
|
||||
for (int64_t iic = ith; iic < IC; iic+=nth) {
|
||||
|
||||
// micro kernel
|
||||
ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
|
||||
const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
|
||||
|
||||
for (int64_t ikh = 0; ikh < KH; ikh++) {
|
||||
for (int64_t ikw = 0; ikw < KW; ikw++) {
|
||||
const int64_t iiw = iow*s0 + ikw*d0 - p0;
|
||||
const int64_t iih = ioh*s1 + ikh*d1 - p1;
|
||||
|
||||
if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
|
||||
dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
|
||||
// src0: [OC, IC, KH, KW]
|
||||
// src1: [N, OH, OW, IC * KH * KW]
|
||||
// result: [N, OC, OH, OW]
|
||||
static void ggml_compute_forward_conv_2d_stage_1_f16(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
int64_t t0 = ggml_perf_time_us();
|
||||
UNUSED(t0);
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
|
||||
const int N = ne13;
|
||||
const int OH = ne12;
|
||||
const int OW = ne11;
|
||||
|
||||
const int OC = ne03;
|
||||
const int IC = ne02;
|
||||
const int KH = ne01;
|
||||
const int KW = ne00;
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
int64_t m = OC;
|
||||
int64_t n = OH * OW;
|
||||
int64_t k = IC * KH * KW;
|
||||
|
||||
// [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
|
||||
for (int i = 0; i < N; i++) {
|
||||
ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
|
||||
ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
|
||||
float * C = (float *)dst->data + i * m * n; // [m, n]
|
||||
|
||||
gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_conv_2d_f16_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
|
@ -14552,16 +14744,40 @@ static void ggml_compute_forward_conv_2d_f16_f32(
|
|||
int64_t t0 = ggml_perf_time_us();
|
||||
UNUSED(t0);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
// src1: image [N, IC, IH, IW]
|
||||
// src0: kernel [OC, IC, KH, KW]
|
||||
// dst: result [N, OC, OH, OW]
|
||||
// ne12: IC
|
||||
// ne0: OW
|
||||
// ne1: OH
|
||||
// nk0: KW
|
||||
// nk1: KH
|
||||
// ne13: N
|
||||
|
||||
const int N = ne13;
|
||||
const int IC = ne12;
|
||||
const int IH = ne11;
|
||||
const int IW = ne10;
|
||||
|
||||
const int OC = ne03;
|
||||
// const int IC = ne02;
|
||||
const int KH = ne01;
|
||||
const int KW = ne00;
|
||||
|
||||
const int OH = ne1;
|
||||
const int OW = ne0;
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int nk0 = ne00;
|
||||
const int nk1 = ne01;
|
||||
// const int nk0 = ne00;
|
||||
// const int nk1 = ne01;
|
||||
|
||||
// size of the convolution row - the kernel size unrolled across all channels
|
||||
const int ew0 = nk0*nk1*ne02;
|
||||
// const int ew0 = nk0*nk1*ne02;
|
||||
// ew0: IC*KH*KW
|
||||
|
||||
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
|
||||
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
|
||||
|
@ -14577,24 +14793,27 @@ static void ggml_compute_forward_conv_2d_f16_f32(
|
|||
memset(params->wdata, 0, params->wsize);
|
||||
|
||||
// prepare source data (src1)
|
||||
// im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW]
|
||||
|
||||
{
|
||||
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
||||
|
||||
for (int i13 = 0; i13 < ne13; i13++) {
|
||||
for (int i12 = 0; i12 < ne12; i12++) {
|
||||
const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
|
||||
ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);
|
||||
for (int in = 0; in < N; in++) {
|
||||
for (int iic = 0; iic < IC; iic++) {
|
||||
for (int ioh = 0; ioh < OH; ioh++) {
|
||||
for (int iow = 0; iow < OW; iow++) {
|
||||
|
||||
for (int i1 = 0; i1 < ne1; i1++) {
|
||||
for (int i0 = 0; i0 < ne0; i0++) {
|
||||
for (int ik1 = 0; ik1 < nk1; ik1++) {
|
||||
for (int ik0 = 0; ik0 < nk0; ik0++) {
|
||||
const int idx0 = i0*s0 + ik0*d0 - p0;
|
||||
const int idx1 = i1*s1 + ik1*d1 - p1;
|
||||
// micro kernel
|
||||
ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
|
||||
const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
|
||||
|
||||
if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
|
||||
dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
|
||||
GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
|
||||
for (int ikh = 0; ikh < KH; ikh++) {
|
||||
for (int ikw = 0; ikw < KW; ikw++) {
|
||||
const int iiw = iow*s0 + ikw*d0 - p0;
|
||||
const int iih = ioh*s1 + ikh*d1 - p1;
|
||||
|
||||
if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
|
||||
dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -14611,30 +14830,22 @@ static void ggml_compute_forward_conv_2d_f16_f32(
|
|||
return;
|
||||
}
|
||||
|
||||
// total patches in dst
|
||||
const int np = ne2;
|
||||
|
||||
// patches per thread
|
||||
const int dp = (np + nth - 1)/nth;
|
||||
|
||||
// patch range for this thread
|
||||
const int ip0 = dp*ith;
|
||||
const int ip1 = MIN(ip0 + dp, np);
|
||||
|
||||
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
||||
// wdata: [N*OH*OW, IC*KH*KW]
|
||||
// dst: result [N, OC, OH, OW]
|
||||
// src0: kernel [OC, IC, KH, KW]
|
||||
|
||||
for (int i3 = 0; i3 < ne3; i3++) {
|
||||
for (int i2 = ip0; i2 < ip1; i2++) {
|
||||
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2);
|
||||
int64_t m = OC;
|
||||
int64_t n = OH * OW;
|
||||
int64_t k = IC * KH * KW;
|
||||
|
||||
for (int i1 = 0; i1 < ne1; ++i1) {
|
||||
for (int i0 = 0; i0 < ne0; ++i0) {
|
||||
ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
|
||||
(ggml_fp16_t *) ((char *) src0->data + i2*nb03),
|
||||
(ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0);
|
||||
}
|
||||
}
|
||||
}
|
||||
// [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
|
||||
for (int i = 0; i < N; i++) {
|
||||
ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
|
||||
ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k]
|
||||
float * C = (float *)dst->data + i * m * n; // [m * k]
|
||||
|
||||
gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -14660,6 +14871,48 @@ static void ggml_compute_forward_conv_2d(
|
|||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_conv_2d_stage_0(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_conv_2d_stage_1(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_conv_transpose_2d
|
||||
|
||||
static void ggml_compute_forward_conv_transpose_2d(
|
||||
|
@ -14718,6 +14971,8 @@ static void ggml_compute_forward_conv_transpose_2d(
|
|||
}
|
||||
}
|
||||
|
||||
memset(dst->data, 0, ggml_nbytes(dst));
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -16216,7 +16471,6 @@ static void ggml_compute_forward_add_rel_pos_f32(
|
|||
const int ip0 = dp*ith;
|
||||
const int ip1 = MIN(ip0 + dp, np);
|
||||
|
||||
|
||||
for (int64_t i13 = ip0; i13 < ip1; ++i13) {
|
||||
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
||||
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
||||
|
@ -16283,7 +16537,6 @@ static void ggml_compute_forward_map_unary_f32(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
static void ggml_compute_forward_map_unary(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
|
@ -16331,7 +16584,6 @@ static void ggml_compute_forward_map_binary_f32(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
static void ggml_compute_forward_map_binary(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
|
@ -16383,7 +16635,6 @@ static void ggml_compute_forward_map_custom2_f32(
|
|||
fun(dst, a, b);
|
||||
}
|
||||
|
||||
|
||||
// ggml_compute_forward_map_custom3
|
||||
|
||||
static void ggml_compute_forward_map_custom3_f32(
|
||||
|
@ -16658,7 +16909,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|||
ggml_vec_sub_f32(nc, ds0, ds0, s1);
|
||||
ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
|
||||
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int i = 0; i < nc; ++i) {
|
||||
assert(!isnan(ds0[i]));
|
||||
|
@ -16686,12 +16936,15 @@ static void ggml_compute_forward_cross_entropy_loss_back(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////
|
||||
|
||||
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
||||
GGML_ASSERT(params);
|
||||
|
||||
if (tensor->op == GGML_OP_NONE) {
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
|
||||
if (skip_cpu) {
|
||||
|
@ -16894,6 +17147,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|||
{
|
||||
ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
|
||||
} break;
|
||||
case GGML_OP_CONV_2D_STAGE_0:
|
||||
{
|
||||
ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
|
||||
} break;
|
||||
case GGML_OP_CONV_2D_STAGE_1:
|
||||
{
|
||||
ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
|
||||
} break;
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
{
|
||||
ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
|
||||
|
@ -17828,11 +18089,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|||
{
|
||||
GGML_ASSERT(false); // TODO: not implemented
|
||||
} break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
{
|
||||
GGML_ASSERT(false); // TODO: not implemented
|
||||
} break;
|
||||
case GGML_OP_CONV_2D:
|
||||
{
|
||||
GGML_ASSERT(false); // TODO: not implemented
|
||||
} break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_CONV_2D_STAGE_0:
|
||||
{
|
||||
GGML_ASSERT(false); // TODO: not implemented
|
||||
} break;
|
||||
case GGML_OP_CONV_2D_STAGE_1:
|
||||
{
|
||||
GGML_ASSERT(false); // TODO: not implemented
|
||||
} break;
|
||||
|
@ -18761,6 +19030,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|||
const int64_t ne0 = node->ne[0];
|
||||
const int64_t ne1 = node->ne[1];
|
||||
const int64_t ne2 = node->ne[2];
|
||||
const int64_t ne3 = node->ne[3];
|
||||
const int64_t nk = ne00*ne01;
|
||||
const int64_t ew0 = nk * ne02;
|
||||
|
||||
|
@ -18771,7 +19041,8 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|||
|
||||
if (node->src[0]->type == GGML_TYPE_F16 &&
|
||||
node->src[1]->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
|
||||
// im2col: [N*OH*OW, IC*KH*KW]
|
||||
cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0);
|
||||
} else if (node->src[0]->type == GGML_TYPE_F32 &&
|
||||
node->src[1]->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(float)* (ne10*ne11*ne12);
|
||||
|
@ -18781,6 +19052,14 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|||
|
||||
work_size = MAX(work_size, cur);
|
||||
} break;
|
||||
case GGML_OP_CONV_2D_STAGE_0:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
case GGML_OP_CONV_2D_STAGE_1:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
|
@ -19969,7 +20248,6 @@ static enum ggml_opt_result ggml_opt_adam(
|
|||
|
||||
opt->loss_after = fx;
|
||||
|
||||
|
||||
// check convergence
|
||||
if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
|
||||
GGML_PRINT_DEBUG("converged\n");
|
||||
|
|
15
ggml.h
15
ggml.h
|
@ -401,15 +401,16 @@ extern "C" {
|
|||
GGML_OP_ALIBI,
|
||||
GGML_OP_CLAMP,
|
||||
GGML_OP_CONV_1D,
|
||||
GGML_OP_CONV_2D,
|
||||
GGML_OP_CONV_1D_STAGE_0, // internal
|
||||
GGML_OP_CONV_1D_STAGE_1, // internal
|
||||
GGML_OP_CONV_TRANSPOSE_1D,
|
||||
GGML_OP_CONV_2D,
|
||||
GGML_OP_CONV_2D_STAGE_0, // internal
|
||||
GGML_OP_CONV_2D_STAGE_1, // internal
|
||||
GGML_OP_CONV_TRANSPOSE_2D,
|
||||
GGML_OP_POOL_1D,
|
||||
GGML_OP_POOL_2D,
|
||||
|
||||
GGML_OP_CONV_1D_STAGE_0, // internal
|
||||
GGML_OP_CONV_1D_STAGE_1, // internal
|
||||
|
||||
GGML_OP_UPSCALE, // nearest interpolate
|
||||
|
||||
GGML_OP_FLASH_ATTN,
|
||||
|
@ -1020,9 +1021,9 @@ extern "C" {
|
|||
struct ggml_tensor * b,
|
||||
float eps);
|
||||
|
||||
// A: n columns, m rows
|
||||
// B: n columns, p rows (i.e. we transpose it internally)
|
||||
// result is m columns, p rows
|
||||
// A: k columns, n rows => [ne03, ne02, n, k]
|
||||
// B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
|
||||
// result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
|
||||
GGML_API struct ggml_tensor * ggml_mul_mat(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
|
49
llama.cpp
49
llama.cpp
|
@ -7593,7 +7593,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
|||
}
|
||||
}
|
||||
|
||||
const llama_token eos = llama_token_eos(ctx);
|
||||
const llama_token eos = llama_token_eos(&ctx->model);
|
||||
|
||||
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
|
||||
std::vector<llama_grammar_candidate> candidates_grammar;
|
||||
|
@ -7803,7 +7803,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
|
|||
void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
if (token == llama_token_eos(ctx)) {
|
||||
if (token == llama_token_eos(&ctx->model)) {
|
||||
for (const auto & stack : grammar->stacks) {
|
||||
if (stack.empty()) {
|
||||
return;
|
||||
|
@ -9033,7 +9033,7 @@ struct llama_context * llama_new_context_with_model(
|
|||
// build worst-case graph
|
||||
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
|
||||
int n_past = cparams.n_ctx - n_tokens;
|
||||
llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
||||
llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
||||
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
|
@ -9794,43 +9794,44 @@ float * llama_get_embeddings(struct llama_context * ctx) {
|
|||
return ctx->embedding.data();
|
||||
}
|
||||
|
||||
const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) {
|
||||
return ctx->model.vocab.id_to_token[token].text.c_str();
|
||||
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
|
||||
return model->vocab.id_to_token[token].text.c_str();
|
||||
}
|
||||
|
||||
float llama_token_get_score(const struct llama_context * ctx, llama_token token) {
|
||||
return ctx->model.vocab.id_to_token[token].score;
|
||||
float llama_token_get_score(const struct llama_model * model, llama_token token) {
|
||||
return model->vocab.id_to_token[token].score;
|
||||
}
|
||||
|
||||
llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) {
|
||||
return ctx->model.vocab.id_to_token[token].type;
|
||||
llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
|
||||
return model->vocab.id_to_token[token].type;
|
||||
}
|
||||
|
||||
llama_token llama_token_bos(const struct llama_context * ctx) {
|
||||
return ctx->model.vocab.special_bos_id;
|
||||
llama_token llama_token_bos(const struct llama_model * model) {
|
||||
return model->vocab.special_bos_id;
|
||||
}
|
||||
|
||||
llama_token llama_token_eos(const struct llama_context * ctx) {
|
||||
return ctx->model.vocab.special_eos_id;
|
||||
llama_token llama_token_eos(const struct llama_model * model) {
|
||||
return model->vocab.special_eos_id;
|
||||
}
|
||||
|
||||
llama_token llama_token_nl(const struct llama_context * ctx) {
|
||||
return ctx->model.vocab.linefeed_id;
|
||||
}
|
||||
llama_token llama_token_prefix(const struct llama_context * ctx) {
|
||||
return ctx->model.vocab.special_prefix_id;
|
||||
llama_token llama_token_nl(const struct llama_model * model) {
|
||||
return model->vocab.linefeed_id;
|
||||
}
|
||||
|
||||
llama_token llama_token_middle(const struct llama_context * ctx) {
|
||||
return ctx->model.vocab.special_middle_id;
|
||||
llama_token llama_token_prefix(const struct llama_model * model) {
|
||||
return model->vocab.special_prefix_id;
|
||||
}
|
||||
|
||||
llama_token llama_token_suffix(const struct llama_context * ctx) {
|
||||
return ctx->model.vocab.special_suffix_id;
|
||||
llama_token llama_token_middle(const struct llama_model * model) {
|
||||
return model->vocab.special_middle_id;
|
||||
}
|
||||
|
||||
llama_token llama_token_eot(const struct llama_context * ctx) {
|
||||
return ctx->model.vocab.special_eot_id;
|
||||
llama_token llama_token_suffix(const struct llama_model * model) {
|
||||
return model->vocab.special_suffix_id;
|
||||
}
|
||||
|
||||
llama_token llama_token_eot(const struct llama_model * model) {
|
||||
return model->vocab.special_eot_id;
|
||||
}
|
||||
|
||||
int llama_tokenize(
|
||||
|
|
21
llama.h
21
llama.h
|
@ -508,21 +508,22 @@ extern "C" {
|
|||
// Vocab
|
||||
//
|
||||
|
||||
LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
|
||||
LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
|
||||
|
||||
LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
|
||||
LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
|
||||
|
||||
LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
|
||||
LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
|
||||
|
||||
// Special tokens
|
||||
LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
|
||||
LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
|
||||
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
|
||||
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
|
||||
|
||||
// codellama infill tokens
|
||||
LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
|
||||
LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
|
||||
LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
|
||||
LLAMA_API llama_token llama_token_eot (const struct llama_context * ctx); // End of infill middle
|
||||
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
|
||||
LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
|
||||
LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
|
||||
LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
|
||||
|
||||
//
|
||||
// Tokenization
|
||||
|
|
BIN
models/ggml-vocab-baichuan.gguf
Normal file
BIN
models/ggml-vocab-baichuan.gguf
Normal file
Binary file not shown.
BIN
models/ggml-vocab-gpt-neox.gguf
Normal file
BIN
models/ggml-vocab-gpt-neox.gguf
Normal file
Binary file not shown.
BIN
models/ggml-vocab-refact.gguf
Normal file
BIN
models/ggml-vocab-refact.gguf
Normal file
Binary file not shown.
BIN
models/ggml-vocab-starcoder.gguf
Normal file
BIN
models/ggml-vocab-starcoder.gguf
Normal file
Binary file not shown.
|
@ -28,10 +28,14 @@ llama_build_executable(test-tokenizer-0-falcon.cpp)
|
|||
llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_build_executable(test-tokenizer-1-llama.cpp)
|
||||
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||
llama_test_executable(test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
|
||||
llama_build_executable(test-tokenizer-1-bpe.cpp)
|
||||
llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||
llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
|
||||
llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
|
||||
llama_test_executable(test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
||||
llama_test_executable(test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
||||
llama_build_and_test_executable(test-grammar-parser.cpp)
|
||||
llama_build_and_test_executable(test-llama-grammar.cpp)
|
||||
llama_build_and_test_executable(test-grad0.cpp) # SLOW
|
||||
|
|
|
@ -91,9 +91,19 @@ int main(int argc, char **argv) {
|
|||
}
|
||||
}
|
||||
}
|
||||
// TODO: why doesn't this work for the full range of Unicodes?
|
||||
// Restrict to assigned unicode planes
|
||||
// for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
|
||||
for (uint32_t cp = 0x10000; cp < 0x00080000; ++cp) {
|
||||
for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) {
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_bpe(ctx, tokens);
|
||||
if (str != check) {
|
||||
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) {
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_bpe(ctx, tokens);
|
||||
|
@ -103,7 +113,6 @@ int main(int argc, char **argv) {
|
|||
return 4;
|
||||
}
|
||||
}
|
||||
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue