Merge remote-tracking branch 'origin/master' into sl/cuda-virt-pool
This commit is contained in:
commit
26e97b5861
7 changed files with 123 additions and 19 deletions
2
.github/workflows/docker.yml
vendored
2
.github/workflows/docker.yml
vendored
|
@ -98,5 +98,5 @@ jobs:
|
||||||
context: .
|
context: .
|
||||||
push: ${{ github.event_name == 'push' }}
|
push: ${{ github.event_name == 'push' }}
|
||||||
platforms: ${{ matrix.config.platforms }}
|
platforms: ${{ matrix.config.platforms }}
|
||||||
tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}" , "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
|
tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
|
||||||
file: ${{ matrix.config.dockerfile }}
|
file: ${{ matrix.config.dockerfile }}
|
||||||
|
|
|
@ -149,11 +149,12 @@ static void sampler_queue(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token llama_sampling_sample(
|
static llama_token llama_sampling_sample_impl(
|
||||||
struct llama_sampling_context * ctx_sampling,
|
struct llama_sampling_context * ctx_sampling,
|
||||||
struct llama_context * ctx_main,
|
struct llama_context * ctx_main,
|
||||||
struct llama_context * ctx_cfg,
|
struct llama_context * ctx_cfg,
|
||||||
const int idx) {
|
const int idx,
|
||||||
|
bool is_resampling) { // Add a parameter to indicate if we are resampling
|
||||||
const llama_sampling_params & params = ctx_sampling->params;
|
const llama_sampling_params & params = ctx_sampling->params;
|
||||||
|
|
||||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
||||||
|
@ -173,8 +174,17 @@ llama_token llama_sampling_sample(
|
||||||
|
|
||||||
llama_token id = 0;
|
llama_token id = 0;
|
||||||
|
|
||||||
|
// Get a pointer to the logits
|
||||||
float * logits = llama_get_logits_ith(ctx_main, idx);
|
float * logits = llama_get_logits_ith(ctx_main, idx);
|
||||||
|
|
||||||
|
// Declare original_logits at the beginning of the function scope
|
||||||
|
std::vector<float> original_logits;
|
||||||
|
|
||||||
|
if (!is_resampling) {
|
||||||
|
// Only copy the original logits when we are not in the resampling phase. TODO: confirm whether this copy is actually necessary.
|
||||||
|
original_logits = std::vector<float>(logits, logits + llama_n_vocab(llama_get_model(ctx_main)));
|
||||||
|
}
|
||||||
|
|
||||||
// apply params.logit_bias map
|
// apply params.logit_bias map
|
||||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
||||||
logits[it->first] += it->second;
|
logits[it->first] += it->second;
|
||||||
|
@ -193,12 +203,14 @@ llama_token llama_sampling_sample(
|
||||||
}
|
}
|
||||||
|
|
||||||
// apply penalties
|
// apply penalties
|
||||||
if (!prev.empty()) {
|
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
|
||||||
|
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
|
||||||
|
if (penalty_tokens_used_size) {
|
||||||
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
|
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
|
||||||
|
|
||||||
llama_sample_repetition_penalties(ctx_main, &cur_p,
|
llama_sample_repetition_penalties(ctx_main, &cur_p,
|
||||||
prev.data() + prev.size() - penalty_last_n,
|
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
|
||||||
penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
|
penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
|
||||||
|
|
||||||
if (!penalize_nl) {
|
if (!penalize_nl) {
|
||||||
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
||||||
|
@ -210,7 +222,8 @@ llama_token llama_sampling_sample(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx_sampling->grammar != NULL) {
|
// If we are in the resampling phase, apply grammar checks before sampling logic
|
||||||
|
if (is_resampling && ctx_sampling->grammar != NULL) {
|
||||||
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
|
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -252,9 +265,40 @@ llama_token llama_sampling_sample(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ctx_sampling->grammar != NULL && !is_resampling) {
|
||||||
|
// Create an array with a single token data element for the sampled id
|
||||||
|
llama_token_data single_token_data = {id, logits[id], 0.0f};
|
||||||
|
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
|
||||||
|
|
||||||
|
// Apply grammar constraints to the single token
|
||||||
|
llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
|
||||||
|
|
||||||
|
// Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
|
||||||
|
bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
|
||||||
|
|
||||||
|
// If the token is not valid according to the grammar, perform resampling
|
||||||
|
if (!is_valid) {
|
||||||
|
LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
|
||||||
|
|
||||||
|
// Restore logits from the copy
|
||||||
|
std::copy(original_logits.begin(), original_logits.end(), logits);
|
||||||
|
|
||||||
|
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return id;
|
return id;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
llama_token llama_sampling_sample(
|
||||||
|
struct llama_sampling_context * ctx_sampling,
|
||||||
|
struct llama_context * ctx_main,
|
||||||
|
struct llama_context * ctx_cfg,
|
||||||
|
const int idx) {
|
||||||
|
// Call the implementation function with is_resampling set to false by default
|
||||||
|
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
|
||||||
|
}
|
||||||
|
|
||||||
void llama_sampling_accept(
|
void llama_sampling_accept(
|
||||||
struct llama_sampling_context * ctx_sampling,
|
struct llama_sampling_context * ctx_sampling,
|
||||||
struct llama_context * ctx_main,
|
struct llama_context * ctx_main,
|
||||||
|
|
|
@ -36,6 +36,9 @@ typedef struct llama_sampling_params {
|
||||||
float cfg_scale = 1.f; // how strong is guidance
|
float cfg_scale = 1.f; // how strong is guidance
|
||||||
|
|
||||||
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
||||||
|
|
||||||
|
std::vector<llama_token> penalty_prompt_tokens;
|
||||||
|
bool use_penalty_prompt_tokens = false;
|
||||||
} llama_sampling_params;
|
} llama_sampling_params;
|
||||||
|
|
||||||
// general sampler context
|
// general sampler context
|
||||||
|
|
|
@ -148,6 +148,8 @@ node index.js
|
||||||
|
|
||||||
`frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled);
|
`frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled);
|
||||||
|
|
||||||
|
`penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens (default: `null` = use the original `prompt`).
|
||||||
|
|
||||||
`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
|
`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
|
||||||
|
|
||||||
`mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0).
|
`mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0).
|
||||||
|
|
|
@ -761,6 +761,42 @@ struct llama_server_context
|
||||||
slot->prompt = "";
|
slot->prompt = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
slot->sparams.penalty_prompt_tokens.clear();
|
||||||
|
slot->sparams.use_penalty_prompt_tokens = false;
|
||||||
|
const auto &penalty_prompt = data.find("penalty_prompt");
|
||||||
|
if (penalty_prompt != data.end())
|
||||||
|
{
|
||||||
|
if (penalty_prompt->is_string())
|
||||||
|
{
|
||||||
|
const auto penalty_prompt_string = penalty_prompt->get<std::string>();
|
||||||
|
auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false);
|
||||||
|
slot->sparams.penalty_prompt_tokens.swap(penalty_tokens);
|
||||||
|
if (slot->params.n_predict > 0)
|
||||||
|
{
|
||||||
|
slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict);
|
||||||
|
}
|
||||||
|
slot->sparams.use_penalty_prompt_tokens = true;
|
||||||
|
}
|
||||||
|
else if (penalty_prompt->is_array())
|
||||||
|
{
|
||||||
|
const auto n_tokens = penalty_prompt->size();
|
||||||
|
slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict));
|
||||||
|
const int n_vocab = llama_n_vocab(model);
|
||||||
|
for (const auto &penalty_token : *penalty_prompt)
|
||||||
|
{
|
||||||
|
if (penalty_token.is_number_integer())
|
||||||
|
{
|
||||||
|
const auto tok = penalty_token.get<llama_token>();
|
||||||
|
if (tok >= 0 && tok < n_vocab)
|
||||||
|
{
|
||||||
|
slot->sparams.penalty_prompt_tokens.push_back(tok);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
slot->sparams.use_penalty_prompt_tokens = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
slot->sparams.logit_bias.clear();
|
slot->sparams.logit_bias.clear();
|
||||||
|
|
||||||
if (json_value(data, "ignore_eos", false))
|
if (json_value(data, "ignore_eos", false))
|
||||||
|
@ -992,6 +1028,12 @@ struct llama_server_context
|
||||||
slot.generated_text += token_str;
|
slot.generated_text += token_str;
|
||||||
slot.has_next_token = true;
|
slot.has_next_token = true;
|
||||||
|
|
||||||
|
if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
|
||||||
|
{
|
||||||
|
// we can change penalty_prompt_tokens because it is always created from scratch each request
|
||||||
|
slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
|
||||||
|
}
|
||||||
|
|
||||||
// check if there is incomplete UTF-8 character at the end
|
// check if there is incomplete UTF-8 character at the end
|
||||||
bool incomplete = false;
|
bool incomplete = false;
|
||||||
for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
|
for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
|
||||||
|
@ -1183,6 +1225,8 @@ struct llama_server_context
|
||||||
{"repeat_penalty", slot.sparams.penalty_repeat},
|
{"repeat_penalty", slot.sparams.penalty_repeat},
|
||||||
{"presence_penalty", slot.sparams.penalty_present},
|
{"presence_penalty", slot.sparams.penalty_present},
|
||||||
{"frequency_penalty", slot.sparams.penalty_freq},
|
{"frequency_penalty", slot.sparams.penalty_freq},
|
||||||
|
{"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
|
||||||
|
{"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
|
||||||
{"mirostat", slot.sparams.mirostat},
|
{"mirostat", slot.sparams.mirostat},
|
||||||
{"mirostat_tau", slot.sparams.mirostat_tau},
|
{"mirostat_tau", slot.sparams.mirostat_tau},
|
||||||
{"mirostat_eta", slot.sparams.mirostat_eta},
|
{"mirostat_eta", slot.sparams.mirostat_eta},
|
||||||
|
|
15
ggml-cuda.cu
15
ggml-cuda.cu
|
@ -6908,8 +6908,7 @@ void * ggml_cuda_host_malloc(size_t size) {
|
||||||
void * ptr = nullptr;
|
void * ptr = nullptr;
|
||||||
cudaError_t err = cudaMallocHost((void **) &ptr, size);
|
cudaError_t err = cudaMallocHost((void **) &ptr, size);
|
||||||
if (err != cudaSuccess) {
|
if (err != cudaSuccess) {
|
||||||
// The allocation error can be bypassed. A null ptr will be returned from this function.
|
// clear the error
|
||||||
// This can fix the OOM error in WSL.
|
|
||||||
cudaGetLastError();
|
cudaGetLastError();
|
||||||
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
|
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
|
||||||
size/1024.0/1024.0, cudaGetErrorString(err));
|
size/1024.0/1024.0, cudaGetErrorString(err));
|
||||||
|
@ -8081,15 +8080,19 @@ static void ggml_cuda_op_mul_mat(
|
||||||
|
|
||||||
if (id != 0) {
|
if (id != 0) {
|
||||||
row_low[id] = ne01*g_tensor_split[id];
|
row_low[id] = ne01*g_tensor_split[id];
|
||||||
|
if (row_low[id] < ne01) {
|
||||||
row_low[id] -= row_low[id] % rounding;
|
row_low[id] -= row_low[id] % rounding;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (id != g_device_count - 1) {
|
if (id != g_device_count - 1) {
|
||||||
row_high[id] = ne01*g_tensor_split[id + 1];
|
row_high[id] = ne01*g_tensor_split[id + 1];
|
||||||
|
if (row_high[id] < ne01) {
|
||||||
row_high[id] -= row_high[id] % rounding;
|
row_high[id] -= row_high[id] % rounding;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||||
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
||||||
|
@ -9789,12 +9792,14 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
||||||
// host buffer type
|
// host buffer type
|
||||||
|
|
||||||
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
CUDA_CHECK(cudaFreeHost(buffer->context));
|
ggml_cuda_host_free(buffer->context);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
void * ptr;
|
void * ptr = ggml_cuda_host_malloc(size);
|
||||||
CUDA_CHECK(cudaMallocHost(&ptr, size));
|
if (ptr == nullptr) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
// FIXME: this is a hack to avoid having to implement a new buffer type
|
// FIXME: this is a hack to avoid having to implement a new buffer type
|
||||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||||
|
|
16
llama.cpp
16
llama.cpp
|
@ -1177,21 +1177,27 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
|
static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
|
||||||
|
ggml_backend_buffer_type_t buft = nullptr;
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
if (n_gpu_layers > 0) {
|
if (n_gpu_layers > 0) {
|
||||||
return ggml_backend_metal_buffer_type();
|
buft = ggml_backend_metal_buffer_type();
|
||||||
}
|
}
|
||||||
#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
||||||
if (n_gpu_layers > 0) {
|
if (n_gpu_layers > 0) {
|
||||||
return ggml_backend_cuda_buffer_type(0);
|
buft = ggml_backend_cuda_buffer_type(0);
|
||||||
}
|
}
|
||||||
#elif defined(GGML_USE_CUBLAS)
|
#elif defined(GGML_USE_CUBLAS)
|
||||||
return ggml_backend_cuda_host_buffer_type();
|
buft = ggml_backend_cuda_host_buffer_type();
|
||||||
#elif defined(GGML_USE_CPU_HBM)
|
#elif defined(GGML_USE_CPU_HBM)
|
||||||
return ggml_backend_cpu_hbm_buffer_type();
|
buft = ggml_backend_cpu_hbm_buffer_type();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return ggml_backend_cpu_buffer_type();
|
if (buft == nullptr) {
|
||||||
|
buft = ggml_backend_cpu_buffer_type();
|
||||||
|
}
|
||||||
|
|
||||||
|
return buft;
|
||||||
|
|
||||||
GGML_UNUSED(n_gpu_layers);
|
GGML_UNUSED(n_gpu_layers);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue