Replace static_cast with function-style cast
This commit is contained in:
parent
fcfdc56cc2
commit
fceae56a86
25 changed files with 95 additions and 96 deletions
|
@ -1056,7 +1056,7 @@ std::string get_system_info(const gpt_params & params) {
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string gpt_random_prompt(std::mt19937 & rng) {
|
std::string gpt_random_prompt(std::mt19937 & rng) {
|
||||||
const int r = static_cast<int>(rng() % 10);
|
const int r = int(rng() % 10);
|
||||||
switch (r) {
|
switch (r) {
|
||||||
case 0: return "So";
|
case 0: return "So";
|
||||||
case 1: return "Once upon a time";
|
case 1: return "Once upon a time";
|
||||||
|
|
|
@ -250,7 +250,7 @@ namespace console {
|
||||||
return expectedWidth;
|
return expectedWidth;
|
||||||
}
|
}
|
||||||
COORD initialPosition = bufferInfo.dwCursorPosition;
|
COORD initialPosition = bufferInfo.dwCursorPosition;
|
||||||
DWORD nNumberOfChars = static_cast<DWORD>(length);
|
DWORD nNumberOfChars = DWORD(length);
|
||||||
WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
|
WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
|
||||||
|
|
||||||
CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
|
CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
|
||||||
|
@ -404,7 +404,7 @@ namespace console {
|
||||||
} while (count == 0 && !widths.empty());
|
} while (count == 0 && !widths.empty());
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
int offset = static_cast<int>(line.length());
|
int offset = int(line.length());
|
||||||
append_utf8(input_char, line);
|
append_utf8(input_char, line);
|
||||||
int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
|
int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
|
||||||
if (width < 0) {
|
if (width < 0) {
|
||||||
|
|
|
@ -73,7 +73,7 @@ llama_token llama_sampling_last(llama_sampling_context * ctx) {
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
|
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
|
||||||
const int size = static_cast<int>(ctx_sampling->prev.size());
|
const int size = int(ctx_sampling->prev.size());
|
||||||
|
|
||||||
n = std::min(n, size);
|
n = std::min(n, size);
|
||||||
|
|
||||||
|
|
|
@ -70,7 +70,7 @@ int main(int argc, char ** argv) {
|
||||||
std::vector<llama_token> tokens_list;
|
std::vector<llama_token> tokens_list;
|
||||||
tokens_list = ::llama_tokenize(model, params.prompt, true);
|
tokens_list = ::llama_tokenize(model, params.prompt, true);
|
||||||
|
|
||||||
const int n_kv_req = static_cast<int>(tokens_list.size() + (n_len - tokens_list.size())*n_parallel);
|
const int n_kv_req = int(tokens_list.size() + (n_len - tokens_list.size())*n_parallel);
|
||||||
|
|
||||||
// initialize the context
|
// initialize the context
|
||||||
|
|
||||||
|
@ -112,11 +112,11 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// create a llama_batch
|
// create a llama_batch
|
||||||
// we use this object to submit token data for decoding
|
// we use this object to submit token data for decoding
|
||||||
llama_batch batch = llama_batch_init(std::max(static_cast<int32_t>(tokens_list.size()), n_parallel), 0, 1);
|
llama_batch batch = llama_batch_init(std::max(int32_t(tokens_list.size()), n_parallel), 0, 1);
|
||||||
|
|
||||||
// evaluate the initial prompt
|
// evaluate the initial prompt
|
||||||
for (size_t i = 0; i < tokens_list.size(); ++i) {
|
for (size_t i = 0; i < tokens_list.size(); ++i) {
|
||||||
llama_batch_add(batch, tokens_list[i], static_cast<llama_pos>(i), { 0 }, false);
|
llama_batch_add(batch, tokens_list[i], llama_pos(i), { 0 }, false);
|
||||||
}
|
}
|
||||||
GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
|
GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
|
||||||
|
|
||||||
|
|
|
@ -160,12 +160,12 @@ int main(int argc, char ** argv)
|
||||||
|
|
||||||
int n_past = 0;
|
int n_past = 0;
|
||||||
|
|
||||||
if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), static_cast<int32_t>(tokens_list.size()), n_past, 0)))
|
if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), int32_t(tokens_list.size()), n_past, 0)))
|
||||||
{
|
{
|
||||||
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
|
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
n_past += static_cast<int>(tokens_list.size());
|
n_past += int(tokens_list.size());
|
||||||
|
|
||||||
beam_search_callback_data callback_data{ctx, {}};
|
beam_search_callback_data callback_data{ctx, {}};
|
||||||
size_t const beam_width = static_cast<size_t>(params.n_beams);
|
size_t const beam_width = static_cast<size_t>(params.n_beams);
|
||||||
|
|
|
@ -64,8 +64,8 @@ int main(int argc, char ** argv) {
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||||
for (int i : embd_inp) {
|
for (int embd : embd_inp) {
|
||||||
fprintf(stderr, "%6d -> '%s'\n", i, llama_token_to_piece(ctx, i).c_str());
|
fprintf(stderr, "%6d -> '%s'\n", embd, llama_token_to_piece(ctx, embd).c_str());
|
||||||
}
|
}
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
}
|
}
|
||||||
|
|
|
@ -217,7 +217,7 @@ static std::vector<float> softmax(const std::vector<float>& logits) {
|
||||||
probs[i] = exp_logit;
|
probs[i] = exp_logit;
|
||||||
}
|
}
|
||||||
for (float& prob : probs) {
|
for (float& prob : probs) {
|
||||||
prob /= static_cast<float>(sum_exp);
|
prob /= float(sum_exp);
|
||||||
}
|
}
|
||||||
return probs;
|
return probs;
|
||||||
}
|
}
|
||||||
|
|
|
@ -313,16 +313,16 @@ int main(int argc, char ** argv) {
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||||
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||||
for (int i : embd_inp) {
|
for (int embd : embd_inp) {
|
||||||
LOG_TEE("%6d -> '%s'\n", i, llama_token_to_piece(ctx, i).c_str());
|
LOG_TEE("%6d -> '%s'\n", embd, llama_token_to_piece(ctx, embd).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
|
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
|
||||||
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||||
for (int i : guidance_inp) {
|
for (int inp : guidance_inp) {
|
||||||
LOG_TEE("%6d -> '%s'\n", i, llama_token_to_piece(ctx, i).c_str());
|
LOG_TEE("%6d -> '%s'\n", inp, llama_token_to_piece(ctx, inp).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -76,7 +76,7 @@ static T stdev(const std::vector<T> & v) {
|
||||||
}
|
}
|
||||||
T mean = avg(v);
|
T mean = avg(v);
|
||||||
T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
|
T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
|
||||||
T stdev = static_cast<T>(std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1)));
|
T stdev = T(std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1)));
|
||||||
return stdev;
|
return stdev;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -381,7 +381,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
//const int n_intermediate = hparams.n_intermediate;
|
//const int n_intermediate = hparams.n_intermediate;
|
||||||
//const int projection_dim = hparams.projection_dim;
|
//const int projection_dim = hparams.projection_dim;
|
||||||
const float eps = hparams.eps;
|
const float eps = hparams.eps;
|
||||||
int batch_size = static_cast<int>(imgs->size);
|
int batch_size = int(imgs->size);
|
||||||
if (ctx->has_llava_projector) {
|
if (ctx->has_llava_projector) {
|
||||||
GGML_ASSERT(batch_size == 1);
|
GGML_ASSERT(batch_size == 1);
|
||||||
}
|
}
|
||||||
|
@ -607,8 +607,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
// hardswish
|
// hardswish
|
||||||
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
|
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
|
||||||
|
|
||||||
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, static_cast<int>(block_1_hw->ne[0]), static_cast<int>(block_1_hw->ne[1]),
|
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, int(block_1_hw->ne[0]), int(block_1_hw->ne[1]),
|
||||||
static_cast<int>(block_1_hw->ne[0]), static_cast<int>(block_1_hw->ne[1]), 0, 0);
|
int(block_1_hw->ne[0]), int(block_1_hw->ne[1]), 0, 0);
|
||||||
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||||
// pointwise conv
|
// pointwise conv
|
||||||
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
|
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
|
||||||
|
@ -622,8 +622,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
|
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
|
||||||
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
|
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
|
||||||
|
|
||||||
int w = static_cast<int>(block_1->ne[0]);
|
int w = int(block_1->ne[0]);
|
||||||
int h = static_cast<int>(block_1->ne[1]);
|
int h = int(block_1->ne[1]);
|
||||||
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
|
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
|
||||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
||||||
|
|
||||||
|
@ -657,8 +657,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
|
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
|
||||||
|
|
||||||
// not sure the parameters is right for globalAvgPooling
|
// not sure the parameters is right for globalAvgPooling
|
||||||
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, static_cast<int>(block_1_hw->ne[0]), static_cast<int>(block_1_hw->ne[1]),
|
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, int(block_1_hw->ne[0]), int(block_1_hw->ne[1]),
|
||||||
static_cast<int>(block_1_hw->ne[0]), static_cast<int>(block_1_hw->ne[1]), 0, 0);
|
int(block_1_hw->ne[0]), int(block_1_hw->ne[1]), 0, 0);
|
||||||
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||||
// pointwise conv
|
// pointwise conv
|
||||||
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
|
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
|
||||||
|
@ -673,8 +673,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
|
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
|
||||||
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
|
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
|
||||||
|
|
||||||
int w = static_cast<int>(block_1->ne[0]);
|
int w = int(block_1->ne[0]);
|
||||||
int h = static_cast<int>(block_1->ne[1]);
|
int h = int(block_1->ne[1]);
|
||||||
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
|
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
|
||||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
||||||
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
|
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
|
||||||
|
@ -906,7 +906,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
clip_free(new_clip);
|
clip_free(new_clip);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
int num_bytes = static_cast<int>(ggml_nbytes(cur));
|
int num_bytes = int(ggml_nbytes(cur));
|
||||||
if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
|
if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
|
||||||
// for the CPU and Metal backend, we can read directly into the tensor
|
// for the CPU and Metal backend, we can read directly into the tensor
|
||||||
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
|
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
|
||||||
|
@ -1074,7 +1074,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
||||||
|
|
||||||
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
|
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
|
||||||
int nx, ny, nc;
|
int nx, ny, nc;
|
||||||
auto * data = stbi_load_from_memory(bytes, static_cast<int>(bytes_length), &nx, &ny, &nc, 3);
|
auto * data = stbi_load_from_memory(bytes, int(bytes_length), &nx, &ny, &nc, 3);
|
||||||
if (!data) {
|
if (!data) {
|
||||||
fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
|
fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
|
@ -1174,7 +1174,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||||
|
|
||||||
const float v = v0 * (1.0f - dy) + v1 * dy;
|
const float v = v0 * (1.0f - dy) + v1 * dy;
|
||||||
|
|
||||||
const uint8_t v2 = static_cast<std::uint8_t>(std::min(std::max(std::round(v), 0.0f), 255.0f));
|
const uint8_t v2 = std::uint8_t(std::min(std::max(std::round(v), 0.0f), 255.0f));
|
||||||
|
|
||||||
const int i = 3 * (y * nx3 + x) + c;
|
const int i = 3 * (y * nx3 + x) + c;
|
||||||
|
|
||||||
|
@ -1212,7 +1212,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
int batch_size = static_cast<int>(imgs->size);
|
int batch_size = int(imgs->size);
|
||||||
if(ctx->has_llava_projector) {
|
if(ctx->has_llava_projector) {
|
||||||
GGML_ASSERT(batch_size == 1); // TODO: support multiple images
|
GGML_ASSERT(batch_size == 1); // TODO: support multiple images
|
||||||
}
|
}
|
||||||
|
@ -1342,34 +1342,34 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||||
|
|
||||||
switch (new_type) {
|
switch (new_type) {
|
||||||
case GGML_TYPE_Q4_0: {
|
case GGML_TYPE_Q4_0: {
|
||||||
new_size = ggml_quantize_q4_0(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
|
new_size = ggml_quantize_q4_0(f32_data, new_data, int(n_elms), int(cur->ne[0]), hist_cur.data());
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q4_1: {
|
case GGML_TYPE_Q4_1: {
|
||||||
new_size = ggml_quantize_q4_1(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
|
new_size = ggml_quantize_q4_1(f32_data, new_data, int(n_elms), int(cur->ne[0]), hist_cur.data());
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q5_0: {
|
case GGML_TYPE_Q5_0: {
|
||||||
new_size = ggml_quantize_q5_0(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
|
new_size = ggml_quantize_q5_0(f32_data, new_data, int(n_elms), int(cur->ne[0]), hist_cur.data());
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q5_1: {
|
case GGML_TYPE_Q5_1: {
|
||||||
new_size = ggml_quantize_q5_1(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
|
new_size = ggml_quantize_q5_1(f32_data, new_data, int(n_elms), int(cur->ne[0]), hist_cur.data());
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q8_0: {
|
case GGML_TYPE_Q8_0: {
|
||||||
new_size = ggml_quantize_q8_0(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
|
new_size = ggml_quantize_q8_0(f32_data, new_data, int(n_elms), int(cur->ne[0]), hist_cur.data());
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q2_K: {
|
case GGML_TYPE_Q2_K: {
|
||||||
new_size = ggml_quantize_q2_K(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
|
new_size = ggml_quantize_q2_K(f32_data, new_data, int(n_elms), int(cur->ne[0]), hist_cur.data());
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q3_K: {
|
case GGML_TYPE_Q3_K: {
|
||||||
new_size = ggml_quantize_q3_K(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
|
new_size = ggml_quantize_q3_K(f32_data, new_data, int(n_elms), int(cur->ne[0]), hist_cur.data());
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q4_K: {
|
case GGML_TYPE_Q4_K: {
|
||||||
new_size = ggml_quantize_q4_K(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
|
new_size = ggml_quantize_q4_K(f32_data, new_data, int(n_elms), int(cur->ne[0]), hist_cur.data());
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q5_K: {
|
case GGML_TYPE_Q5_K: {
|
||||||
new_size = ggml_quantize_q5_K(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
|
new_size = ggml_quantize_q5_K(f32_data, new_data, int(n_elms), int(cur->ne[0]), hist_cur.data());
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q6_K: {
|
case GGML_TYPE_Q6_K: {
|
||||||
new_size = ggml_quantize_q6_K(f32_data, new_data, static_cast<int>(n_elms), static_cast<int>(cur->ne[0]), hist_cur.data());
|
new_size = ggml_quantize_q6_K(f32_data, new_data, int(n_elms), int(cur->ne[0]), hist_cur.data());
|
||||||
} break;
|
} break;
|
||||||
default: {
|
default: {
|
||||||
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type);
|
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type);
|
||||||
|
@ -1432,10 +1432,10 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||||
|
|
||||||
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||||
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
|
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
|
||||||
return static_cast<int>(ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]);
|
return int(ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]);
|
||||||
}
|
}
|
||||||
else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
|
else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
|
||||||
return static_cast<int>(ctx->vision_model.mm_2_b->ne[0]);
|
return int(ctx->vision_model.mm_2_b->ne[0]);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
|
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
|
||||||
|
|
|
@ -85,7 +85,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
|
||||||
auto img_bytes = std::vector<unsigned char>(required_bytes);
|
auto img_bytes = std::vector<unsigned char>(required_bytes);
|
||||||
base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
|
base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
|
||||||
|
|
||||||
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), static_cast<int>(img_bytes.size()));
|
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), int(img_bytes.size()));
|
||||||
if (!embed) {
|
if (!embed) {
|
||||||
fprintf(stderr, "%s: could not load image from base64 string.\n", __func__);
|
fprintf(stderr, "%s: could not load image from base64 string.\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
|
@ -88,7 +88,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
fflush(stderr);
|
fflush(stderr);
|
||||||
|
|
||||||
const int n_input = static_cast<int>(inp.size());
|
const int n_input = int(inp.size());
|
||||||
|
|
||||||
const auto t_enc_start = ggml_time_us();
|
const auto t_enc_start = ggml_time_us();
|
||||||
|
|
||||||
|
@ -105,7 +105,7 @@ int main(int argc, char ** argv) {
|
||||||
int n_predict = 0;
|
int n_predict = 0;
|
||||||
int n_accept = 0;
|
int n_accept = 0;
|
||||||
|
|
||||||
int n_past = static_cast<int>(inp.size());
|
int n_past = int(inp.size());
|
||||||
|
|
||||||
llama_token id = 0;
|
llama_token id = 0;
|
||||||
|
|
||||||
|
@ -362,7 +362,7 @@ int main(int argc, char ** argv) {
|
||||||
if (v == 0) {
|
if (v == 0) {
|
||||||
// sample from the last level
|
// sample from the last level
|
||||||
for (int i = 0; i < W; i++) {
|
for (int i = 0; i < W; i++) {
|
||||||
tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, static_cast<int>(ngrams_cur.size()*(N-1) + W*(N - 2) + i));
|
tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, int(ngrams_cur.size()*(N-1) + W*(N - 2) + i));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int i = 0; i < W; i++) {
|
for (int i = 0; i < W; i++) {
|
||||||
|
|
|
@ -60,7 +60,7 @@ int main(int argc, char ** argv){
|
||||||
|
|
||||||
fflush(stderr);
|
fflush(stderr);
|
||||||
|
|
||||||
const int n_input = static_cast<int>(inp.size());
|
const int n_input = int(inp.size());
|
||||||
|
|
||||||
const auto t_enc_start = ggml_time_us();
|
const auto t_enc_start = ggml_time_us();
|
||||||
|
|
||||||
|
@ -73,7 +73,7 @@ int main(int argc, char ** argv){
|
||||||
int n_drafted = 0;
|
int n_drafted = 0;
|
||||||
int n_accept = 0;
|
int n_accept = 0;
|
||||||
|
|
||||||
int n_past = static_cast<int>(inp.size());
|
int n_past = int(inp.size());
|
||||||
|
|
||||||
bool has_eos = false;
|
bool has_eos = false;
|
||||||
|
|
||||||
|
@ -160,7 +160,7 @@ int main(int argc, char ** argv){
|
||||||
|
|
||||||
// generate n_pred tokens through prompt lookup
|
// generate n_pred tokens through prompt lookup
|
||||||
auto prompt_lookup = [&]() -> void {
|
auto prompt_lookup = [&]() -> void {
|
||||||
int inp_size = static_cast<int>(inp.size());
|
int inp_size = int(inp.size());
|
||||||
for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
|
for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
|
||||||
const llama_token * ngram = &inp[inp_size - ngram_size];
|
const llama_token * ngram = &inp[inp_size - ngram_size];
|
||||||
|
|
||||||
|
|
|
@ -361,16 +361,16 @@ int main(int argc, char ** argv) {
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||||
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||||
for (int i : embd_inp) {
|
for (int embd : embd_inp) {
|
||||||
LOG_TEE("%6d -> '%s'\n", i, llama_token_to_piece(ctx, i).c_str());
|
LOG_TEE("%6d -> '%s'\n", embd, llama_token_to_piece(ctx, embd).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
|
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
|
||||||
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||||
for (int i : guidance_inp) {
|
for (int inp : guidance_inp) {
|
||||||
LOG_TEE("%6d -> '%s'\n", i, llama_token_to_piece(ctx, i).c_str());
|
LOG_TEE("%6d -> '%s'\n", inp, llama_token_to_piece(ctx, inp).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -156,13 +156,13 @@ int main(int argc, char ** argv) {
|
||||||
std::vector<client> clients(n_clients);
|
std::vector<client> clients(n_clients);
|
||||||
for (size_t i = 0; i < clients.size(); ++i) {
|
for (size_t i = 0; i < clients.size(); ++i) {
|
||||||
auto & client = clients[i];
|
auto & client = clients[i];
|
||||||
client.id = static_cast<int32_t>(i);
|
client.id = int32_t(i);
|
||||||
client.ctx_sampling = llama_sampling_init(params.sparams);
|
client.ctx_sampling = llama_sampling_init(params.sparams);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<llama_token> tokens_system;
|
std::vector<llama_token> tokens_system;
|
||||||
tokens_system = ::llama_tokenize(ctx, k_system, true);
|
tokens_system = ::llama_tokenize(ctx, k_system, true);
|
||||||
const int32_t n_tokens_system = static_cast<int32_t>(tokens_system.size());
|
const int32_t n_tokens_system = int32_t(tokens_system.size());
|
||||||
|
|
||||||
llama_seq_id g_seq_id = 0;
|
llama_seq_id g_seq_id = 0;
|
||||||
|
|
||||||
|
@ -254,7 +254,7 @@ int main(int argc, char ** argv) {
|
||||||
tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
|
tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
|
||||||
|
|
||||||
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
|
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
|
||||||
llama_batch_add(batch, tokens_prompt[i], static_cast<llama_pos>(i + n_tokens_system), { client.id }, false);
|
llama_batch_add(batch, tokens_prompt[i], llama_pos(i + n_tokens_system), { client.id }, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
// extract the logits only for the last token
|
// extract the logits only for the last token
|
||||||
|
@ -262,7 +262,7 @@ int main(int argc, char ** argv) {
|
||||||
batch.logits[batch.n_tokens - 1] = true;
|
batch.logits[batch.n_tokens - 1] = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
client.n_prompt = static_cast<int32_t>(tokens_prompt.size());
|
client.n_prompt = int32_t(tokens_prompt.size());
|
||||||
client.n_decoded = 0;
|
client.n_decoded = 0;
|
||||||
client.i_batch = batch.n_tokens - 1;
|
client.i_batch = batch.n_tokens - 1;
|
||||||
|
|
||||||
|
|
|
@ -42,7 +42,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (seed == -1) {
|
if (seed == -1) {
|
||||||
seed = static_cast<int>(time(NULL));
|
seed = int(time(NULL));
|
||||||
}
|
}
|
||||||
|
|
||||||
srand(seed);
|
srand(seed);
|
||||||
|
@ -110,9 +110,9 @@ int main(int argc, char ** argv) {
|
||||||
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
|
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
// tokenize the prefix and use it as a sink
|
// tokenize the prefix and use it as a sink
|
||||||
const int n_tokens_prefix = static_cast<int>(::llama_tokenize(ctx, prompt_prefix, true).size());
|
const int n_tokens_prefix = int(::llama_tokenize(ctx, prompt_prefix, true).size());
|
||||||
|
|
||||||
const int n_tokens_all = static_cast<int>(tokens_list.size());
|
const int n_tokens_all = int(tokens_list.size());
|
||||||
|
|
||||||
// we leave a margin of 16 tokens for the generated text - it should contain just the passkey
|
// we leave a margin of 16 tokens for the generated text - it should contain just the passkey
|
||||||
const int n_predict = 16;
|
const int n_predict = 16;
|
||||||
|
|
|
@ -95,7 +95,7 @@ static std::vector<float> softmax(const std::vector<float>& logits) {
|
||||||
probs[i] = exp_logit;
|
probs[i] = exp_logit;
|
||||||
}
|
}
|
||||||
for (float& prob : probs) {
|
for (float& prob : probs) {
|
||||||
prob /= static_cast<float>(sum_exp);
|
prob /= float(sum_exp);
|
||||||
}
|
}
|
||||||
return probs;
|
return probs;
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,8 +39,8 @@ int main(int argc, char ** argv) {
|
||||||
auto tokens = llama_tokenize(ctx, params.prompt, true);
|
auto tokens = llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
// evaluate prompt
|
// evaluate prompt
|
||||||
llama_decode(ctx, llama_batch_get_one(tokens.data(), static_cast<llama_pos>(tokens.size()), n_past, 0));
|
llama_decode(ctx, llama_batch_get_one(tokens.data(), llama_pos(tokens.size()), n_past, 0));
|
||||||
n_past += static_cast<int>(tokens.size());
|
n_past += int(tokens.size());
|
||||||
|
|
||||||
// save state (rng, logits, embedding and kv_cache) to file
|
// save state (rng, logits, embedding and kv_cache) to file
|
||||||
{
|
{
|
||||||
|
|
|
@ -3495,7 +3495,7 @@ inline bool read_content_with_length(Stream &strm, uint64_t len,
|
||||||
|
|
||||||
uint64_t r = 0;
|
uint64_t r = 0;
|
||||||
while (r < len) {
|
while (r < len) {
|
||||||
auto read_len = static_cast<std::size_t>(len - r);
|
auto read_len = static_cast<size_t>(len - r);
|
||||||
auto n = strm.read(buf, (std::min)(read_len, CPPHTTPLIB_RECV_BUFSIZ));
|
auto n = strm.read(buf, (std::min)(read_len, CPPHTTPLIB_RECV_BUFSIZ));
|
||||||
if (n <= 0) { return false; }
|
if (n <= 0) { return false; }
|
||||||
|
|
||||||
|
@ -3514,7 +3514,7 @@ inline void skip_content_with_length(Stream &strm, uint64_t len) {
|
||||||
char buf[CPPHTTPLIB_RECV_BUFSIZ];
|
char buf[CPPHTTPLIB_RECV_BUFSIZ];
|
||||||
uint64_t r = 0;
|
uint64_t r = 0;
|
||||||
while (r < len) {
|
while (r < len) {
|
||||||
auto read_len = static_cast<std::size_t>(len - r);
|
auto read_len = static_cast<size_t>(len - r);
|
||||||
auto n = strm.read(buf, (std::min)(read_len, CPPHTTPLIB_RECV_BUFSIZ));
|
auto n = strm.read(buf, (std::min)(read_len, CPPHTTPLIB_RECV_BUFSIZ));
|
||||||
if (n <= 0) { return; }
|
if (n <= 0) { return; }
|
||||||
r += static_cast<uint64_t>(n);
|
r += static_cast<uint64_t>(n);
|
||||||
|
|
|
@ -636,7 +636,7 @@ struct llama_server_context
|
||||||
const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
|
const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
|
||||||
|
|
||||||
slot_image img_sl;
|
slot_image img_sl;
|
||||||
img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : static_cast<int>(slot->images.size());
|
img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : int(slot->images.size());
|
||||||
img_sl.img_data = clip_image_u8_init();
|
img_sl.img_data = clip_image_u8_init();
|
||||||
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
|
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
|
||||||
{
|
{
|
||||||
|
@ -736,7 +736,7 @@ struct llama_server_context
|
||||||
// assign the system KV cache to all parallel sequences
|
// assign the system KV cache to all parallel sequences
|
||||||
for (int32_t i = 1; i < params.n_parallel; ++i)
|
for (int32_t i = 1; i < params.n_parallel; ++i)
|
||||||
{
|
{
|
||||||
llama_kv_cache_seq_cp(ctx, 0, i, 0, static_cast<llama_pos>(system_tokens.size()));
|
llama_kv_cache_seq_cp(ctx, 0, i, 0, llama_pos(system_tokens.size()));
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("system prompt updated\n");
|
LOG_TEE("system prompt updated\n");
|
||||||
|
@ -1401,7 +1401,7 @@ struct llama_server_context
|
||||||
|
|
||||||
slot.i_batch = batch.n_tokens;
|
slot.i_batch = batch.n_tokens;
|
||||||
|
|
||||||
llama_batch_add(batch, slot.sampled, static_cast<llama_pos>(system_tokens.size() + slot.n_past), { slot.id }, true);
|
llama_batch_add(batch, slot.sampled, llama_pos(system_tokens.size() + slot.n_past), { slot.id }, true);
|
||||||
|
|
||||||
slot.n_past += 1;
|
slot.n_past += 1;
|
||||||
}
|
}
|
||||||
|
@ -1463,7 +1463,7 @@ struct llama_server_context
|
||||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
|
prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
|
||||||
}
|
}
|
||||||
|
|
||||||
slot.num_prompt_tokens = static_cast<int32_t>(prompt_tokens.size());
|
slot.num_prompt_tokens = int32_t(prompt_tokens.size());
|
||||||
|
|
||||||
if (slot.params.n_keep < 0)
|
if (slot.params.n_keep < 0)
|
||||||
{
|
{
|
||||||
|
@ -1490,7 +1490,7 @@ struct llama_server_context
|
||||||
slot.truncated = true;
|
slot.truncated = true;
|
||||||
prompt_tokens = new_tokens;
|
prompt_tokens = new_tokens;
|
||||||
|
|
||||||
slot.num_prompt_tokens = static_cast<int32_t>(prompt_tokens.size());
|
slot.num_prompt_tokens = int32_t(prompt_tokens.size());
|
||||||
GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
|
GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1509,7 +1509,7 @@ struct llama_server_context
|
||||||
llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
|
llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
slot.n_past = static_cast<int32_t>(common_part(slot.cache_tokens, prompt_tokens));
|
slot.n_past = int32_t(common_part(slot.cache_tokens, prompt_tokens));
|
||||||
slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
|
slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
|
||||||
|
|
||||||
LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
|
LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
|
||||||
|
@ -1517,7 +1517,7 @@ struct llama_server_context
|
||||||
|
|
||||||
LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
|
LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
|
||||||
|
|
||||||
llama_kv_cache_seq_rm(ctx, slot.id, static_cast<llama_pos>(system_tokens.size() + slot.n_past), -1);
|
llama_kv_cache_seq_rm(ctx, slot.id, llama_pos(system_tokens.size() + slot.n_past), -1);
|
||||||
|
|
||||||
slot.cache_tokens = prompt_tokens;
|
slot.cache_tokens = prompt_tokens;
|
||||||
|
|
||||||
|
@ -1540,7 +1540,7 @@ struct llama_server_context
|
||||||
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
|
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
|
||||||
for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
|
for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
|
||||||
{
|
{
|
||||||
llama_batch_add(batch, prefix_tokens[slot.n_past], static_cast<llama_pos>(system_tokens.size() + slot.n_past), { slot.id }, false);
|
llama_batch_add(batch, prefix_tokens[slot.n_past], llama_pos(system_tokens.size() + slot.n_past), { slot.id }, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (has_images && !ingest_images(slot, n_batch))
|
if (has_images && !ingest_images(slot, n_batch))
|
||||||
|
|
|
@ -441,7 +441,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
|
||||||
{
|
{
|
||||||
for (i = 0; i <4; i++)
|
for (i = 0; i <4; i++)
|
||||||
{
|
{
|
||||||
char_array_4[i] = static_cast<uint8_t>(base64_chars.find(char_array_4[i]));
|
char_array_4[i] = uint8_t(base64_chars.find(char_array_4[i]));
|
||||||
}
|
}
|
||||||
|
|
||||||
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
||||||
|
@ -465,7 +465,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
|
||||||
|
|
||||||
for (j = 0; j <4; j++)
|
for (j = 0; j <4; j++)
|
||||||
{
|
{
|
||||||
char_array_4[j] = static_cast<uint8_t>(base64_chars.find(char_array_4[j]));
|
char_array_4[j] = uint8_t(base64_chars.find(char_array_4[j]));
|
||||||
}
|
}
|
||||||
|
|
||||||
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
||||||
|
|
|
@ -67,8 +67,8 @@ int main(int argc, char ** argv) {
|
||||||
std::vector<llama_token> tokens_list;
|
std::vector<llama_token> tokens_list;
|
||||||
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
|
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
const int n_ctx = static_cast<int>(llama_n_ctx(ctx));
|
const int n_ctx = int(llama_n_ctx(ctx));
|
||||||
const int n_kv_req = static_cast<int>(tokens_list.size() + (n_len - tokens_list.size()));
|
const int n_kv_req = int(tokens_list.size() + (n_len - tokens_list.size()));
|
||||||
|
|
||||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);
|
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);
|
||||||
|
|
||||||
|
@ -96,7 +96,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// evaluate the initial prompt
|
// evaluate the initial prompt
|
||||||
for (size_t i = 0; i < tokens_list.size(); i++) {
|
for (size_t i = 0; i < tokens_list.size(); i++) {
|
||||||
llama_batch_add(batch, tokens_list[i], static_cast<llama_pos>(i), { 0 }, false);
|
llama_batch_add(batch, tokens_list[i], llama_pos(i), { 0 }, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
// llama_decode will output logits only for the last token of the prompt
|
// llama_decode will output logits only for the last token of the prompt
|
||||||
|
|
|
@ -131,7 +131,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
fflush(stderr);
|
fflush(stderr);
|
||||||
|
|
||||||
const int n_input = static_cast<int>(inp.size());
|
const int n_input = int(inp.size());
|
||||||
|
|
||||||
const auto t_enc_start = ggml_time_us();
|
const auto t_enc_start = ggml_time_us();
|
||||||
|
|
||||||
|
@ -152,8 +152,8 @@ int main(int argc, char ** argv) {
|
||||||
int n_drafted = 0;
|
int n_drafted = 0;
|
||||||
int n_accept = 0;
|
int n_accept = 0;
|
||||||
|
|
||||||
int n_past_tgt = static_cast<int>(inp.size());
|
int n_past_tgt = int(inp.size());
|
||||||
int n_past_dft = static_cast<int>(inp.size());
|
int n_past_dft = int(inp.size());
|
||||||
|
|
||||||
// used to determine end of generation
|
// used to determine end of generation
|
||||||
bool has_eos = false;
|
bool has_eos = false;
|
||||||
|
|
16
llama.cpp
16
llama.cpp
|
@ -10992,15 +10992,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
|
||||||
if (llama_is_normal_token(model->vocab, token)) {
|
if (llama_is_normal_token(model->vocab, token)) {
|
||||||
std::string result = model->vocab.id_to_token[token].text;
|
std::string result = model->vocab.id_to_token[token].text;
|
||||||
llama_unescape_whitespace(result);
|
llama_unescape_whitespace(result);
|
||||||
if (length < static_cast<int32_t>(result.length())) {
|
if (length < int32_t(result.length())) {
|
||||||
return -static_cast<int32_t>(result.length());
|
return -int32_t(result.length());
|
||||||
}
|
}
|
||||||
memcpy(buf, result.c_str(), result.length());
|
memcpy(buf, result.c_str(), result.length());
|
||||||
return result.length();
|
return result.length();
|
||||||
} else if (llama_is_user_defined_token(model->vocab, token)) {
|
} else if (llama_is_user_defined_token(model->vocab, token)) {
|
||||||
std::string result = model->vocab.id_to_token[token].text;
|
std::string result = model->vocab.id_to_token[token].text;
|
||||||
if (length < static_cast<int32_t>(result.length())) {
|
if (length < int32_t(result.length())) {
|
||||||
return -static_cast<int32_t>(result.length());
|
return -int32_t(result.length());
|
||||||
}
|
}
|
||||||
memcpy(buf, result.c_str(), result.length());
|
memcpy(buf, result.c_str(), result.length());
|
||||||
return result.length();
|
return result.length();
|
||||||
|
@ -11027,15 +11027,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
|
||||||
if (llama_is_normal_token(model->vocab, token)) {
|
if (llama_is_normal_token(model->vocab, token)) {
|
||||||
std::string result = model->vocab.id_to_token[token].text;
|
std::string result = model->vocab.id_to_token[token].text;
|
||||||
result = llama_decode_text(result);
|
result = llama_decode_text(result);
|
||||||
if (length < static_cast<int32_t>(result.length())) {
|
if (length < int32_t(result.length())) {
|
||||||
return -static_cast<int32_t>(result.length());
|
return -int32_t(result.length());
|
||||||
}
|
}
|
||||||
memcpy(buf, result.c_str(), result.length());
|
memcpy(buf, result.c_str(), result.length());
|
||||||
return result.length();
|
return result.length();
|
||||||
} else if (llama_is_user_defined_token(model->vocab, token)) {
|
} else if (llama_is_user_defined_token(model->vocab, token)) {
|
||||||
std::string result = model->vocab.id_to_token[token].text;
|
std::string result = model->vocab.id_to_token[token].text;
|
||||||
if (length < static_cast<int32_t>(result.length())) {
|
if (length < int32_t(result.length())) {
|
||||||
return -static_cast<int32_t>(result.length());
|
return -int32_t(result.length());
|
||||||
}
|
}
|
||||||
memcpy(buf, result.c_str(), result.length());
|
memcpy(buf, result.c_str(), result.length());
|
||||||
return result.length();
|
return result.length();
|
||||||
|
|
|
@ -63,7 +63,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
||||||
im = nullptr;
|
im = nullptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, static_cast<int>(size/tensor->ne[0]),
|
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, int(size/tensor->ne[0]),
|
||||||
static_cast<int>(tensor->ne[0]), hist, im);
|
static_cast<int>(tensor->ne[0]), hist, im);
|
||||||
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
|
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
|
||||||
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
|
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
|
||||||
|
@ -553,7 +553,7 @@ struct test_case {
|
||||||
|
|
||||||
// duplicate the op
|
// duplicate the op
|
||||||
size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
|
size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
|
||||||
int n_runs = static_cast<int>(std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1);
|
int n_runs = int(std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1);
|
||||||
for (int i = 1; i < n_runs; i++) {
|
for (int i = 1; i < n_runs; i++) {
|
||||||
gf->nodes[gf->n_nodes++] = out;
|
gf->nodes[gf->n_nodes++] = out;
|
||||||
}
|
}
|
||||||
|
@ -584,7 +584,7 @@ struct test_case {
|
||||||
ggml_backend_graph_compute(backend, gf);
|
ggml_backend_graph_compute(backend, gf);
|
||||||
ggml_backend_synchronize(backend);
|
ggml_backend_synchronize(backend);
|
||||||
int64_t end_time = ggml_time_us();
|
int64_t end_time = ggml_time_us();
|
||||||
double time_us = static_cast<double>(end_time - start_time);
|
double time_us = double(end_time - start_time);
|
||||||
|
|
||||||
printf(" %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n",
|
printf(" %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n",
|
||||||
n_runs,
|
n_runs,
|
||||||
|
@ -714,8 +714,7 @@ struct test_dup : public test_case {
|
||||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||||
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
|
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||||
if (_use_permute) {
|
if (_use_permute) {
|
||||||
src = ggml_permute(ctx, src, static_cast<int>(permute[0]), static_cast<int>(permute[1]),
|
src = ggml_permute(ctx, src, int(permute[0]), int(permute[1]), int(permute[2]), int(permute[3]));
|
||||||
static_cast<int>(permute[2]), static_cast<int>(permute[3]));
|
|
||||||
}
|
}
|
||||||
ggml_tensor * out = ggml_dup(ctx, src);
|
ggml_tensor * out = ggml_dup(ctx, src);
|
||||||
return out;
|
return out;
|
||||||
|
@ -1241,7 +1240,7 @@ struct test_argsort : public test_case {
|
||||||
for (int64_t r = 0; r < ggml_nrows(t); r++) {
|
for (int64_t r = 0; r < ggml_nrows(t); r++) {
|
||||||
std::vector<float> data(t->ne[0]);
|
std::vector<float> data(t->ne[0]);
|
||||||
for (int i = 0; i < t->ne[0]; i++) {
|
for (int i = 0; i < t->ne[0]; i++) {
|
||||||
data[i] = static_cast<float>(i);
|
data[i] = float(i);
|
||||||
}
|
}
|
||||||
std::shuffle(data.begin(), data.end(), rng);
|
std::shuffle(data.begin(), data.end(), rng);
|
||||||
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
|
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
|
||||||
|
@ -1423,7 +1422,7 @@ struct test_moe : public test_case {
|
||||||
ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
|
ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
|
||||||
|
|
||||||
ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur);
|
ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur);
|
||||||
ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(static_cast<float>(n_embd)));
|
ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(float(n_embd)));
|
||||||
|
|
||||||
// select experts
|
// select experts
|
||||||
ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok);
|
ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue