common, examples, llama : optimize using reserve if possible
This commit is contained in:
parent
0d4177126b
commit
f104678afc
8 changed files with 26 additions and 3 deletions
|
@ -200,7 +200,8 @@ static llama_token llama_sampling_sample_impl(
|
|||
}
|
||||
|
||||
cur.clear();
|
||||
|
||||
cur.reserve(n_vocab);
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
|
|
@ -883,9 +883,11 @@ size_t tokenize_file(
|
|||
|
||||
// generate sample starts at all token positions
|
||||
out_samples_begin.clear();
|
||||
out_samples_begin.push_back(0);
|
||||
out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size()));
|
||||
size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0;
|
||||
out_samples_begin.reserve(end);
|
||||
out_samples_begin.push_back(0);
|
||||
out_samples_size.reserve(end);
|
||||
out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size()));
|
||||
for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) {
|
||||
out_samples_begin.push_back(sample_begin);
|
||||
out_samples_size.push_back(context_length);
|
||||
|
|
|
@ -1473,6 +1473,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
|
|||
std::vector<clip_image_u8*> patches;
|
||||
int width = image.nx;
|
||||
int height = image.ny;
|
||||
patches.reserve((height / patch_size) * (width / patch_size));
|
||||
for (int i = 0; i < height; i += patch_size) {
|
||||
for (int j = 0; j < width; j += patch_size) {
|
||||
clip_image_u8 *patch = clip_image_u8_init();
|
||||
|
@ -1542,6 +1543,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
|||
if (params.image_grid_pinpoints[0] != 0) {
|
||||
// "spatial_unpad" with "anyres" processing for llava-1.6
|
||||
std::vector<std::pair<int, int>> possible_resolutions;
|
||||
possible_resolutions.reserve(16);
|
||||
for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
|
||||
possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
|
||||
}
|
||||
|
|
|
@ -262,6 +262,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||
const int32_t * image_grid = clip_image_grid(ctx_clip);
|
||||
|
||||
std::vector<std::pair<int, int>> grid_pinpoints;
|
||||
grid_pinpoints.reserve(16);
|
||||
for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
|
||||
grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
|
||||
}
|
||||
|
|
|
@ -181,6 +181,7 @@ int main(int argc, char ** argv){
|
|||
const int startIdx = i + ngram_size;
|
||||
const int endIdx = startIdx + n_draft;
|
||||
if (endIdx < inp_size) {
|
||||
draft.reserve(endIdx - startIdx);
|
||||
for (int j = startIdx; j < endIdx; ++j) {
|
||||
LOG(" - draft candidate %d: %d\n", j, inp[j]);
|
||||
draft.push_back(inp[j]);
|
||||
|
|
|
@ -876,10 +876,12 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||
// Compute log-probs in parallel
|
||||
// First we collect all tasks
|
||||
eval_pairs.clear();
|
||||
eval_pairs.reserve((i1 - i0) * 4);
|
||||
for (size_t i = i0; i < i1; ++i) {
|
||||
auto & hs_cur = hs_data[i];
|
||||
size_t li = hs_cur.common_prefix;
|
||||
for (int s = 0; s < 4; ++s) {
|
||||
eval_pairs.reserve((hs_cur.seq_tokens[s].size() - 1) - hs_cur.common_prefix);
|
||||
for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
|
||||
eval_pairs.emplace_back(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]);
|
||||
}
|
||||
|
@ -1148,6 +1150,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
|||
}
|
||||
|
||||
eval_pairs.clear();
|
||||
eval_pairs.reserve((i1 - i0));
|
||||
for (size_t i = i0; i < i1; ++i) {
|
||||
auto & task = data[i];
|
||||
|
||||
|
@ -1158,12 +1161,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
|||
const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix;
|
||||
const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
|
||||
size_t li = n_base1 - 1;
|
||||
eval_pairs.reserve((task.seq_tokens[0].size() - 1 - last_1st) - (n_base1 - 1));
|
||||
for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
|
||||
eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[0][j+1]);
|
||||
}
|
||||
const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
|
||||
const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
|
||||
li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1;
|
||||
eval_pairs.reserve((task.seq_tokens[1].size() - 1 - last_2nd) - (n_base2 - 1));
|
||||
for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
|
||||
eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[1][j+1]);
|
||||
}
|
||||
|
@ -1519,10 +1524,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||
// Compute log-probs in parallel
|
||||
// First we collect all tasks
|
||||
eval_pairs.clear();
|
||||
eval_pairs.reserve(i1 - i0);
|
||||
for (size_t i = i0; i < i1; ++i) {
|
||||
auto& cur_task = tasks[i];
|
||||
size_t li = cur_task.common_prefix;
|
||||
eval_pairs.reserve(cur_task.seq_tokens.size());
|
||||
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
|
||||
eval_pairs.reserve((cur_task.seq_tokens[s].size() - 1) - cur_task.common_prefix);
|
||||
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
|
||||
eval_pairs.emplace_back(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]);
|
||||
}
|
||||
|
|
|
@ -49,6 +49,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
|||
static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
|
||||
std::string ftype_str;
|
||||
|
||||
ftype_str.reserve(ftype_str_in.size());
|
||||
for (auto ch : ftype_str_in) {
|
||||
ftype_str.push_back(std::toupper(ch));
|
||||
}
|
||||
|
|
|
@ -1107,6 +1107,7 @@ struct llama_mmap {
|
|||
|
||||
// update the list of mapped fragments to avoid unmapping the same range again in the destructor
|
||||
std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
|
||||
new_mapped_fragments.reserve(mapped_fragments.size());
|
||||
for (const auto & frag : mapped_fragments) {
|
||||
if (frag.first < first && frag.second > last) {
|
||||
// the range is in the middle of the fragment, split it
|
||||
|
@ -7908,6 +7909,7 @@ struct llm_tokenizer_spm {
|
|||
// split string into utf8 chars
|
||||
int index = 0;
|
||||
size_t offs = 0;
|
||||
symbols.reserve(text.size());
|
||||
while (offs < text.size()) {
|
||||
llm_symbol sym;
|
||||
size_t len = utf8_len(text[offs]);
|
||||
|
@ -8065,6 +8067,7 @@ struct llm_tokenizer_bpe {
|
|||
int index = 0;
|
||||
size_t offset = 0;
|
||||
|
||||
symbols.reserve(word.size());
|
||||
while (offset < word.size()) {
|
||||
llm_symbol sym;
|
||||
size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
|
||||
|
@ -8138,6 +8141,7 @@ struct llm_tokenizer_bpe {
|
|||
const auto token = vocab.token_to_id.find(str);
|
||||
|
||||
if (token == vocab.token_to_id.end()) {
|
||||
output.reserve(str.end() - str.begin());
|
||||
for (auto j = str.begin(); j != str.end(); ++j) {
|
||||
std::string byte_str(1, *j);
|
||||
auto token_multibyte = vocab.token_to_id.find(byte_str);
|
||||
|
@ -8309,6 +8313,7 @@ private:
|
|||
}
|
||||
}
|
||||
|
||||
bpe_encoded_words.reserve(bpe_words.size());
|
||||
for (std::string & word : bpe_words) {
|
||||
std::string encoded_token = "";
|
||||
for (char & c : word) {
|
||||
|
@ -10194,6 +10199,7 @@ static void llama_convert_tensor_internal(
|
|||
size_t in_buff_offs = 0;
|
||||
size_t out_buff_offs = 0;
|
||||
|
||||
workers.reserve(nthread);
|
||||
for (int tnum = 0; tnum < nthread; tnum++) {
|
||||
size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
|
||||
size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
|
||||
|
@ -10697,6 +10703,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
|
||||
}
|
||||
};
|
||||
workers.reserve(nthread_use - 1);
|
||||
for (int it = 0; it < nthread_use - 1; ++it) {
|
||||
workers.emplace_back(compute);
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue