diff --git a/common/sampling.cpp b/common/sampling.cpp
index a001750da..e72b970d3 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -200,7 +200,8 @@ static llama_token llama_sampling_sample_impl(
     }

     cur.clear();
-
+    cur.reserve(n_vocab);
+
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
         cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
     }
diff --git a/common/train.cpp b/common/train.cpp
index e4c3d5df6..199450fa0 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -883,9 +883,11 @@ size_t tokenize_file(

     // generate sample starts at all token positions
     out_samples_begin.clear();
-    out_samples_begin.push_back(0);
-    out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size()));
     size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0;
+    out_samples_begin.reserve(end);
+    out_samples_begin.push_back(0);
+    out_samples_size.reserve(end);
+    out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size()));
     for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) {
         out_samples_begin.push_back(sample_begin);
         out_samples_size.push_back(context_length);
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 2cad27e82..d75308dd7 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1473,6 +1473,7 @@ static std::vector<clip_image_u8 *> divide_to_patches_u8(const clip_image_u8 & im
     std::vector<clip_image_u8 *> patches;
     int width = image.nx;
     int height = image.ny;
+    patches.reserve((height / patch_size) * (width / patch_size));
     for (int i = 0; i < height; i += patch_size) {
         for (int j = 0; j < width; j += patch_size) {
             clip_image_u8 *patch = clip_image_u8_init();
@@ -1542,6 +1543,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
     if (params.image_grid_pinpoints[0] != 0) {
         // "spatial_unpad" with "anyres" processing for llava-1.6
         std::vector<std::pair<int, int>> possible_resolutions;
+        possible_resolutions.reserve(16);
         for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
             possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
         }
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 4ed310a0e..e1ff19d6f 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -262,6 +262,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         const int32_t * image_grid = clip_image_grid(ctx_clip);

         std::vector<std::pair<int, int>> grid_pinpoints;
+        grid_pinpoints.reserve(16);
         for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
             grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
         }
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index 18235b8a1..ed111c401 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -181,6 +181,7 @@ int main(int argc, char ** argv){
             const int startIdx = i + ngram_size;
             const int endIdx = startIdx + n_draft;
             if (endIdx < inp_size) {
+                draft.reserve(endIdx - startIdx);
                 for (int j = startIdx; j < endIdx; ++j) {
                     LOG(" - draft candidate %d: %d\n", j, inp[j]);
                     draft.push_back(inp[j]);
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index b2c131d4c..6122bc162 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -876,10 +876,12 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         // Compute log-probs in parallel
         // First we collect all tasks
         eval_pairs.clear();
+        eval_pairs.reserve((i1 - i0) * 4);
         for (size_t i = i0; i < i1; ++i) {
             auto & hs_cur = hs_data[i];
             size_t li = hs_cur.common_prefix;
             for (int s = 0; s < 4; ++s) {
+                eval_pairs.reserve((hs_cur.seq_tokens[s].size() - 1) - hs_cur.common_prefix);
                 for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
                     eval_pairs.emplace_back(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]);
                 }
@@ -1148,6 +1150,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
         }

         eval_pairs.clear();
+        eval_pairs.reserve((i1 - i0));
         for (size_t i = i0; i < i1; ++i) {
             auto & task = data[i];

@@ -1158,12 +1161,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix;
             const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
             size_t li = n_base1 - 1;
+            eval_pairs.reserve((task.seq_tokens[0].size() - 1 - last_1st) - (n_base1 - 1));
             for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
                 eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[0][j+1]);
             }
             const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
             const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
             li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1;
+            eval_pairs.reserve((task.seq_tokens[1].size() - 1 - last_2nd) - (n_base2 - 1));
             for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
                 eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[1][j+1]);
             }
@@ -1519,10 +1524,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         // Compute log-probs in parallel
         // First we collect all tasks
         eval_pairs.clear();
+        eval_pairs.reserve(i1 - i0);
         for (size_t i = i0; i < i1; ++i) {
             auto& cur_task = tasks[i];
             size_t li = cur_task.common_prefix;
+            eval_pairs.reserve(cur_task.seq_tokens.size());
             for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
+                eval_pairs.reserve((cur_task.seq_tokens[s].size() - 1) - cur_task.common_prefix);
                 for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
                     eval_pairs.emplace_back(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]);
                 }
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 85f403ffc..296251be5 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -49,6 +49,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {

 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
     std::string ftype_str;
+    ftype_str.reserve(ftype_str_in.size());
     for (auto ch : ftype_str_in) {
         ftype_str.push_back(std::toupper(ch));
     }
diff --git a/llama.cpp b/llama.cpp
index 14e8821cd..e9f7298fe 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1107,6 +1107,7 @@ struct llama_mmap {

         // update the list of mapped fragments to avoid unmapping the same range again in the destructor
         std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
+        new_mapped_fragments.reserve(mapped_fragments.size());
         for (const auto & frag : mapped_fragments) {
             if (frag.first < first && frag.second > last) {
                 // the range is in the middle of the fragment, split it
@@ -7908,6 +7909,7 @@ struct llm_tokenizer_spm {
         // split string into utf8 chars
         int index = 0;
         size_t offs = 0;
+        symbols.reserve(text.size());
         while (offs < text.size()) {
             llm_symbol sym;
             size_t len = utf8_len(text[offs]);
@@ -8065,6 +8067,7 @@ struct llm_tokenizer_bpe {

             int index = 0;
             size_t offset = 0;
+            symbols.reserve(word.size());
             while (offset < word.size()) {
                 llm_symbol sym;
                 size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
@@ -8138,6 +8141,7 @@ struct llm_tokenizer_bpe {

                 const auto token = vocab.token_to_id.find(str);
                 if (token == vocab.token_to_id.end()) {
+                    output.reserve(str.end() - str.begin());
                     for (auto j = str.begin(); j != str.end(); ++j) {
                         std::string byte_str(1, *j);
                         auto token_multibyte = vocab.token_to_id.find(byte_str);
@@ -8309,6 +8313,7 @@ private:
             }
         }

+        bpe_encoded_words.reserve(bpe_words.size());
         for (std::string & word : bpe_words) {
             std::string encoded_token = "";
             for (char & c : word) {
@@ -10194,6 +10199,7 @@ static void llama_convert_tensor_internal(
     size_t in_buff_offs = 0;
     size_t out_buff_offs = 0;

+    workers.reserve(nthread);
     for (int tnum = 0; tnum < nthread; tnum++) {
         size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
         size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -10697,6 +10703,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
             }
         };
+        workers.reserve(nthread_use - 1);
         for (int it = 0; it < nthread_use - 1; ++it) {
             workers.emplace_back(compute);
         }
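Every hunk above applies the same pattern: when the number of elements a `std::vector` (or `std::string`) will receive is known, or at least boundable, before a `push_back`/`emplace_back` loop, a single up-front `reserve()` avoids the repeated geometric-growth reallocations (and element moves) the loop would otherwise trigger. A minimal standalone sketch of the effect, for illustration only (not part of the patch; `count_reallocs` is a made-up helper, and the exact counts depend on the standard library's growth factor):

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// Push n elements and count how many times the vector had to grow its
// backing storage, using capacity changes as a proxy for reallocations.
static int count_reallocs(size_t n, bool reserve_first) {
    std::vector<int> v;
    if (reserve_first) {
        v.reserve(n); // one allocation up front, sized exactly
    }
    int reallocs = 0;
    size_t last_cap = v.capacity();
    for (size_t i = 0; i < n; ++i) {
        v.push_back((int) i);
        if (v.capacity() != last_cap) { // capacity grew => storage was reallocated
            ++reallocs;
            last_cap = v.capacity();
        }
    }
    return reallocs;
}

int main() {
    // Typically ~20-40 growths without reserve (O(log n) for a 1.5x-2x growth
    // factor) and 0 with it, since the single allocation made by reserve()
    // happens before the loop.
    printf("without reserve: %d\n", count_reallocs(1000000, false));
    printf("with reserve:    %d\n", count_reallocs(1000000, true));
    return 0;
}
```

Two properties worth keeping in mind when reviewing the call sites: `reserve()` never shrinks, so a request at or below the current capacity is a no-op, and the argument does not have to be exact; a reasonable upper bound or estimate already eliminates most of the reallocation work.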