common, examples, llama : optimize using reserve if possible

2024-02-16 16:58:45 +03:00 · 2024-02-16 16:58:45 +03:00 · f104678afc
commit f104678afc
parent 0d4177126b
8 changed files with 26 additions and 3 deletions
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -200,7 +200,8 @@ static llama_token llama_sampling_sample_impl(
    }

    cur.clear();
-
+    cur.reserve(n_vocab);
+ 
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
    }
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -883,9 +883,11 @@ size_t tokenize_file(

        // generate sample starts at all token positions
        out_samples_begin.clear();
-        out_samples_begin.push_back(0);
-        out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size()));
        size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0;
+        out_samples_begin.reserve(end);
+        out_samples_begin.push_back(0);
+        out_samples_size.reserve(end);
+        out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size()));
        for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) {
            out_samples_begin.push_back(sample_begin);
            out_samples_size.push_back(context_length);
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1473,6 +1473,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
    std::vector<clip_image_u8*> patches;
    int width = image.nx;
    int height = image.ny;
+    patches.reserve((height / patch_size) * (width / patch_size));
    for (int i = 0; i < height; i += patch_size) {
        for (int j = 0; j < width; j += patch_size) {
            clip_image_u8 *patch = clip_image_u8_init();
@@ -1542,6 +1543,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
        if (params.image_grid_pinpoints[0] != 0) {
            // "spatial_unpad" with "anyres" processing for llava-1.6
            std::vector<std::pair<int, int>> possible_resolutions;
+            possible_resolutions.reserve(16);
            for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
                possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
            }
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -262,6 +262,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        const int32_t * image_grid = clip_image_grid(ctx_clip);

        std::vector<std::pair<int, int>> grid_pinpoints;
+        grid_pinpoints.reserve(16);
        for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
            grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
        }
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -181,6 +181,7 @@ int main(int argc, char ** argv){
                        const int startIdx = i + ngram_size;
                        const int endIdx = startIdx + n_draft;
                        if (endIdx < inp_size) {
+                            draft.reserve(endIdx - startIdx);
                            for (int j = startIdx; j < endIdx; ++j) {
                                LOG(" - draft candidate %d: %d\n", j, inp[j]);
                                draft.push_back(inp[j]);
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -876,10 +876,12 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        // Compute log-probs in parallel
        // First we collect all tasks
        eval_pairs.clear();
+        eval_pairs.reserve((i1 - i0) * 4);
        for (size_t i = i0; i < i1; ++i) {
            auto & hs_cur = hs_data[i];
            size_t li = hs_cur.common_prefix;
            for (int s = 0; s < 4; ++s) {
+                eval_pairs.reserve((hs_cur.seq_tokens[s].size() - 1) - hs_cur.common_prefix);
                for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
                    eval_pairs.emplace_back(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]);
                }
@@ -1148,6 +1150,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
        }

        eval_pairs.clear();
+        eval_pairs.reserve((i1 - i0));
        for (size_t i = i0; i < i1; ++i) {
            auto & task = data[i];

@@ -1158,12 +1161,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
            const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix;
            const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
            size_t li = n_base1 - 1;
+            eval_pairs.reserve((task.seq_tokens[0].size() - 1 - last_1st) - (n_base1 - 1));
            for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
                eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[0][j+1]);
            }
            const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
            const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
            li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1;
+            eval_pairs.reserve((task.seq_tokens[1].size() - 1 - last_2nd) - (n_base2 - 1));
            for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
                eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[1][j+1]);
            }
@@ -1519,10 +1524,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        // Compute log-probs in parallel
        // First we collect all tasks
        eval_pairs.clear();
+        eval_pairs.reserve(i1 - i0);
        for (size_t i = i0; i < i1; ++i) {
            auto& cur_task = tasks[i];
            size_t li = cur_task.common_prefix;
+            eval_pairs.reserve(cur_task.seq_tokens.size());
            for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
+                eval_pairs.reserve((cur_task.seq_tokens[s].size() - 1) - cur_task.common_prefix);
                for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
                    eval_pairs.emplace_back(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]);
                }
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -49,6 +49,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
    std::string ftype_str;

+    ftype_str.reserve(ftype_str_in.size());
    for (auto ch : ftype_str_in) {
        ftype_str.push_back(std::toupper(ch));
    }
--- a/llama.cpp
+++ b/llama.cpp
@@ -1107,6 +1107,7 @@ struct llama_mmap {

        // update the list of mapped fragments to avoid unmapping the same range again in the destructor
        std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
+        new_mapped_fragments.reserve(mapped_fragments.size());
        for (const auto & frag : mapped_fragments) {
            if (frag.first < first && frag.second > last) {
                // the range is in the middle of the fragment, split it
@@ -7908,6 +7909,7 @@ struct llm_tokenizer_spm {
        // split string into utf8 chars
        int index = 0;
        size_t offs = 0;
+        symbols.reserve(text.size());
        while (offs < text.size()) {
            llm_symbol sym;
            size_t len = utf8_len(text[offs]);
@@ -8065,6 +8067,7 @@ struct llm_tokenizer_bpe {
            int index = 0;
            size_t offset = 0;

+            symbols.reserve(word.size());
            while (offset < word.size()) {
                llm_symbol sym;
                size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
@@ -8138,6 +8141,7 @@ struct llm_tokenizer_bpe {
                const auto token = vocab.token_to_id.find(str);

                if (token == vocab.token_to_id.end()) {
+                    output.reserve(str.end() - str.begin());
                    for (auto j = str.begin(); j != str.end(); ++j) {
                        std::string byte_str(1, *j);
                        auto token_multibyte = vocab.token_to_id.find(byte_str);
@@ -8309,6 +8313,7 @@ private:
            }
        }

+        bpe_encoded_words.reserve(bpe_words.size());
        for (std::string & word : bpe_words) {
            std::string encoded_token = "";
            for (char & c : word) {
@@ -10194,6 +10199,7 @@ static void llama_convert_tensor_internal(
    size_t in_buff_offs = 0;
    size_t out_buff_offs = 0;

+    workers.reserve(nthread);
    for (int tnum = 0; tnum < nthread; tnum++) {
        size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
        size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -10697,6 +10703,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                                first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
                    }
                };
+                workers.reserve(nthread_use - 1);
                for (int it = 0; it < nthread_use - 1; ++it) {
                    workers.emplace_back(compute);
                }