We could use std::unordered_map over std::map (#305)

* Improve performance by changing std::map to std::unordered_map, and std::map<id, token> id_to_token; to std::vector<token> id_to_token;
* Fix last commit on gpt_vocab_init: add vocab.id_to_token.resize(vocab.token_to_id.size());
* Removed include <map>
* Nest struct token_score inside gpt_vocab
* Renamed token to tok
2023-03-21 14:21:50 -03:00 · 2023-03-21 14:21:50 -03:00 · 353ec251a4
commit 353ec251a4
parent 89d5d90f3b
4 changed files with 36 additions and 24 deletions
--- a/utils.h
+++ b/utils.h
@@ -3,7 +3,7 @@
 #pragma once

 #include <string>
-#include <map>
+#include <unordered_map>
 #include <vector>
 #include <random>
 #include <thread>
@@ -65,15 +65,19 @@ struct llama_vocab {
    using id    = int32_t;
    using token = std::string;

-    std::map<token, id> token_to_id;
-    std::map<id, token> id_to_token;
-    std::map<id, float> score;
+    struct token_score {
+        token tok;
+        float score;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_score> id_to_token;
 };

 void replace(std::string & str, const std::string & needle, const std::string & replacement);

 // poor-man's JSON parsing
-std::map<std::string, int32_t> json_parse(const std::string & fname);
+std::unordered_map<std::string, int32_t> json_parse(const std::string & fname);

 // TODO: temporary until #77 is merged, need this now for some tokenizer tests
 bool llama_vocab_load(const std::string & fname, llama_vocab & vocab);