We could use std::unordered_map over std::map (#305)

* Improve performance by replacing std::map with std::unordered_map, and by changing `std::map<id, token> id_to_token;` to `std::vector<token> id_to_token;`

* Fix the previous commit: in gpt_vocab_init, add vocab.id_to_token.resize(vocab.token_to_id.size());

* Removed include <map>

* Nest struct token_score inside gpt_vocab

* renamed token to tok
This commit is contained in:
Fabio R. Sluzala 2023-03-21 14:21:50 -03:00 committed by GitHub
parent 89d5d90f3b
commit 353ec251a4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 36 additions and 24 deletions

14
utils.h
View file

@ -3,7 +3,7 @@
#pragma once
#include <string>
#include <map>
#include <unordered_map>
#include <vector>
#include <random>
#include <thread>
@ -65,15 +65,19 @@ struct llama_vocab {
// Integer type used for token ids.
using id = int32_t;
// Type used for token text.
using token = std::string;
std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
std::map<id, float> score;
// One vocabulary entry: a token's text paired with its score.
// Used as the element type of the id_to_token vector.
struct token_score {
token tok;   // token text (token is an alias for std::string)
float score; // score for this token; no default member initializer is given here
};
// Hash-map lookup from token text to its id.
std::unordered_map<token, id> token_to_id;
// Reverse lookup: element i holds the text and score of the token with id i.
// NOTE(review): indexing by id presumes ids are dense and in [0, size()); the
// commit message mentions resizing this to token_to_id.size() -- confirm at the
// call sites that fill it.
std::vector<token_score> id_to_token;
};
// Takes `str` by mutable reference and `needle` / `replacement` by const reference.
// NOTE(review): presumably replaces occurrences of `needle` in `str` with
// `replacement` in place -- the definition is not visible here; confirm whether
// all occurrences or only the first are replaced.
void replace(std::string & str, const std::string & needle, const std::string & replacement);
// poor-man's JSON parsing
// Returns a mapping from string keys to int32_t values.
// NOTE(review): `fname` is presumably the path of the JSON file to read -- confirm
// against the definition, including what is returned when parsing fails.
std::map<std::string, int32_t> json_parse(const std::string & fname);
std::unordered_map<std::string, int32_t> json_parse(const std::string & fname);
// TODO: temporary until #77 is merged, need this now for some tokenizer tests
// Takes `vocab` by mutable reference, so it can be filled in by the call.
// NOTE(review): presumably loads the vocabulary from the file `fname` and returns
// true on success -- the definition is not visible here; confirm.
bool llama_vocab_load(const std::string & fname, llama_vocab & vocab);