Introduce C-style API (#370)
* Major refactoring - introduce C-style API * Clean up * Add <cassert> * Add <iterator> * Add <algorithm> .... * Fix timing reporting and accumulation * Measure eval time only for single-token calls * Change llama_tokenize return meaning
This commit is contained in:
parent
da0e9fe90c
commit
f5a77a629b
14 changed files with 1954 additions and 1752 deletions
61
utils.h
61
utils.h
|
@@ -2,8 +2,9 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
#include <thread>
|
||||
|
@@ -49,64 +50,8 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
|
|||
|
||||
std::string gpt_random_prompt(std::mt19937 & rng);
|
||||
|
||||
//
|
||||
// Model file parsing
|
||||
//
|
||||
|
||||
#define FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
|
||||
#define FILE_MAGIC 0x67676d66 // 'ggmf' in hex
|
||||
#define FILE_VERSION 1
|
||||
|
||||
//
|
||||
// Vocab utils
|
||||
//
|
||||
|
||||
struct llama_vocab {
|
||||
using id = int32_t;
|
||||
using token = std::string;
|
||||
|
||||
struct token_score {
|
||||
token tok;
|
||||
float score;
|
||||
};
|
||||
|
||||
std::unordered_map<token, id> token_to_id;
|
||||
std::vector<token_score> id_to_token;
|
||||
};
|
||||
|
||||
void replace(std::string & str, const std::string & needle, const std::string & replacement);
|
||||
|
||||
// poor-man's JSON parsing
|
||||
std::unordered_map<std::string, int32_t> json_parse(const std::string & fname);
|
||||
|
||||
// TODO: temporary until #77 is merged, need this now for some tokenizer tests
|
||||
bool llama_vocab_load(const std::string & fname, llama_vocab & vocab);
|
||||
|
||||
// TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
|
||||
// ref: https://github.com/google/sentencepiece
|
||||
std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos);
|
||||
|
||||
// sample next token given probabilities for each embedding
|
||||
//
|
||||
// - consider only the top K tokens
|
||||
// - from them, consider only the top tokens with cumulative probability > P
|
||||
//
|
||||
llama_vocab::id llama_sample_top_p_top_k(
|
||||
const llama_vocab & vocab,
|
||||
const float * logits,
|
||||
std::vector<llama_vocab::id> & last_n_tokens,
|
||||
double repeat_penalty,
|
||||
int top_k,
|
||||
double top_p,
|
||||
double temp,
|
||||
std::mt19937 & rng);
|
||||
|
||||
// filter to top K tokens from list of logits
|
||||
void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k);
|
||||
|
||||
//
|
||||
// Quantization
|
||||
//
|
||||
|
||||
size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
|
||||
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
|
||||
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue