Introduce C-style API (#370)

* Major refactoring - introduce C-style API
* Clean up
* Add <cassert>
* Add <iterator>
* Add <algorithm>
* ....
* Fix timing reporting and accumulation
* Measure eval time only for single-token calls
* Change llama_tokenize return meaning
2023-03-22 07:32:36 +02:00 · 2023-03-22 07:32:36 +02:00 · f5a77a629b
commit f5a77a629b
parent da0e9fe90c
14 changed files with 1954 additions and 1752 deletions
--- a/utils.h
+++ b/utils.h
@@ -2,8 +2,9 @@

 #pragma once

+#include "llama.h"
+
 #include <string>
-#include <unordered_map>
 #include <vector>
 #include <random>
 #include <thread>
@@ -49,64 +50,8 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

 std::string gpt_random_prompt(std::mt19937 & rng);

-//
-// Model file parsing
-//
-
-#define FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
-#define FILE_MAGIC 0x67676d66 // 'ggmf' in hex
-#define FILE_VERSION 1
-
 //
 // Vocab utils
 //

-struct llama_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    struct token_score {
-        token tok;
-        float score;
-    };
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
-};
-
-void replace(std::string & str, const std::string & needle, const std::string & replacement);
-
-// poor-man's JSON parsing
-std::unordered_map<std::string, int32_t> json_parse(const std::string & fname);
-
-// TODO: temporary until #77 is merged, need this now for some tokenizer tests
-bool llama_vocab_load(const std::string & fname, llama_vocab & vocab);
-
-// TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
-// ref: https://github.com/google/sentencepiece
-std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos);
-
-// sample next token given probabilities for each embedding
-//
-//   - consider only the top K tokens
-//   - from them, consider only the top tokens with cumulative probability > P
-//
-llama_vocab::id llama_sample_top_p_top_k(
-        const llama_vocab & vocab,
-        const float * logits,
-        std::vector<llama_vocab::id> & last_n_tokens,
-        double repeat_penalty,
-        int top_k,
-        double top_p,
-        double temp,
-        std::mt19937 & rng);
-
-// filer to top K tokens from list of logits
-void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k);
-
-//
-// Quantization
-//
-
-size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
-size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
+std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);