From 1e7a033f10891e502e19b19ebc7da918409fe21e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov <ggerganov@gmail.com> Date: Sat, 26 Aug 2023 17:42:33 +0300 Subject: [PATCH] common : add comments --- common/common.h | 6 ++++++ llama.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/common/common.h b/common/common.h index cb1627fc6..1c1acf989 100644 --- a/common/common.h +++ b/common/common.h @@ -116,15 +116,21 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param // Vocab utils // +// tokenizes a string into a vector of tokens +// should work similarly to Python's `tokenizer.encode` std::vector<llama_token> llama_tokenize( struct llama_context * ctx, const std::string & text, bool add_bos); +// converts a token into a piece +// should work similarly to Python's `tokenizer.id_to_piece` std::string llama_token_to_piece( const struct llama_context * ctx, llama_token token); +// detokenizes a vector of tokens into a string +// should work similarly to Python's `tokenizer.decode` // removes the leading space from the first non-BOS token std::string llama_detokenize( llama_context * ctx, diff --git a/llama.h b/llama.h index f9a7300ea..b084fe23c 100644 --- a/llama.h +++ b/llama.h @@ -384,7 +384,7 @@ extern "C" { // Token Id -> Piece. // Uses the vocabulary in the provided context. // Does not write null terminator to the buffer. - // Use code is responsible to remove the leading whitespace of the first non-BOS token. + // User code is responsible for removing the leading whitespace of the first non-BOS token when decoding multiple tokens. LLAMA_API int llama_token_to_piece( const struct llama_context * ctx, llama_token token,