common : add comments

This commit is contained in:
Georgi Gerganov 2023-08-26 17:42:33 +03:00
parent 9668aa115c
commit 1e7a033f10
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
2 changed files with 7 additions and 1 deletions

View file

@ -116,15 +116,21 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
// Vocab utils // Vocab utils
// //
// tokenizes a string into a vector of tokens
// should work similarly to Python's `tokenizer.encode`
std::vector<llama_token> llama_tokenize( std::vector<llama_token> llama_tokenize(
struct llama_context * ctx, struct llama_context * ctx,
const std::string & text, const std::string & text,
bool add_bos); bool add_bos);
// converts a single token id into its text piece
// should work similarly to Python's `tokenizer.id_to_piece`
std::string llama_token_to_piece( std::string llama_token_to_piece(
const struct llama_context * ctx, const struct llama_context * ctx,
llama_token token); llama_token token);
// detokenizes a vector of tokens into a string
// should work similarly to Python's `tokenizer.decode`
// removes the leading space from the first non-BOS token // removes the leading space from the first non-BOS token
std::string llama_detokenize( std::string llama_detokenize(
llama_context * ctx, llama_context * ctx,

View file

@ -384,7 +384,7 @@ extern "C" {
// Token Id -> Piece. // Token Id -> Piece.
// Uses the vocabulary in the provided context. // Uses the vocabulary in the provided context.
// Does not write null terminator to the buffer. // Does not write null terminator to the buffer.
// Use code is responsible to remove the leading whitespace of the first non-BOS token. // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
LLAMA_API int llama_token_to_piece( LLAMA_API int llama_token_to_piece(
const struct llama_context * ctx, const struct llama_context * ctx,
llama_token token, llama_token token,