gguf : new file format with flexible meta data (beta) (#2398)
* gguf : first API pass
* gguf : read header + meta data
* gguf : read tensor info
* gguf : initial model loading - not tested
* gguf : add gguf_get_tensor_name()
* gguf : do not support passing existing ggml_context to gguf_init
* gguf : simplify gguf_get_val
* gguf : gguf.c is now part of ggml.c
* gguf : read / write sample models
* gguf : add comments
* refactor : reduce code duplication and better API (#2415)
* gguf : expose the gguf_type enum through the API for now
* gguf : add array support
* gguf.py : some code style changes
* convert.py : start a new simplified implementation by removing old stuff
* convert.py : remove GGML vocab + other obsolete stuff
* GGUF : write tensor (#2426)
* WIP: Write tensor
* GGUF : Support writing tensors in Python
* refactor : rm unused import and upd todos
* fix : fix errors upd writing example
* rm example.gguf
* gitignore *.gguf
* undo formatting
* gguf : add gguf_find_key (#2438)
* gguf.cpp : find key example
* ggml.h : add gguf_find_key
* ggml.c : add gguf_find_key
* gguf : fix writing tensors
* gguf : do not hardcode tensor names to read
* gguf : write sample tensors to read
* gguf : add tokenization constants
* quick and dirty conversion example
* gguf : fix writing gguf arrays
* gguf : write tensors one by one and code reuse
* gguf : fix writing gguf arrays
* gguf : write tensors one by one
* gguf : write tensors one by one
* gguf : write tokenizer data
* gguf : upd gguf conversion script
* Update convert-llama-h5-to-gguf.py
* gguf : handle already encoded string
* ggml.h : get array str and f32
* ggml.c : get arr str and f32
* gguf.py : support any type
* Update convert-llama-h5-to-gguf.py
* gguf : fix set is not subscriptable
* gguf : update convert-llama-h5-to-gguf.py
* constants.py : add layer norm eps
* gguf.py : add layer norm eps and merges
* ggml.h : increase GGML_MAX_NAME to 64
* ggml.c : add gguf_get_arr_n
* Update convert-llama-h5-to-gguf.py
* add gptneox gguf example
* Makefile : add gptneox gguf example
* Update convert-llama-h5-to-gguf.py
* add gptneox gguf example
* Update convert-llama-h5-to-gguf.py
* Update convert-gptneox-h5-to-gguf.py
* Update convert-gptneox-h5-to-gguf.py
* Update convert-llama-h5-to-gguf.py
* gguf : support custom alignment value
* gguf : fix typo in function call
* gguf : mmap tensor data example
* fix : update convert-llama-h5-to-gguf.py
* Update convert-llama-h5-to-gguf.py
* convert-gptneox-h5-to-gguf.py : Special tokens
* gptneox-main.cpp : special tokens
* Update gptneox-main.cpp
* constants.py : special tokens
* gguf.py : accumulate kv and tensor info data + special tokens
* convert-gptneox-h5-to-gguf.py : accumulate kv and ti + special tokens
* gguf : gguf counterpart of llama-util.h
* gguf-util.h : update note
* convert-llama-h5-to-gguf.py : accumulate kv / ti + special tokens
* convert-llama-h5-to-gguf.py : special tokens
* Delete gptneox-common.cpp
* Delete gptneox-common.h
* convert-gptneox-h5-to-gguf.py : gpt2bpe tokenizer
* gptneox-main.cpp : gpt2 bpe tokenizer
* gpt2 bpe tokenizer (handles merges and unicode)
* Makefile : remove gptneox-common
* gguf.py : bytearray for gpt2bpe tokenizer
* cmpnct_gpt2bpe.hpp : comments
* gguf.py : use custom alignment if present
* gguf : minor stuff
* Update gptneox-main.cpp
* map tensor names
* convert-gptneox-h5-to-gguf.py : map tensor names
* convert-llama-h5-to-gguf.py : map tensor names
* gptneox-main.cpp : map tensor names
* gguf : start implementing libllama in GGUF (WIP)
* gguf : start implementing libllama in GGUF (WIP)
* rm binary committed by mistake
* upd .gitignore
* gguf : calculate n_mult
* gguf : inference with 7B model working (WIP)
* gguf : rm deprecated function
* gguf : start implementing gguf_file_saver (WIP)
* gguf : start implementing gguf_file_saver (WIP)
* gguf : start implementing gguf_file_saver (WIP)
* gguf : add gguf_get_kv_type
* gguf : add gguf_get_kv_type
* gguf : write metadata in gguf_file_saver (WIP)
* gguf : write metadata in gguf_file_saver (WIP)
* gguf : write metadata in gguf_file_saver
* gguf : rm references to old file formats
* gguf : shorter name for member variable
* gguf : rm redundant method
* gguf : get rid of n_mult, read n_ff from file
* Update gguf_tensor_map.py
* Update gptneox-main.cpp
* gguf : rm references to old file magics
* gguf : start implementing quantization (WIP)
* gguf : start implementing quantization (WIP)
* gguf : start implementing quantization (WIP)
* gguf : start implementing quantization (WIP)
* gguf : start implementing quantization (WIP)
* gguf : start implementing quantization (WIP)
* gguf : quantization is working
* gguf : proper closing of file
* gguf.py : no need to convert tensors twice
* convert-gptneox-h5-to-gguf.py : no need to convert tensors twice
* convert-llama-h5-to-gguf.py : no need to convert tensors twice
* convert-gptneox-h5-to-gguf.py : simplify nbytes
* convert-llama-h5-to-gguf.py : simplify nbytes
* gptneox-main.cpp : n_layer --> n_block
* constants.py : n_layer --> n_block
* gguf.py : n_layer --> n_block
* convert-gptneox-h5-to-gguf.py : n_layer --> n_block
* convert-llama-h5-to-gguf.py : n_layer --> n_block
* gptneox-main.cpp : n_layer --> n_block
* Update gguf_tensor_map.py
* convert-gptneox-h5-to-gguf.py : load model in parts to save memory
* convert-llama-h5-to-gguf.py : load model in parts to save memory
* convert : write more metadata for LLaMA
* convert : rm quantization version
* convert-gptneox-h5-to-gguf.py : add file_type key
* gptneox-main.cpp : add file_type key
* fix conflicts
* gguf : add todos and comments
* convert-gptneox-h5-to-gguf.py : tensor name map changes
* Create gguf_namemap.py : tensor name map changes
* Delete gguf_tensor_map.py
* gptneox-main.cpp : tensor name map changes
* convert-llama-h5-to-gguf.py : fixes
* gguf.py : don't add empty strings
* simple : minor style changes
* gguf : use UNIX line ending
* Create convert-llama-7b-pth-to-gguf.py
* llama : sync gguf-llama.cpp with latest llama.cpp (#2608)
* llama : sync gguf-llama.cpp with latest llama.cpp
* minor : indentation + assert
* llama : refactor gguf_buffer and gguf_ctx_buffer
* llama : minor
* gitignore : add gptneox-main
* llama : tokenizer fixes (#2549)
* Merge tokenizer fixes into the gguf branch.
* Add test vocabularies
* convert : update convert-new.py with tokenizer fixes (#2614)
* Merge tokenizer fixes into the gguf branch.
* Add test vocabularies
* Adapt convert-new.py (and fix a clang-cl compiler error on windows)
* llama : sync gguf-llama with llama (#2613)
* llama : sync gguf-llama with llama
* tests : fix build + warnings (test-tokenizer-1 still fails)
* tests : fix wstring_convert
* convert : fix layer names
* llama : sync gguf-llama.cpp
* convert : update HF converter to new tokenizer voodoo magics
* llama : update tokenizer style
* convert-llama-h5-to-gguf.py : add token types
* constants.py : add token types
* gguf.py : add token types
* convert-llama-7b-pth-to-gguf.py : add token types
* gguf-llama.cpp : fix n_head_kv
* convert-llama-h5-to-gguf.py : add 70b gqa support
* gguf.py : add tensor data layout
* convert-llama-h5-to-gguf.py : add tensor data layout
* convert-llama-7b-pth-to-gguf.py : add tensor data layout
* gptneox-main.cpp : add tensor data layout
* convert-llama-h5-to-gguf.py : clarify the reverse permute
* llama : refactor model loading code (#2620)
* llama : style formatting + remove helper methods
* llama : fix quantization using gguf tool
* llama : simplify gguf_file_saver
* llama : fix method names
* llama : simplify write_header()
* llama : no need to pass full file loader to the file saver
just gguf_ctx
* llama : gguf_file_saver write I32
* llama : refactor tensor names (#2622)
* gguf: update tensor names searched in quantization
* gguf : define tensor names as constants
* gguf : initial write API (not tested yet)
* gguf : write to file API (not tested)
* gguf : initial write API ready + example
* gguf : fix header write
* gguf : fixes + simplify example + add ggml_nbytes_pad()
* gguf : minor
* llama : replace gguf_file_saver with new gguf write API
* gguf : streaming support when writing files
* gguf : remove obsolete write methods
* gguf : remove obsolete gguf_get_arr_xxx API
* llama : simplify gguf_file_loader
* llama : move hparams and vocab from gguf_file_loader to llama_model_loader
* llama : merge gguf-util.h in llama.cpp
* llama : reorder definitions in .cpp to match .h
* llama : minor simplifications
* llama : refactor llama_model_loader (WIP)
wip : remove ggml_ctx from llama_model_loader
wip : merge gguf_file_loader in llama_model_loader
* llama : fix shape prints
* llama : fix Windows build + fix norm_rms_eps key
* llama : throw error on missing KV pairs in model meta data
* llama : improve printing + log meta data
* llama : switch print order of meta data
---------
Co-authored-by: M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
* gguf : deduplicate (#2629)
* gguf : better type names
* dedup : CPU + Metal is working
* ggml : fix warnings about unused results
* llama.cpp : fix line feed and compiler warning
* llama : fix strncpy warning + note token_to_str does not write null
* llama : restore the original load/save session implementation
Will migrate this to GGUF in the future
* convert-llama-h5-to-gguf.py : support alt ctx param name
* ggml : assert when using ggml_mul with non-F32 src1
* examples : dedup simple
---------
Co-authored-by: klosax <131523366+klosax@users.noreply.github.com>
* gguf.py : merge all files in gguf.py
* convert-new.py : pick #2427 for HF 70B support
* examples/gguf : no need to keep q option for quantization any more
* llama.cpp : print actual model size
* llama.cpp : use ggml_elements()
* convert-new.py : output gguf (#2635)
* convert-new.py : output gguf (WIP)
* convert-new.py : add gguf key-value pairs
* llama : add hparams.ctx_train + no longer print ftype
* convert-new.py : minor fixes
* convert-new.py : vocab-only option should work now
* llama : fix tokenizer to use llama_char_to_byte
* tests : add new ggml-vocab-llama.gguf
* convert-new.py : tensor name mapping
* convert-new.py : add map for skipping tensor serialization
* convert-new.py : convert script now works
* gguf.py : pick some of the refactoring from #2644
* convert-new.py : minor fixes
* convert.py : update to support GGUF output
* Revert "ci : disable CI temporary to not waste energy"
This reverts commit 7e82d25f40.
* convert.py : n_head_kv optional and .gguf file extension
* convert.py : better always have n_head_kv and default it to n_head
* llama : sync with recent PRs on master
* editorconfig : ignore models folder
ggml-ci
* ci : update ".bin" to ".gguf" extension
ggml-ci
* llama : fix llama_model_loader memory leak
* gptneox : move as a WIP example
* llama : fix lambda capture
ggml-ci
* ggml : fix bug in gguf_set_kv
ggml-ci
* common.h : .bin --> .gguf
* quantize-stats.cpp : .bin --> .gguf
* convert.py : fix HF tensor permuting / unpacking
ggml-ci
* llama.cpp : typo
* llama : throw error if gguf fails to init from file
ggml-ci
* llama : fix tensor name grepping during quantization
ggml-ci
* gguf.py : write tensors in a single pass (#2644)
* gguf : single pass for writing tensors + refactoring writer
* gguf : single pass for writing tensors + refactoring writer
* gguf : single pass for writing tensors + refactoring writer
* gguf : style fixes in simple conversion script
* gguf : refactor gptneox conversion script
* gguf : rename h5 to hf (for HuggingFace)
* gguf : refactor pth to gguf conversion script
* gguf : rm file_type key and method
* gguf.py : fix vertical alignment
* gguf.py : indentation
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* convert-gptneox-hf-to-gguf.py : fixes
* gguf.py : gptneox mapping
* convert-llama-hf-to-gguf.py : fixes
* convert-llama-7b-pth-to-gguf.py : fixes
* ggml.h : reverse GGUF_MAGIC
* gguf.py : reverse GGUF_MAGIC
* test-tokenizer-0.cpp : fix warning
* llama.cpp : print kv general.name
* llama.cpp : get special token kv and linefeed token id
* llama : print number of tensors per type + print arch + style
* tests : update vocab file with new magic
* editorconfig : fix whitespaces
* llama : re-order functions
* llama : remove C++ API + reorganize common source in /common dir
* llama : minor API updates
* llama : avoid hardcoded special tokens
* llama : fix MPI build
ggml-ci
* llama : introduce enum llama_vocab_type + remove hardcoded string constants
* convert-falcon-hf-to-gguf.py : falcon HF --> gguf conversion, not tested
* falcon-main.cpp : falcon inference example
* convert-falcon-hf-to-gguf.py : remove extra kv
* convert-gptneox-hf-to-gguf.py : remove extra kv
* convert-llama-7b-pth-to-gguf.py : remove extra kv
* convert-llama-hf-to-gguf.py : remove extra kv
* gguf.py : fix for falcon 40b
* falcon-main.cpp : fix for falcon 40b
* convert-falcon-hf-to-gguf.py : update ref
* convert-falcon-hf-to-gguf.py : add tensor data layout
* cmpnct_gpt2bpe.hpp : fixes
* falcon-main.cpp : fixes
* gptneox-main.cpp : fixes
* cmpnct_gpt2bpe.hpp : remove non-general stuff
* Update examples/server/README.md
Co-authored-by: slaren <slarengh@gmail.com>
* cmpnct_gpt2bpe.hpp : cleanup
* convert-llama-hf-to-gguf.py : special tokens
* convert-llama-7b-pth-to-gguf.py : special tokens
* convert-permute-debug.py : permute debug print
* convert-permute-debug-master.py : permute debug for master
* convert-permute-debug.py : change permute type of attn_q
* convert.py : 70b model working (change attn_q permute)
* Delete convert-permute-debug-master.py
* Delete convert-permute-debug.py
* convert-llama-hf-to-gguf.py : fix attn_q permute
* gguf.py : fix rope scale kv
* convert-llama-hf-to-gguf.py : rope scale and added tokens
* convert-llama-7b-pth-to-gguf.py : rope scale and added tokens
* llama.cpp : use rope scale kv
* convert-llama-7b-pth-to-gguf.py : rope scale fix
* convert-llama-hf-to-gguf.py : rope scale fix
* py : fix whitespace
* gguf : add Python script to convert GGMLv3 LLaMA models to GGUF (#2682)
* First pass at converting GGMLv3 LLaMA models to GGUF
* Cleanups, better output during conversion
* Fix vocab space conversion logic
* More vocab conversion fixes
* Add description to converted GGUF files
* Improve help text, expand warning
* Allow specifying name and description for output GGUF
* Allow overriding vocab and hyperparams from original model metadata
* Use correct params override var name
* Fix wrong type size for Q8_K
Better handling of original style metadata
* Set default value for gguf add_tensor raw_shape KW arg
* llama : improve token type support (#2668)
* Merge tokenizer fixes into the gguf branch.
* Add test vocabularies
* Adapt convert-new.py (and fix a clang-cl compiler error on windows)
* Improved tokenizer test
But does it work on MacOS?
* Improve token type support
- Added @klosax code to convert.py
- Improved token type support in vocabulary
* Exclude platform dependent tests
* More sentencepiece compatibility by eliminating magic numbers
* Restored accidentally removed comment
* llama : add API for token type
ggml-ci
* tests : use new tokenizer type API (#2692)
* Merge tokenizer fixes into the gguf branch.
* Add test vocabularies
* Adapt convert-new.py (and fix a clang-cl compiler error on windows)
* Improved tokenizer test
But does it work on MacOS?
* Improve token type support
- Added @klosax code to convert.py
- Improved token type support in vocabulary
* Exclude platform dependent tests
* More sentencepiece compatibility by eliminating magic numbers
* Restored accidentally removed comment
* Improve commentary
* Use token type API in test-tokenizer-1.cpp
* py : cosmetics
* readme : add notice about new file format
ggml-ci
---------
Co-authored-by: M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
Co-authored-by: klosax <131523366+klosax@users.noreply.github.com>
Co-authored-by: goerch <jhr.walter@t-online.de>
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
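For orientation, here is a minimal, illustrative sketch (not code from this commit) of reading GGUF metadata with the C API that the log above introduces piece by piece (gguf_init_from_file, gguf_find_key, gguf_get_kv_type, gguf_get_tensor_name). Exact signatures may have shifted in later ggml versions.

    #include <stdio.h>
    #include "ggml.h"

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
            return 1;
        }

        struct gguf_init_params params = {
            /*.no_alloc =*/ true,  // read metadata only, do not load tensor data
            /*.ctx      =*/ NULL,
        };

        struct gguf_context * ctx = gguf_init_from_file(argv[1], params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to read %s\n", argv[1]);
            return 1;
        }

        // list every key-value pair stored in the file header
        for (int i = 0; i < gguf_get_n_kv(ctx); ++i) {
            printf("kv[%d]: %s\n", i, gguf_get_key(ctx, i));
        }

        // look up a single key by name ("general.name" is one of the standard keys)
        const int idx = gguf_find_key(ctx, "general.name");
        if (idx >= 0 && gguf_get_kv_type(ctx, idx) == GGUF_TYPE_STRING) {
            printf("general.name = %s\n", gguf_get_val_str(ctx, idx));
        }

        // list the tensors described in the file
        for (int i = 0; i < gguf_get_n_tensors(ctx); ++i) {
            printf("tensor[%d]: %s\n", i, gguf_get_tensor_name(ctx, i));
        }

        gguf_free(ctx);
        return 0;
    }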
This commit is contained in:
parent dadbed99e6
commit 6381d4e110

54 changed files with 10020 additions and 2955 deletions
llama.h | 267
--- a/llama.h
+++ b/llama.h
@@ -34,29 +34,18 @@
 #    define DEPRECATED(func, hint) func
 #endif

-#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
-#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
-#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
-#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
-#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
-#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
-
-#define LLAMA_FILE_VERSION 3
-#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
-#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
-#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 1
+#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
+
+#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
+#define LLAMA_SESSION_VERSION 1

 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif

 #ifndef LLAMA_DEFAULT_RMS_EPS
 #define LLAMA_DEFAULT_RMS_EPS 5e-6f
 #endif

 #ifdef __cplusplus
 extern "C" {
 #endif

@@ -72,6 +61,50 @@ extern "C" {

     typedef int llama_token;

+    enum llama_log_level {
+        LLAMA_LOG_LEVEL_ERROR = 2,
+        LLAMA_LOG_LEVEL_WARN  = 3,
+        LLAMA_LOG_LEVEL_INFO  = 4
+    };
+
+    enum llama_vocab_type {
+        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
+    };
+
+    enum llama_token_type {
+        LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
+        LLAMA_TOKEN_TYPE_NORMAL       = 1,
+        LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
+        LLAMA_TOKEN_TYPE_CONTROL      = 3,
+        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
+        LLAMA_TOKEN_TYPE_UNUSED       = 5,
+        LLAMA_TOKEN_TYPE_BYTE         = 6,
+    };
+
+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K          = 10,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K          = 18,// except 1d tensors
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token

@@ -86,25 +119,10 @@ extern "C" {

     typedef void (*llama_progress_callback)(float progress, void *ctx);

-    enum llama_log_level {
-        LLAMA_LOG_LEVEL_ERROR = 2,
-        LLAMA_LOG_LEVEL_WARN  = 3,
-        LLAMA_LOG_LEVEL_INFO  = 4
-    };
-
-    // Signature for logging events
-    // Note that text includes the new line character at the end for most events.
-    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
-    // if it exists.
-    // It might not exist for progress report where '.' is output repeatedly.
-    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
-
     struct llama_context_params {
         uint32_t seed;         // RNG seed, -1 for random
         int32_t  n_ctx;        // text context
         int32_t  n_batch;      // prompt processing batch size
-        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
-        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
         int32_t  n_gpu_layers; // number of layers to store in VRAM
         int32_t  main_gpu;     // the GPU that is used for scratch and small tensors

@@ -129,33 +147,18 @@ extern "C" {
         bool use_mlock; // force system to keep model in RAM
         bool embedding; // embedding mode only
     };
-
-    // model file types
-    enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32              = 0,
-        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q2_K          = 10,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q6_K          = 18,// except 1d tensors
-    };
+
+    // Signature for logging events
+    // Note that text includes the new line character at the end for most events.
+    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
+    // if it exists.
+    // It might not exist for progress report where '.' is output repeatedly.
+    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);

     // model quantization parameters
     typedef struct llama_model_quantize_params {
         int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         enum llama_ftype ftype;      // quantize to this llama_ftype
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;

@@ -208,27 +211,16 @@ extern "C" {
         int32_t n_eval;
     };

-    // Set callback for all future logging events.
-    // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+    LLAMA_API struct llama_context_params llama_context_default_params(void);
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);

-    LLAMA_API int llama_max_devices();
-
-    LLAMA_API struct llama_context_params llama_context_default_params();
-    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
-
-    LLAMA_API bool llama_mmap_supported();
-    LLAMA_API bool llama_mlock_supported();
-
     // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(bool numa);

     // Call once at the end of the program - currently only used for MPI
-    LLAMA_API void llama_backend_free();
-
-    LLAMA_API int64_t llama_time_us();
+    LLAMA_API void llama_backend_free(void);

     LLAMA_API struct llama_model * llama_load_model_from_file(
             const char * path_model,

@@ -240,17 +232,26 @@ extern "C" {
             struct llama_model * model,
             struct llama_context_params params);

     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
     LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
             const char * path_model,
             struct llama_context_params params),
             "please use llama_load_model_from_file combined with llama_new_context_with_model instead");

     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);

+    LLAMA_API int64_t llama_time_us(void);
+
+    LLAMA_API int  llama_max_devices   (void);
+    LLAMA_API bool llama_mmap_supported (void);
+    LLAMA_API bool llama_mlock_supported(void);
+
+    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+
+    LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
+    LLAMA_API int llama_model_n_ctx  (const struct llama_model * model);
+    LLAMA_API int llama_model_n_embd (const struct llama_model * model);
+
+    // Get a string describing the model type
+    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+
     // Returns 0 on success
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,

@@ -272,9 +273,9 @@ extern "C" {

     LLAMA_API int llama_model_apply_lora_from_file(
             const struct llama_model * model,
             const char * path_lora,
             const char * path_base_model,
             int n_threads);

     // Returns the number of tokens in the KV cache
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);

@@ -324,11 +325,40 @@ extern "C" {
     // IMPORTANT: do not use for anything else other than debugging and testing!
     LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);

+    // Token logits obtained from the last call to llama_eval()
+    // The logits for the last token are stored in the last row
+    // Can be mutated in order to change the probabilities of the next token
+    // Rows: n_tokens
+    // Cols: n_vocab
+    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
+
+    // Get the embeddings for the input
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+
+    //
+    // Vocab
+    //
+
+    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+
+    // Special tokens
+    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
+
+    //
+    // Tokenization
+    //
+
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
     // Returns the number of tokens on success, no more than n_max_tokens
     // Returns a negative number on failure - the number of tokens that would have been returned
     // TODO: not sure if correct
     LLAMA_API int llama_tokenize(
             struct llama_context * ctx,
             const char * text,

@@ -336,6 +366,13 @@ extern "C" {
             int n_max_tokens,
             bool add_bos);

+    LLAMA_API int llama_tokenize_bpe(
+            struct llama_context * ctx,
+            const char * text,
+            llama_token * tokens,
+            int n_max_tokens,
+            bool add_bos);
+
     LLAMA_API int llama_tokenize_with_model(
             const struct llama_model * model,
             const char * text,

@@ -343,57 +380,30 @@ extern "C" {
             int n_max_tokens,
             bool add_bos);

-    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
-    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
-
-    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
-    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
-    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
-
-    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
-
-    // Get the vocabulary as output parameters.
-    // Returns number of results.
-    LLAMA_API int llama_get_vocab(
-            const struct llama_context * ctx,
-            const char * * strings,
-            float * scores,
-            int capacity);
-
-    LLAMA_API int llama_get_vocab_from_model(
-            const struct llama_model * model,
-            const char * * strings,
-            float * scores,
-            int capacity);
-
-    // Token logits obtained from the last call to llama_eval()
-    // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
-    // Cols: n_vocab
-    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
-
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
-    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
-
     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(
+    // Does not write null terminator to the buffer
+    LLAMA_API int llama_token_to_str(
             const struct llama_context * ctx,
-            llama_token token);
+            llama_token token,
+            char * buf,
+            int length);

-    LLAMA_API const char * llama_token_to_str_with_model(
+    LLAMA_API int llama_token_to_str_bpe(
+            const struct llama_context * ctx,
+            llama_token token,
+            char * buf,
+            int length);
+
+    LLAMA_API int llama_token_to_str_with_model(
             const struct llama_model * model,
-            llama_token token);
-
-    // Special tokens
-    LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(); // end-of-sentence
-    LLAMA_API llama_token llama_token_nl();  // next-line
+            llama_token token,
+            char * buf,
+            int length);

     //
     // Grammar
     //

     LLAMA_API struct llama_grammar * llama_grammar_init(
             const llama_grammar_element ** rules,
             size_t n_rules,

@@ -401,7 +411,9 @@ extern "C" {

     LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);

+    //
     // Sampling functions
+    //

     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
     LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);

@@ -470,6 +482,10 @@ extern "C" {
     // Print system information
     LLAMA_API const char * llama_print_system_info(void);

+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+
 #ifdef __cplusplus
 }
 #endif

@@ -479,10 +495,11 @@

 #include <vector>
 #include <string>

 struct ggml_tensor;

 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);

-#endif
+#endif // LLAMA_API_INTERNAL

 #endif // LLAMA_H
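As a companion to the llama.h excerpt above, here is a hypothetical snippet exercising the reworked API visible in the hunks (model/context split, buffer-based llama_token_to_str, llama_backend_init/free). It is not part of the commit; the model path and buffer sizes are placeholders, and llama_free_model is assumed from the surrounding API.

    #include <stdio.h>
    #include "llama.h"

    int main(void) {
        llama_backend_init(false /* numa */);

        struct llama_context_params params = llama_context_default_params();

        // placeholder path; the PR switches the conventional extension from .bin to .gguf
        struct llama_model * model = llama_load_model_from_file("models/7B/ggml-model-f16.gguf", params);
        if (model == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        struct llama_context * ctx = llama_new_context_with_model(model, params);

        // the tokenizer data now comes from the GGUF file itself
        llama_token tokens[64];
        const int n_tokens = llama_tokenize(ctx, "Hello GGUF", tokens, 64, /* add_bos */ true);

        // llama_token_to_str() now writes into a caller-provided buffer
        // and does not null-terminate it
        for (int i = 0; i < n_tokens; ++i) {
            char buf[64];
            const int len = llama_token_to_str(ctx, tokens[i], buf, (int) sizeof(buf));
            if (len >= 0) {
                printf("%d -> '%.*s'\n", tokens[i], len, buf);
            }
        }

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }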