llama : refactor src/llama.cpp (#10902)
* llama : scatter llama.cpp into multiple modules (wip)
* llama : control-vector -> adapter
* llama : arch
* llama : mmap ggml-ci
* ci : remove BUILD_SHARED_LIBS=OFF ggml-ci
* llama : arch (cont) ggml-ci
* llama : chat ggml-ci
* llama : model ggml-ci
* llama : hparams ggml-ci
* llama : adapter ggml-ci
* examples : fix ggml-ci
* rebase ggml-ci
* minor
* llama : kv cache ggml-ci
* llama : impl ggml-ci
* llama : batch ggml-ci
* cont ggml-ci
* llama : context ggml-ci
* minor
* llama : context (cont) ggml-ci
* llama : model loader ggml-ci
* common : update lora ggml-ci
* llama : quant ggml-ci
* llama : quant (cont) ggml-ci
* minor [no ci]
parent 2f0ee84b9b
commit f66f582927
61 changed files with 12193 additions and 11649 deletions
src/llama-model.h (new file, 389 lines)
@@ -0,0 +1,389 @@
#pragma once

#include "llama.h"
#include "llama-arch.h"
#include "llama-hparams.h"
#include "llama-vocab.h"
#include "llama-mmap.h"

#include "ggml-cpp.h"

#include <vector>

// available models
// TODO: this enum does not follow the enum naming convention
enum llm_type {
    MODEL_UNKNOWN,
    MODEL_14M,
    MODEL_17M,
    MODEL_22M,
    MODEL_33M,
    MODEL_60M,
    MODEL_70M,
    MODEL_80M,
    MODEL_109M,
    MODEL_137M,
    MODEL_160M,
    MODEL_220M,
    MODEL_250M,
    MODEL_270M,
    MODEL_335M,
    MODEL_410M,
    MODEL_450M,
    MODEL_770M,
    MODEL_780M,
    MODEL_0_5B,
    MODEL_1B,
    MODEL_1_3B,
    MODEL_1_4B,
    MODEL_1_5B,
    MODEL_1_6B,
    MODEL_2B,
    MODEL_2_8B,
    MODEL_3B,
    MODEL_4B,
    MODEL_6B,
    MODEL_6_9B,
    MODEL_7B,
    MODEL_8B,
    MODEL_9B,
    MODEL_11B,
    MODEL_12B,
    MODEL_13B,
    MODEL_14B,
    MODEL_15B,
    MODEL_16B,
    MODEL_20B,
    MODEL_30B,
    MODEL_32B,
    MODEL_34B,
    MODEL_35B,
    MODEL_40B,
    MODEL_65B,
    MODEL_70B,
    MODEL_236B,
    MODEL_314B,
    MODEL_SMALL,
    MODEL_MEDIUM,
    MODEL_LARGE,
    MODEL_XL,
    MODEL_A1_7B,
    MODEL_A2_7B,
    MODEL_8x7B,
    MODEL_8x22B,
    MODEL_16x12B,
    MODEL_10B_128x3_66B,
    MODEL_57B_A14B,
    MODEL_27B,
};

struct llama_layer_posnet {
    // resnet
    struct ggml_tensor * norm1 = nullptr;
    struct ggml_tensor * norm1_b = nullptr;

    struct ggml_tensor * conv1 = nullptr;
    struct ggml_tensor * conv1_b = nullptr;

    struct ggml_tensor * norm2 = nullptr;
    struct ggml_tensor * norm2_b = nullptr;

    struct ggml_tensor * conv2 = nullptr;
    struct ggml_tensor * conv2_b = nullptr;

    // attention
    struct ggml_tensor * attn_norm = nullptr;
    struct ggml_tensor * attn_norm_b = nullptr;

    struct ggml_tensor * attn_q = nullptr;
    struct ggml_tensor * attn_q_b = nullptr;

    struct ggml_tensor * attn_k = nullptr;
    struct ggml_tensor * attn_k_b = nullptr;

    struct ggml_tensor * attn_v = nullptr;
    struct ggml_tensor * attn_v_b = nullptr;

    struct ggml_tensor * attn_o = nullptr;
    struct ggml_tensor * attn_o_b = nullptr;

    // normalize
    struct ggml_tensor * norm = nullptr;
    struct ggml_tensor * norm_b = nullptr;
};

struct llama_layer_convnext {
    struct ggml_tensor * dw = nullptr;
    struct ggml_tensor * dw_b = nullptr;

    struct ggml_tensor * norm = nullptr;
    struct ggml_tensor * norm_b = nullptr;

    struct ggml_tensor * pw1 = nullptr;
    struct ggml_tensor * pw1_b = nullptr;

    struct ggml_tensor * pw2 = nullptr;
    struct ggml_tensor * pw2_b = nullptr;

    struct ggml_tensor * gamma = nullptr;
};

struct llama_layer {
    // normalization
    struct ggml_tensor * attn_norm = nullptr;
    struct ggml_tensor * attn_norm_b = nullptr;
    struct ggml_tensor * attn_norm_2 = nullptr;
    struct ggml_tensor * attn_norm_2_b = nullptr;
    struct ggml_tensor * attn_q_norm = nullptr;
    struct ggml_tensor * attn_q_norm_b = nullptr;
    struct ggml_tensor * attn_k_norm = nullptr;
    struct ggml_tensor * attn_k_norm_b = nullptr;
    struct ggml_tensor * attn_out_norm = nullptr;
    struct ggml_tensor * attn_out_norm_b = nullptr;
    struct ggml_tensor * attn_q_a_norm = nullptr;
    struct ggml_tensor * attn_kv_a_norm = nullptr;
    struct ggml_tensor * attn_sub_norm = nullptr;
    struct ggml_tensor * attn_post_norm = nullptr;
    struct ggml_tensor * ffn_sub_norm = nullptr;
    struct ggml_tensor * attn_norm_cross = nullptr;
    struct ggml_tensor * attn_norm_enc = nullptr;

    // attention
    struct ggml_tensor * wq = nullptr;
    struct ggml_tensor * wk = nullptr;
    struct ggml_tensor * wv = nullptr;
    struct ggml_tensor * wo = nullptr;
    struct ggml_tensor * wqkv = nullptr;
    struct ggml_tensor * wq_a = nullptr;
    struct ggml_tensor * wq_b = nullptr;
    struct ggml_tensor * wkv_a_mqa = nullptr;
    struct ggml_tensor * wkv_b = nullptr;
    struct ggml_tensor * wq_cross = nullptr;
    struct ggml_tensor * wk_cross = nullptr;
    struct ggml_tensor * wv_cross = nullptr;
    struct ggml_tensor * wo_cross = nullptr;
    struct ggml_tensor * wq_enc = nullptr;
    struct ggml_tensor * wk_enc = nullptr;
    struct ggml_tensor * wv_enc = nullptr;
    struct ggml_tensor * wo_enc = nullptr;

    // attention bias
    struct ggml_tensor * bq = nullptr;
    struct ggml_tensor * bk = nullptr;
    struct ggml_tensor * bv = nullptr;
    struct ggml_tensor * bo = nullptr;
    struct ggml_tensor * bqkv = nullptr;

    // relative position bias
    struct ggml_tensor * attn_rel_b = nullptr;
    struct ggml_tensor * attn_rel_b_enc = nullptr;
    struct ggml_tensor * attn_rel_b_cross = nullptr;

    // normalization
    struct ggml_tensor * ffn_norm = nullptr;
    struct ggml_tensor * ffn_norm_b = nullptr;
    struct ggml_tensor * ffn_post_norm = nullptr;
    struct ggml_tensor * layer_out_norm = nullptr;
    struct ggml_tensor * layer_out_norm_b = nullptr;
    struct ggml_tensor * ffn_norm_exps = nullptr;
    struct ggml_tensor * ffn_norm_enc = nullptr;

    // ff
    struct ggml_tensor * ffn_gate = nullptr; // w1
    struct ggml_tensor * ffn_down = nullptr; // w2
    struct ggml_tensor * ffn_up = nullptr; // w3
    struct ggml_tensor * ffn_gate_enc = nullptr;
    struct ggml_tensor * ffn_down_enc = nullptr;
    struct ggml_tensor * ffn_up_enc = nullptr;

    // ff MoE
    struct ggml_tensor * ffn_gate_inp = nullptr;
    struct ggml_tensor * ffn_gate_exps = nullptr;
    struct ggml_tensor * ffn_down_exps = nullptr;
    struct ggml_tensor * ffn_up_exps = nullptr;

    // ff shared expert (shexp)
    struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
    struct ggml_tensor * ffn_gate_shexp = nullptr;
    struct ggml_tensor * ffn_down_shexp = nullptr;
    struct ggml_tensor * ffn_up_shexp = nullptr;

    // ff bias
    struct ggml_tensor * ffn_gate_b = nullptr;
    struct ggml_tensor * ffn_down_b = nullptr; // b2
    struct ggml_tensor * ffn_up_b = nullptr; // b3
    struct ggml_tensor * ffn_act = nullptr;

    // mamba proj
    struct ggml_tensor * ssm_in = nullptr;
    struct ggml_tensor * ssm_x = nullptr;
    struct ggml_tensor * ssm_dt = nullptr;
    struct ggml_tensor * ssm_out = nullptr;

    // mamba
    struct ggml_tensor * ssm_conv1d = nullptr;
    struct ggml_tensor * ssm_a = nullptr;
    struct ggml_tensor * ssm_d = nullptr;

    // mamba bias
    struct ggml_tensor * ssm_conv1d_b = nullptr;
    struct ggml_tensor * ssm_dt_b = nullptr;

    // rwkv
    struct ggml_tensor * time_mix_w1 = nullptr;
    struct ggml_tensor * time_mix_w2 = nullptr;
    struct ggml_tensor * time_mix_lerp_x = nullptr;
    struct ggml_tensor * time_mix_lerp_w = nullptr;
    struct ggml_tensor * time_mix_lerp_k = nullptr;
    struct ggml_tensor * time_mix_lerp_v = nullptr;
    struct ggml_tensor * time_mix_lerp_r = nullptr;
    struct ggml_tensor * time_mix_lerp_g = nullptr;

    struct ggml_tensor * time_mix_first = nullptr;
    struct ggml_tensor * time_mix_decay = nullptr;
    struct ggml_tensor * time_mix_decay_w1 = nullptr;
    struct ggml_tensor * time_mix_decay_w2 = nullptr;
    struct ggml_tensor * time_mix_key = nullptr;
    struct ggml_tensor * time_mix_value = nullptr;
    struct ggml_tensor * time_mix_receptance = nullptr;
    struct ggml_tensor * time_mix_gate = nullptr;

    struct ggml_tensor * time_mix_ln = nullptr;
    struct ggml_tensor * time_mix_ln_b = nullptr;
    struct ggml_tensor * time_mix_output = nullptr;

    struct ggml_tensor * channel_mix_lerp_k = nullptr;
    struct ggml_tensor * channel_mix_lerp_r = nullptr;

    struct ggml_tensor * channel_mix_key = nullptr;
    struct ggml_tensor * channel_mix_receptance = nullptr;
    struct ggml_tensor * channel_mix_value = nullptr;

    // long rope factors
    struct ggml_tensor * rope_long = nullptr;
    struct ggml_tensor * rope_short = nullptr;
    struct ggml_tensor * rope_freqs = nullptr;

    // bitnet scale
    struct ggml_tensor * wq_scale = nullptr;
    struct ggml_tensor * wk_scale = nullptr;
    struct ggml_tensor * wv_scale = nullptr;
    struct ggml_tensor * wo_scale = nullptr;
    struct ggml_tensor * ffn_gate_scale = nullptr;
    struct ggml_tensor * ffn_up_scale = nullptr;
    struct ggml_tensor * ffn_down_scale = nullptr;

    struct llama_layer_posnet posnet;

    struct llama_layer_convnext convnext;
};

struct llama_model {
    llm_type type = MODEL_UNKNOWN;
    llm_arch arch = LLM_ARCH_UNKNOWN;

    llama_ftype ftype = LLAMA_FTYPE_ALL_F32;

    std::string name = "n/a";

    llama_hparams hparams = {};
    llama_vocab vocab;

    struct ggml_tensor * tok_embd = nullptr;
    struct ggml_tensor * type_embd = nullptr;
    struct ggml_tensor * pos_embd = nullptr;
    struct ggml_tensor * tok_norm = nullptr;
    struct ggml_tensor * tok_norm_b = nullptr;

    struct ggml_tensor * output_norm = nullptr;
    struct ggml_tensor * output_norm_b = nullptr;
    struct ggml_tensor * output = nullptr;
    struct ggml_tensor * output_b = nullptr;
    struct ggml_tensor * output_norm_enc = nullptr;

    // classifier
    struct ggml_tensor * cls = nullptr;
    struct ggml_tensor * cls_b = nullptr;
    struct ggml_tensor * cls_out = nullptr;
    struct ggml_tensor * cls_out_b = nullptr;

    struct ggml_tensor * conv1d = nullptr;
    struct ggml_tensor * conv1d_b = nullptr;

    std::vector<llama_layer> layers;

    // gguf metadata
    std::unordered_map<std::string, std::string> gguf_kv;

    llama_split_mode split_mode;
    int main_gpu;
    int n_gpu_layers;

    std::vector<std::string> rpc_servers;

    // list of devices used in this model
    std::vector<ggml_backend_dev_t> devices;

    // lists of buffer types used for each layer
    using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
    buft_list_t cpu_buft_list;
    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;

    struct layer_dev {
        ggml_backend_dev_t dev;
        buft_list_t * buft_list;
    };

    layer_dev dev_input = {};
    layer_dev dev_output = {};
    std::vector<layer_dev> dev_layer;

    // contexts where the model tensors metadata is stored
    std::vector<ggml_context_ptr> ctxs;

    // the model memory buffers for the tensor data
    std::vector<ggml_backend_buffer_ptr> bufs;

    // model memory mapped files
    llama_mmaps mappings;

    // objects representing data potentially being locked in memory
    llama_mlocks mlock_bufs;
    llama_mlocks mlock_mmaps;

    // for quantize-stats only
    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

    int64_t t_load_us = 0;
    int64_t t_start_us = 0;

    // total number of parameters in the model
    uint64_t n_elements = 0;

    // total size of all the tensors in the model in bytes
    size_t n_bytes = 0;
};

const char * llm_type_name(llm_type type);

std::string llama_model_arch_name (const llama_model & model);
std::string llama_model_type_name (const llama_model & model);
std::string llama_model_ftype_name(const llama_model & model);

// used by llama_adapter_cvec
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);

// used by llama_adapter_lora
struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name);

size_t llama_model_max_nodes(const llama_model & model);

struct llama_model_loader;

// TODO: become llama_model methods
void llm_load_stats     (llama_model_loader & ml, llama_model & model);
void llm_load_arch      (llama_model_loader & ml, llama_model & model);
void llm_load_hparams   (llama_model_loader & ml, llama_model & model);
void llm_load_vocab     (llama_model_loader & ml, llama_model & model);
void llm_load_print_meta(llama_model_loader & ml, llama_model & model);