adding new header for llama internal

mike dupont 2023-11-24 08:31:12 -05:00
parent e34fffc77b
commit ebea708561
4 changed files with 590 additions and 47 deletions

.gitignore vendored (2 changes)

@@ -100,3 +100,5 @@ tests/test-tokenizer-0-falcon
 tests/test-tokenizer-1-llama
 tests/test-tokenizer-1-bpe
 /#llama.cpp#
+#*
+\#*


@@ -108,10 +108,7 @@ int main(int argc, char ** argv) {
 g_params = &params;
 //using Td = type_descriptor<gpt_params>;
-print_fields(params);
-//constexpr auto tbl = descriptor::get_attribute<gpt_params>(Td{});
-//constexpr auto tbl_name = REFL_MAKE_CONST_STRING(tbl.name);
 if (!gpt_params_parse(argc, argv, params)) {
     return 1;
@@ -126,6 +123,7 @@ int main(int argc, char ** argv) {
 // TODO: Dump params ?
 //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
+print_fields(params);
 // save choice to use color for later
 // (note for later: this is a slightly awkward choice)
@@ -184,10 +182,6 @@ int main(int argc, char ** argv) {
 g_model = &model;
 g_ctx = &ctx;
-print_fields(*model);
-print_fields(*ctx);
-print_fields(*ctx_guidance);
 // load the model and apply lora adapter, if any
 LOG("%s: load the model and apply lora adapter, if any\n", __func__);
 std::tie(model, ctx) = llama_init_from_gpt_params(params);
@@ -247,7 +241,7 @@ int main(int argc, char ** argv) {
 std::vector<llama_token> embd_inp;
+print_fields(*model);
 if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
     LOG("tokenize the prompt\n");
@@ -268,7 +262,7 @@ int main(int argc, char ** argv) {
 embd_inp.push_back(llama_token_bos(model));
 LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
 }
+//print_fields(embd_inp);
 // Tokenize negative prompt
 std::vector<llama_token> guidance_inp;
 int guidance_offset = 0;
@@ -292,7 +286,8 @@ int main(int argc, char ** argv) {
 LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
 return 1;
 }
+print_fields(*ctx);
+print_fields(*ctx_guidance);
 //print_fields(session_tokens);
 // debug message about similarity of saved session, if applicable
 size_t n_matching_session_tokens = 0;
@@ -643,7 +638,7 @@ int main(int argc, char ** argv) {
 }
 const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
+print_fields(id);
 llama_sampling_accept(ctx_sampling, ctx, id, true);
 LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());

llama-internal.hpp (new file, 518 lines)

@@ -0,0 +1,518 @@
// headers assumed available for the declarations below (standard library, ggml, llama.h)
#include <cstdint>
#include <map>
#include <memory>
#include <random>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "ggml.h"
#include "ggml-alloc.h"
#include "llama.h"
enum llm_arch {
LLM_ARCH_LLAMA,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
LLM_ARCH_GPT2,
LLM_ARCH_GPTJ,
LLM_ARCH_GPTNEOX,
LLM_ARCH_MPT,
LLM_ARCH_STARCODER,
LLM_ARCH_PERSIMMON,
LLM_ARCH_REFACT,
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
LLM_ARCH_UNKNOWN,
};
enum llm_kv {
LLM_KV_GENERAL_ARCHITECTURE,
LLM_KV_GENERAL_QUANTIZATION_VERSION,
LLM_KV_GENERAL_ALIGNMENT,
LLM_KV_GENERAL_NAME,
LLM_KV_GENERAL_AUTHOR,
LLM_KV_GENERAL_URL,
LLM_KV_GENERAL_DESCRIPTION,
LLM_KV_GENERAL_LICENSE,
LLM_KV_GENERAL_SOURCE_URL,
LLM_KV_GENERAL_SOURCE_HF_REPO,
LLM_KV_CONTEXT_LENGTH,
LLM_KV_EMBEDDING_LENGTH,
LLM_KV_BLOCK_COUNT,
LLM_KV_FEED_FORWARD_LENGTH,
LLM_KV_USE_PARALLEL_RESIDUAL,
LLM_KV_TENSOR_DATA_LAYOUT,
LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
LLM_KV_ATTENTION_CLAMP_KQV,
LLM_KV_ATTENTION_LAYERNORM_EPS,
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE,
LLM_KV_ROPE_SCALING_FACTOR,
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
LLM_KV_ROPE_SCALING_FINETUNED,
LLM_KV_TOKENIZER_MODEL,
LLM_KV_TOKENIZER_LIST,
LLM_KV_TOKENIZER_TOKEN_TYPE,
LLM_KV_TOKENIZER_SCORES,
LLM_KV_TOKENIZER_MERGES,
LLM_KV_TOKENIZER_BOS_ID,
LLM_KV_TOKENIZER_EOS_ID,
LLM_KV_TOKENIZER_UNK_ID,
LLM_KV_TOKENIZER_SEP_ID,
LLM_KV_TOKENIZER_PAD_ID,
LLM_KV_TOKENIZER_ADD_BOS,
LLM_KV_TOKENIZER_ADD_EOS,
LLM_KV_TOKENIZER_HF_JSON,
LLM_KV_TOKENIZER_RWKV,
};
// available llama models
enum e_model {
MODEL_UNKNOWN,
MODEL_1B,
MODEL_3B,
MODEL_7B,
MODEL_8B,
MODEL_13B,
MODEL_15B,
MODEL_30B,
MODEL_34B,
MODEL_40B,
MODEL_65B,
MODEL_70B,
};
enum llama_fver {
GGUF_FILE_VERSION_V1 = 1,
GGUF_FILE_VERSION_V2 = 2,
GGUF_FILE_VERSION_V3 = 3,
};
struct LLM_KV {
LLM_KV(llm_arch arch) : arch(arch) {}
llm_arch arch;
std::string operator()(llm_kv kv) const; // moved to llama.cpp file
};
enum llm_tensor {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_TOKEN_EMBD_NORM,
LLM_TENSOR_POS_EMBD,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
};
struct llama_cparams {
uint32_t n_ctx; // context size used during inference
uint32_t n_batch;
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
float rope_freq_base;
float rope_freq_scale;
uint32_t n_yarn_orig_ctx;
// These hyperparameters are not exposed in GGUF, because all
// existing YaRN models use the same values for them.
float yarn_ext_factor;
float yarn_attn_factor;
float yarn_beta_fast;
float yarn_beta_slow;
bool mul_mat_q;
};
struct llama_layer {
// normalization
struct ggml_tensor * attn_norm;
struct ggml_tensor * attn_norm_b;
struct ggml_tensor * attn_norm_2;
struct ggml_tensor * attn_norm_2_b;
struct ggml_tensor * attn_q_norm;
struct ggml_tensor * attn_q_norm_b;
struct ggml_tensor * attn_k_norm;
struct ggml_tensor * attn_k_norm_b;
// attention
struct ggml_tensor * wq;
struct ggml_tensor * wk;
struct ggml_tensor * wv;
struct ggml_tensor * wo;
struct ggml_tensor * wqkv;
// attention bias
struct ggml_tensor * bo;
struct ggml_tensor * bqkv;
// normalization
struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b;
// ff
struct ggml_tensor * ffn_gate; // w1
struct ggml_tensor * ffn_down; // w2
struct ggml_tensor * ffn_up; // w3
// ff bias
struct ggml_tensor * ffn_down_b; // b2
struct ggml_tensor * ffn_up_b; // b3
};
struct llama_kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
std::set<llama_seq_id> seq_id;
bool has_seq_id(const llama_seq_id & id) const {
return seq_id.find(id) != seq_id.end();
}
};
struct llama_buffer {
void * data = NULL;
size_t size = 0;
// fallback to malloc / free
// useful in cases where CUDA can try to allocate PINNED memory
bool fallback = false;
void resize(size_t n) ;
~llama_buffer();
};
// ring-buffer of cached KV data
struct llama_kv_cache {
bool has_shift = false;
// Note: The value of head isn't only used to optimize searching
// for a free KV slot. llama_decode_internal also uses it, so it
// cannot be freely changed after a slot has been allocated.
uint32_t head = 0;
uint32_t size = 0;
// computed before each graph build
uint32_t n = 0;
std::vector<llama_kv_cell> cells;
struct ggml_tensor * k = NULL;
struct ggml_tensor * v = NULL;
struct ggml_context * ctx = NULL;
llama_buffer buf;
~llama_kv_cache() {
if (ctx) {
ggml_free(ctx);
}
#ifdef GGML_USE_CUBLAS
if (ggml_cublas_loaded()) {
ggml_cuda_free_data(k);
ggml_cuda_free_data(v);
}
#endif
}
};
struct llama_vocab {
using id = int32_t;
using token = std::string;
using ttype = llama_token_type;
struct token_data {
token text;
float score;
ttype type;
};
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
std::unordered_map<token, id> special_tokens_cache;
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
// default LLaMA special tokens
id special_bos_id = 1;
id special_eos_id = 2;
id special_unk_id = 0;
id special_sep_id = -1;
id special_pad_id = -1;
int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
id linefeed_id = 13;
id special_prefix_id = 32007;
id special_middle_id = 32009;
id special_suffix_id = 32008;
id special_eot_id = 32010;
int find_bpe_rank(std::string token_left, std::string token_right) const {
GGML_ASSERT(token_left.find(" ") == std::string::npos);
GGML_ASSERT(token_left.find("\n") == std::string::npos);
GGML_ASSERT(token_right.find(" ") == std::string::npos);
GGML_ASSERT(token_right.find("\n") == std::string::npos);
auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
if (it == bpe_ranks.end()) {
return -1;
}
return it->second;
}
};
struct llama_mmap {
void * addr;
size_t size;
llama_mmap(const llama_mmap &) = delete;
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false);
~llama_mmap();
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
#else
static constexpr bool SUPPORTED = false;
#endif
};
struct llama_hparams {
bool vocab_only;
uint32_t n_vocab;
uint32_t n_ctx_train; // context size the model was trained on
uint32_t n_embd;
uint32_t n_head;
uint32_t n_head_kv;
uint32_t n_layer;
uint32_t n_rot;
uint32_t n_ff;
float f_norm_eps;
float f_norm_rms_eps;
float rope_freq_base_train;
float rope_freq_scale_train;
uint32_t n_yarn_orig_ctx;
int8_t rope_scaling_type_train : 3;
bool rope_finetuned : 1;
float f_clamp_kqv;
float f_max_alibi_bias;
bool operator!=(const llama_hparams & other) const;
uint32_t n_gqa() const {
return n_head/n_head_kv;
}
uint32_t n_embd_head() const {
return n_embd/n_head;
}
uint32_t n_embd_gqa() const {
return n_embd/n_gqa();
}
};
struct llama_mlock {
void * addr = NULL;
size_t size = 0;
bool failed_already = false;
llama_mlock() ;
llama_mlock(const llama_mlock &) = delete;
~llama_mlock();
void init(void * ptr);
void grow_to(size_t target_size);
#ifdef _POSIX_MEMLOCK_RANGE
static constexpr bool SUPPORTED = true;
static size_t lock_granularity();
#ifdef __APPLE__
#define MLOCK_SUGGESTION \
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
#endif
bool raw_lock(const void * addr, size_t size) const ;
#undef MLOCK_SUGGESTION
static void raw_unlock(void * addr, size_t size);
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
static size_t lock_granularity();
bool raw_lock(void * ptr, size_t len) const ;
static void raw_unlock(void * ptr, size_t len);
#else
static constexpr bool SUPPORTED = false;
static size_t lock_granularity();
bool raw_lock(const void * addr, size_t len) const;
static void raw_unlock(const void * addr, size_t len);
#endif
};
struct llama_model {
e_model type = MODEL_UNKNOWN;
llm_arch arch = LLM_ARCH_UNKNOWN;
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
std::string name = "n/a";
llama_hparams hparams = {};
llama_vocab vocab;
struct ggml_tensor * tok_embd;
struct ggml_tensor * pos_embd;
struct ggml_tensor * tok_norm;
struct ggml_tensor * tok_norm_b;
struct ggml_tensor * output_norm;
struct ggml_tensor * output_norm_b;
struct ggml_tensor * output;
std::vector<llama_layer> layers;
int n_gpu_layers;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
// context
struct ggml_context * ctx = NULL;
// the model memory buffer
llama_buffer buf;
// model memory mapped file
std::unique_ptr<llama_mmap> mapping;
// objects representing data potentially being locked in memory
llama_mlock mlock_buf;
llama_mlock mlock_mmap;
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
int64_t t_load_us = 0;
int64_t t_start_us = 0;
~llama_model() {
if (ctx) {
ggml_free(ctx);
}
#ifdef GGML_USE_CUBLAS
if (ggml_cublas_loaded()) {
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cuda_free_data(tensors_by_name[i].second);
}
ggml_cuda_free_scratch();
}
#endif
#if defined(GGML_USE_CLBLAST)
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cl_free_data(tensors_by_name[i].second);
}
#endif
}
};
struct llama_context {
llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
~llama_context();
llama_cparams cparams;
const llama_model & model;
// key + value cache for the self attention
struct llama_kv_cache kv_self;
std::mt19937 rng;
bool has_evaluated_once = false;
int64_t t_start_us;
int64_t t_load_us;
int64_t t_sample_us = 0;
int64_t t_p_eval_us = 0;
int64_t t_eval_us = 0;
int32_t n_sample = 0; // number of tokens sampled
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
int32_t n_eval = 0; // number of eval calls
// decode output (2-dimensional array: [n_tokens][n_vocab])
std::vector<float> logits;
bool logits_all = false;
// input embedding (1-dimensional array: [n_embd])
std::vector<float> embedding;
// reusable buffer for `struct ggml_graph_plan.work_data`
std::vector<uint8_t> work_buffer;
// memory buffers used to evaluate the model
llama_buffer buf_compute;
llama_buffer buf_alloc;
ggml_allocr * alloc = NULL;
#ifdef GGML_USE_METAL
ggml_metal_context * ctx_metal = NULL;
#endif
#ifdef GGML_USE_MPI
ggml_mpi_context * ctx_mpi = NULL;
#endif
};
struct LLM_TN {
LLM_TN(llm_arch arch) ;
llm_arch arch;
std::string operator()(llm_tensor tensor) const;
std::string operator()(llm_tensor tensor, const std::string & suffix) const ;
std::string operator()(llm_tensor tensor, int bid) const ;
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const ;
};
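For orientation, here is a small, self-contained sketch (not part of the commit; the types, names, and numbers below are illustrative stand-ins) of how two of the structures declared above are typically exercised: the grouped-query-attention arithmetic exposed by llama_hparams (n_gqa, n_embd_head, n_embd_gqa) and the per-cell sequence bookkeeping that llama_kv_cache performs through llama_kv_cell::has_seq_id.

// Illustrative sketch only: mirrors llama_hparams' helper math and the
// llama_kv_cell sequence check with stand-in types and made-up values.
#include <cstdint>
#include <iostream>
#include <set>
#include <vector>

struct hparams_sketch {                 // stand-in for llama_hparams
    uint32_t n_embd    = 4096;
    uint32_t n_head    = 32;
    uint32_t n_head_kv = 8;             // fewer KV heads than query heads -> grouped-query attention
    uint32_t n_gqa()       const { return n_head / n_head_kv; }   // 4 query heads share one KV head
    uint32_t n_embd_head() const { return n_embd / n_head; }      // 128 = per-head embedding width
    uint32_t n_embd_gqa()  const { return n_embd / n_gqa(); }     // 1024 = row width of the KV cache
};

struct kv_cell_sketch {                 // stand-in for llama_kv_cell
    int32_t pos = -1;
    std::set<int32_t> seq_id;
    bool has_seq_id(int32_t id) const { return seq_id.count(id) > 0; }
};

int main() {
    const hparams_sketch hp;
    std::cout << "n_gqa=" << hp.n_gqa()
              << " n_embd_head=" << hp.n_embd_head()
              << " n_embd_gqa=" << hp.n_embd_gqa() << "\n";

    // Scan a tiny cache and count the cells that belong to sequence 0,
    // the same kind of walk the KV-cache bookkeeping does over `cells`.
    std::vector<kv_cell_sketch> cells(4);
    cells[0].seq_id.insert(0);
    cells[1].seq_id.insert(0);
    cells[2].seq_id.insert(1);
    int used = 0;
    for (const auto & c : cells) {
        if (c.has_seq_id(0)) ++used;
    }
    std::cout << "cells used by sequence 0: " << used << "\n";
    return 0;
}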


@@ -56,9 +56,9 @@ REFL_FIELD(prompt_file )
 REFL_FIELD(path_prompt_cache )
 REFL_FIELD(input_prefix )
 REFL_FIELD(input_suffix )
-REFL_FIELD( antiprompt)
+//REFL_FIELD( antiprompt)
 REFL_FIELD(logdir )
-REFL_FIELD( lora_adapter)
+//REFL_FIELD( lora_adapter)
 REFL_FIELD(lora_base )
 REFL_FIELD( ppl_stride )
 REFL_FIELD( ppl_output_type )
@@ -92,14 +92,22 @@ REFL_FIELD( image)
 REFL_END
+REFL_TYPE(llama_sampling_params)
+REFL_END
+REFL_TYPE(llama_buffer)
+REFL_END
+REFL_TYPE(llm_arch)
+REFL_END
 REFL_TYPE(llama_sampling_context )
 REFL_FIELD( params)
 REFL_FIELD( mirostat_mu)
 REFL_FIELD( grammar)
 REFL_FIELD( parsed_grammar)
-REFL_FIELD( prev)
-REFL_FIELD( cur)
+//REFL_FIELD( prev) vector of ints
+//REFL_FIELD( cur)
 REFL_END
 REFL_TYPE(llama_token_data )
@@ -333,10 +341,10 @@ REFL_END
 // REFL_END
-//REFL_TYPE(llama_cparams)
-// REFL_FIELD(n_ctx)
-// REFL_FIELD(n_batch)
-//REFL_END
+REFL_TYPE(llama_cparams)
+REFL_FIELD(n_ctx)
+REFL_FIELD(n_batch)
+REFL_END
 //REFL_TYPE(llama_layer)
 // REFL_FIELD(attn_norm)
@@ -348,14 +356,20 @@ REFL_END
 // REFL_FIELD(delta)
 // REFL_END
-// REFL_TYPE(llama_kv_cache)
-// REFL_FIELD(has_shift)
-// REFL_FIELD(head)
-// REFL_END
+REFL_TYPE(llama_kv_cache)
+REFL_FIELD(has_shift)
+REFL_FIELD(head)
+REFL_END
 // REFL_TYPE(llama_vocab)
 // REFL_END
+REFL_TYPE(e_model)
+REFL_END
+REFL_TYPE(llama_ftype)
+REFL_END
 REFL_TYPE(llama_model)
 REFL_FIELD(type)
 REFL_FIELD(arch)
@@ -375,27 +389,40 @@ REFL_FIELD( output_norm)
 REFL_FIELD( output_norm_b)
 REFL_FIELD( output)
-REFL_FIELD( layers)
+//REFL_FIELD( layers)
 REFL_FIELD( n_gpu_layers)
-REFL_FIELD( gguf_kv)
+//REFL_FIELD( gguf_kv) unordered map
 REFL_FIELD( ctx)
 REFL_FIELD( buf)
-REFL_FIELD( mapping)
-REFL_FIELD( mlock_buf)
-REFL_FIELD( mlock_mmap)
-REFL_FIELD( tensors_by_name)
+//REFL_FIELD( mapping) std::unique_ptr
+//REFL_FIELD( mlock_buf)
+//REFL_FIELD( mlock_mmap)
+//REFL_FIELD( tensors_by_name)
 REFL_FIELD( t_load_us)
 REFL_FIELD( t_start_us)
 REFL_END
+REFL_TYPE(llama_hparams)
+REFL_END
+//REFL_TYPE(std::vector<int> >)
+//REFL_END
+REFL_TYPE(llama_vocab)
+REFL_END
+REFL_TYPE(grammar_parser::parse_state)
+REFL_END
 REFL_TYPE(llama_context)
 REFL_FIELD( cparams)
 //REFL_FIELD(model)
 REFL_FIELD(kv_self)
-REFL_FIELD(rng)
+//REFL_FIELD(rng) random numbers
 REFL_FIELD(has_evaluated_once )
 REFL_FIELD( t_start_us)
 REFL_FIELD( t_load_us)
@@ -405,10 +432,10 @@ REFL_FIELD( t_p_eval_us )
 REFL_FIELD( n_sample )
 REFL_FIELD( n_p_eval )
 REFL_FIELD( n_eval )
-REFL_FIELD( logits)
+//REFL_FIELD( logits)
 REFL_FIELD( logits_all )
-REFL_FIELD( embedding)
-REFL_FIELD( work_buffer)
+//REFL_FIELD( embedding)
+//REFL_FIELD( work_buffer)
 REFL_FIELD( buf_compute)
 REFL_FIELD( buf_alloc)
 REFL_FIELD( alloc )
@@ -524,7 +551,7 @@ struct hidden : refl::attr::usage::field {};
 // // A generic function to print out the fields of any object
 template<typename T>
-void print_fields(const T& ) {
+void print_fields(const T& t) {
 // // Get the type descriptor of the object
 constexpr auto type = refl::reflect<T>();
@@ -533,20 +560,21 @@ void print_fields(const T& ) {
 std::cout << "DEBUG Type: " << type.name.c_str() << "\n";
 // T instance{};
-for_each(refl::reflect<T>().members, [&](auto member) {
-std::cout << "MEMBER:" << member.name.str() << "\n";
-});
+//for_each(refl::reflect<T>().members, [&](auto member) {
+//std::cout << "MEMBER:" << member.name.str() << "\n";
+//});
 refl::util::for_each(type.members, [&](auto member) {
-// // Check if the member is a field and not hidden
-//if ((refl::descriptor::is_field(member)) && (!member.has_attribute<hidden>()))) {
-//if ((refl::descriptor::is_field(member))) {
-// // Print the member name and value
-std::cout
-<< "Auto:" << member.name << ": " << "\n";
-// refl::get(member, obj)
+auto member1 = member(t);
+//if(member1){
+std::cout << "Auto:" << member.name <<"\n";
+print_fields(member1);
+//}
+//else {
+//std::cout << "NULL:" << member.name <<"\n";
 //}
 });
 std::cout << "\n";
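The patched print_fields drives refl-cpp's compile-time member list: member(t) reads a field's value from the instance, and the recursive print_fields(member1) call is why every member type (llama_buffer, llm_arch, e_model, llama_ftype, and so on) needs its own REFL_TYPE registration, and why container-valued fields such as the vectors and maps are commented out. Below is a minimal, hypothetical usage sketch, assuming refl.hpp (refl-cpp) is on the include path; the sampling_opts struct and its fields are invented for illustration and are not part of the commit.

// Sketch of registering a type with refl-cpp and walking its members,
// in the same style as the REFL_TYPE/REFL_FIELD/REFL_END blocks above.
#include <iostream>
#include "refl.hpp"

struct sampling_opts {     // hypothetical struct standing in for gpt_params and friends
    int   top_k = 40;
    float temp  = 0.8f;
};

REFL_TYPE(sampling_opts)
REFL_FIELD(top_k)
REFL_FIELD(temp)
REFL_END

template <typename T>
void print_fields_sketch(const T & t) {
    constexpr auto type = refl::reflect<T>();
    std::cout << "Type: " << type.name.c_str() << "\n";
    refl::util::for_each(type.members, [&](auto member) {
        // member(t) reads the field's value from the instance, as in the patched loop
        std::cout << "  " << member.name.c_str() << " = " << member(t) << "\n";
    });
}

int main() {
    print_fields_sketch(sampling_opts{});   // prints top_k and temp with their default values
    return 0;
}

Compiled with the struct above, this prints the type name followed by "top_k = 40" and "temp = 0.8", which is the flat, non-recursive version of what the patched print_fields does for the llama structures registered in this file.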