mike dupont 2023-11-23 09:55:19 -05:00
parent a08640c00d
commit df647db611
17 changed files with 568 additions and 182 deletions


@@ -1,3 +1,6 @@
+tt:
+	clang++ -std=c++17 ggml.cpp
+
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
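
The new tt target builds ggml.cpp with clang++ in C++17 mode, which is consistent with the initializer rewrites in the hunks below: C99-style designated initializers are only standardized in C++20, and C++17 compilers reject them or accept them only as a warning-generating extension. A minimal sketch of the language issue (not from the commit; example_params and make_params are illustrative names only):

#include <cstddef>

// Stand-in for a ggml_init_params / gguf_init_params style aggregate.
struct example_params {
    std::size_t mem_size;
    void *      mem_buffer;
    bool        no_alloc;
};

example_params make_params() {
    // C11 spelling, rejected (or warned about) by clang++ -std=c++17:
    //   return { .mem_size = 1024, .mem_buffer = NULL, .no_alloc = true };
    // Portable C++17 spelling: positional aggregate initialization.
    return example_params{1024, nullptr, true};
}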


@@ -1527,11 +1527,14 @@ int main(int argc, char ** argv) {
     std::vector<uint8_t> work_buffer;
     for (int ex=0; ex<n_examples; ++ex) {
-        struct ggml_init_params params = {
-            .mem_size   = compute_size,
-            .mem_buffer = compute_addr,
-            .no_alloc   = false,
-        };
+        struct ggml_init_params params(
+            //.mem_size =
+            compute_size,
+            //.mem_buffer =
+            compute_addr,
+            //.no_alloc =
+            false
+        );
         struct ggml_context * ctx0 = ggml_init(params);
@@ -1602,11 +1605,14 @@ int main(int argc, char ** argv) {
         }
         printf("---\n");
         for (int i=0; i<n_gen; ++i) {
-            struct ggml_init_params params = {
-                .mem_size   = compute_size,
-                .mem_buffer = compute_addr,
-                .no_alloc   = false,
-            };
+            struct ggml_init_params params(
+                //.mem_size =
+                compute_size,
+                //.mem_buffer =
+                compute_addr,
+                //.no_alloc =
+                false
+            );
             struct ggml_context * ctx0 = ggml_init(params);
             ggml_cgraph gf = {};


@@ -140,11 +140,14 @@ int main(int argc, char ** argv) {
     printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
-    struct ggml_init_params params = {
-        .mem_size   = ctx_size,
-        .mem_buffer = NULL,
-        .no_alloc   = 0
-    };
+    struct ggml_init_params params(
+        //.mem_size =
+        ctx_size,
+        //.mem_buffer =
+        NULL,
+        //.no_alloc =
+        0
+    );
     ctx = ggml_init(params);
     if (!ctx) {


@@ -553,10 +553,12 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
     if (is_ggml_file(filename)) {
         struct ggml_context * ctx_data = NULL;
-        struct gguf_init_params params = {
-            .no_alloc = false,
-            .ctx      = &ctx_data,
-        };
+        struct gguf_init_params params(
+            //.no_alloc =
+            false,
+            //.ctx =
+            &ctx_data
+        );
         struct gguf_context * ctx = gguf_init_from_file(filename, params);
         GGML_ASSERT(ctx != NULL);


@@ -389,9 +389,11 @@ static void export_lora(struct export_lora_params * params) {
     // open base model gguf, read tensors without their data
     struct ggml_context * ctx_in;
-    struct gguf_init_params params_gguf;
-    params_gguf.no_alloc = true;
-    params_gguf.ctx = &ctx_in;
+    struct gguf_init_params params_gguf(
+        //params_gguf.no_alloc =
+        true,
+        //params_gguf.ctx =
+        &ctx_in);
     struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
     // create new gguf


@@ -294,10 +294,12 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
     // get parameters directly from gguf file
     {
-        struct gguf_init_params params = {
-            .no_alloc = false,
-            .ctx      = NULL,
-        };
+        struct gguf_init_params params(
+            //.no_alloc =
+            false,
+            //.ctx =
+            NULL
+        );
         struct gguf_context * mctx = gguf_init_from_file(fn_model, params);
         load_model_hparams_gguf(mctx, &hparams, "llama");
@@ -991,9 +993,11 @@ static void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llam
 static bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
     struct ggml_context * f_ggml_ctx;
-    struct gguf_init_params params;
-    params.no_alloc = false;
-    params.ctx = &f_ggml_ctx;
+    struct gguf_init_params params(
+        //params.no_alloc =
+        false,
+        //params.ctx =
+        &f_ggml_ctx);
     struct gguf_context * fctx = gguf_init_from_file(filename, params);
     if (fctx == NULL) {
         return false;
@@ -1708,11 +1712,14 @@ int main(int argc, char ** argv) {
     std::vector<uint8_t> mem_compute_data;
     // context for input tensors without their data
-    struct ggml_init_params ctx_input_params = {
-        .mem_size   = ggml_tensor_overhead() * 2, // mem_size
-        .mem_buffer = NULL,                       // mem_buffer
-        .no_alloc   = true,                       // no_alloc
-    };
+    struct ggml_init_params ctx_input_params(
+        //.mem_size =
+        ggml_tensor_overhead() * 2, // mem_size
+        //.mem_buffer =
+        NULL, // mem_buffer
+        //.no_alloc =
+        true // no_alloc
+    );
     struct ggml_context * ctx_input = ggml_init(ctx_input_params);
     // the input tensors
@@ -1737,11 +1744,14 @@ int main(int argc, char ** argv) {
         2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
         (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
     );
-    struct ggml_init_params ctx_compute_params = {
-        .mem_size   = estimated_compute_size_wo_data, // mem_size
-        .mem_buffer = NULL,                           // mem_buffer
-        .no_alloc   = true,                           // no_alloc
-    };
+    struct ggml_init_params ctx_compute_params(
+        //.mem_size =
+        estimated_compute_size_wo_data, // mem_size
+        //.mem_buffer =
+        NULL, // mem_buffer
+        //.no_alloc =
+        true // no_alloc
+    );
     struct ggml_context * ctx_compute = NULL;
     struct ggml_tensor * loss = NULL;
@@ -1904,11 +1914,14 @@ int main(int argc, char ** argv) {
     printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
     // context for work buffer
-    struct ggml_init_params ctx_work_params = {
-        .mem_size   = max_work_size, // mem_size
-        .mem_buffer = NULL,          // mem_buffer
-        .no_alloc   = false,         // no_alloc
-    };
+    struct ggml_init_params ctx_work_params(
+        //.mem_size =
+        max_work_size, // mem_size
+        //.mem_buffer =
+        NULL, // mem_buffer
+        //.no_alloc =
+        false // no_alloc
+    );
     struct ggml_context * ctx_work = ggml_init(ctx_work_params);
     int64_t t0 = ggml_time_ms();


@@ -255,11 +255,14 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
     const auto & buf_compute = ctx->buf_compute;
-    struct ggml_init_params params = {
-        .mem_size   = buf_compute.size,
-        .mem_buffer = buf_compute.data,
-        .no_alloc   = false,
-    };
+    struct ggml_init_params params(
+        //.mem_size =
+        buf_compute.size,
+        //.mem_buffer =
+        buf_compute.data,
+        //.no_alloc =
+        false
+    );
     params.no_alloc = true;
@@ -455,10 +458,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     struct ggml_context * meta = NULL;
-    struct gguf_init_params params = {
-        .no_alloc = true,
-        .ctx      = &meta,
-    };
+    struct gguf_init_params params(
+        //.no_alloc =
+        true,
+        //.ctx =
+        &meta);
     struct gguf_context * ctx = gguf_init_from_file(fname, params);
     if (!ctx) {
@@ -552,11 +557,14 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     // load tensors
     {
-        struct ggml_init_params params = {
-            .mem_size   = ctx_size,
-            .mem_buffer = NULL,
-            .no_alloc   = false,
-        };
+        struct ggml_init_params params(
+            //.mem_size =
+            ctx_size,
+            //.mem_buffer =
+            NULL,
+            //.no_alloc =
+            false
+        );
         new_clip->ctx = ggml_init(params);
         if (!new_clip->ctx) {


@@ -600,10 +600,12 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo
     // set vocab by copying from vocab_model gguf file
     {
-        struct gguf_init_params params = {
-            .no_alloc = false,
-            .ctx      = NULL,
-        };
+        struct gguf_init_params params(
+            //.no_alloc =
+            false,
+            //.ctx =
+            NULL
+        );
         struct gguf_context * vctx = gguf_init_from_file(fn_vocab_model, params);
         const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST));
@@ -745,9 +747,11 @@ static void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_voc
 static bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct train_state * train) {
     struct ggml_context * f_ggml_ctx;
-    struct gguf_init_params params;
-    params.no_alloc = false;
-    params.ctx = &f_ggml_ctx;
+    struct gguf_init_params params(
+        //params.no_alloc =
+        false,
+        //params.ctx =
+        &f_ggml_ctx);
     struct gguf_context * fctx = gguf_init_from_file(filename, params);
     if (fctx == NULL) {
         return false;
@@ -1085,11 +1089,14 @@ int main(int argc, char ** argv) {
     ggml_allocr * alloc = NULL;
     // context for input tensors without their data
-    struct ggml_init_params ctx_input_params = {
-        .mem_size   = ggml_tensor_overhead() * 2, // mem_size
-        .mem_buffer = NULL,                       // mem_buffer
-        .no_alloc   = true,                       // no_alloc
-    };
+    struct ggml_init_params ctx_input_params (
+        //.mem_size =
+        ggml_tensor_overhead() * 2, // mem_size
+        // .mem_buffer =
+        NULL, // mem_buffer
+        // .no_alloc =
+        true // no_alloc
+    );
     struct ggml_context * ctx_input = ggml_init(ctx_input_params);
     // the input tensors
@@ -1114,11 +1121,14 @@ int main(int argc, char ** argv) {
         2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
         (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
     );
-    struct ggml_init_params ctx_compute_params = {
-        .mem_size   = estimated_compute_size_wo_data, // mem_size
-        .mem_buffer = NULL,                           // mem_buffer
-        .no_alloc   = true,                           // no_alloc
-    };
+    struct ggml_init_params ctx_compute_params(
+        // .mem_size =
+        estimated_compute_size_wo_data, // mem_size
+        //.mem_buffer=
+        NULL, // mem_buffer
+        //.no_alloc =
+        true // no_alloc
+    );
     struct ggml_context * ctx_compute = NULL;
     struct ggml_tensor * loss = NULL;
@@ -1267,11 +1277,14 @@ int main(int argc, char ** argv) {
     printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
     // context for work buffer
-    struct ggml_init_params ctx_work_params = {
-        .mem_size   = max_work_size, //
-        .mem_buffer = NULL,          //
-        .no_alloc   = false,         //
-    };
+    struct ggml_init_params ctx_work_params(
+        //.mem_size=
+        max_work_size, //
+        //.mem_buffer=
+        NULL, //
+        //.no_alloc=
+        false //
+    );
     struct ggml_context * ctx_work = ggml_init(ctx_work_params);
     int64_t t0 = ggml_time_ms();


@@ -586,11 +586,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
     sched->n_splits = 0;
-    struct ggml_init_params params = {
-        .mem_size   = sizeof(sched->context_buffer),
-        .mem_buffer = sched->context_buffer,
-        .no_alloc   = true
-    };
+    struct ggml_init_params params(
+        //.mem_size =
+        sizeof(sched->context_buffer),
+        //.mem_buffer =
+        sched->context_buffer,
+        //.no_alloc =
+        true
+    );
     if (sched->ctx != NULL) {
         ggml_free(sched->ctx);


@@ -16446,7 +16446,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         /*.abort_callback      =*/ NULL,
         /*.abort_callback_data =*/ NULL,
     };
-    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
+    struct ggml_compute_state * workers = (struct ggml_compute_state *)alloca(sizeof(struct ggml_compute_state)*n_threads);
     // create thread pool
     if (n_threads > 1) {
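
The added cast is the usual C-versus-C++ difference: alloca returns void *, and C++ (unlike C) does not implicitly convert void * to an object pointer, so building this file as C++17 needs the explicit conversion. A minimal illustration with hypothetical names, not taken from the commit:

struct node { int id; };

node * as_nodes(void * raw) {
    // return raw;                    // legal in C, ill-formed in C++
    return static_cast<node *>(raw);  // C++ requires the explicit conversion
}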
@@ -16775,11 +16775,11 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
         const size_t overhead = 1*ggml_tensor_overhead();
         GGML_ASSERT(0);
         // FIXME
-        struct ggml_init_params params;// = {
-        params.mem_size   = fsize + overhead,
-        params.mem_buffer = NULL,
-        params.no_alloc   = false,
-        // };
+        struct ggml_init_params params(
+            fsize + overhead,
+            NULL,
+            false);
         *ctx_data = ggml_init(params);
@@ -16831,10 +16831,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
         {
             const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph_size, false);
-            struct ggml_init_params params;// = {
-            params.mem_size   = size_eval + overhead,
-            params.mem_buffer = NULL,
-            params.no_alloc   = true,
+            struct ggml_init_params params(
+                size_eval + overhead,
+                NULL,
+                true);
             *ctx_eval = ggml_init(params);
@@ -17974,7 +17974,7 @@ GGML_API void ggml_opt_init(
     opt->nx = nx;
     opt->just_initialized = true;
     if (opt->ctx == NULL) {
         struct ggml_init_params ctx_opt_params;
         if (opt->params.type == GGML_OPT_ADAM) {
             ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3;
             if (opt->params.past > 0) {
@@ -18690,10 +18690,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
         // FIXME
-        struct ggml_init_params pdata;
-        pdata.mem_size   = mem_size,
-        pdata.mem_buffer = NULL,
-        pdata.no_alloc   = params.no_alloc,
+        struct ggml_init_params pdata(
+            mem_size,
+            NULL,
+            params.no_alloc);
         *params.ctx = ggml_init(pdata);

ggml.h

@@ -286,7 +286,7 @@
 GGML_UNUSED(prefix##3);
 #ifdef __cplusplus
-extern "C" {
+//extern "C" {
 #endif
 #if defined(__ARM_NEON) && defined(__CUDACC__)

@@ -581,6 +581,18 @@ extern "C" {
     };
     struct ggml_init_params : refl::attr::usage::type{
+        ggml_init_params(size_t mem_size,
+                         void * mem_buffer,
+                         bool no_alloc):
+            mem_size( mem_size),
+            mem_buffer(mem_buffer),
+            no_alloc(no_alloc){}
+        ggml_init_params():
+            mem_size(0),
+            mem_buffer(0),
+            no_alloc(0){}
         // memory pool
         size_t mem_size;   // bytes
         void * mem_buffer; // if NULL, memory will be allocated internally

@@ -2013,6 +2025,8 @@ extern "C" {
     struct gguf_context;
     struct gguf_init_params : refl::attr::usage::type{
+        gguf_init_params(bool no_alloc, struct ggml_context ** ctx): no_alloc(no_alloc),ctx(ctx){}
         bool no_alloc;
         // if not NULL, create a ggml_context and allocate the tensor data in it

@@ -2164,5 +2178,5 @@ extern "C" {
     GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
 #ifdef __cplusplus
-}
+//}
 #endif
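
With these constructors in place, each call site can pass the same values positionally instead of using a braced designated initializer. A minimal usage sketch of the gguf_init_params constructor added above (the wrapper function and its name are illustrative, not part of the commit):

// Opens a GGUF file for metadata only, mirroring the call sites in this commit.
static struct gguf_context * open_gguf_metadata(const char * fname,
                                                struct ggml_context ** ctx_meta) {
    gguf_init_params params(/* no_alloc */ true, /* ctx */ ctx_meta);
    return gguf_init_from_file(fname, params);
}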


@@ -1785,10 +1785,10 @@ struct llama_model_loader {
     struct ggml_context * ctx_meta = NULL;
     llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
-        struct gguf_init_params params = {
-            .no_alloc = true,
-            .ctx      = &ctx_meta,
-        };
+        struct gguf_init_params params(
+            /*.no_alloc =*/ true,
+            /*.ctx     = */ &ctx_meta
+        );
         ctx_gguf = gguf_init_from_file(fname.c_str(), params);
         if (!ctx_gguf) {
@@ -2676,11 +2676,12 @@ static void llm_load_tensors(
             model.mlock_buf.grow_to(model.buf.size);
         }
-        struct ggml_init_params params = {
-            .mem_size   = model.buf.size,
-            .mem_buffer = model.buf.data,
-            .no_alloc   = ml.use_mmap,
-        };
+        struct ggml_init_params params(
+            model.buf.size,
+            model.buf.data,
+            ml.use_mmap );
         model.ctx = ggml_init(params);
         if (!model.ctx) {
@@ -3842,11 +3843,14 @@ struct llm_build_context {
     }
     void init() {
-        struct ggml_init_params params = {
-            .mem_size   = buf_compute.size,
-            .mem_buffer = buf_compute.data,
-            .no_alloc   = true,
-        };
+        struct ggml_init_params params(
+            //.mem_size =
+            buf_compute.size,
+            //.mem_buffer =
+            buf_compute.data,
+            //.no_alloc =
+            true
+        );
         ctx0 = ggml_init(params);
     }
@@ -8447,10 +8451,11 @@ void llama_backend_init(bool numa) {
     // needed to initialize f16 tables
     {
-        struct ggml_init_params params = { .mem_size = 0,
-            .mem_buffer = NULL,
-            .no_alloc = false
-        };
+        struct ggml_init_params params(
+            0,
+            NULL,
+            false
+        );
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
@@ -9021,11 +9026,14 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     if (kv_buf_size) {
         const size_t elt_size = ggml_element_size(kv_self.k);
-        ggml_init_params ip = {
-            .mem_size   = 6*ggml_tensor_overhead() + ggml_graph_overhead(),
-            .mem_buffer = NULL,
-            .no_alloc   = /* no_alloc */ true
-        };
+        ggml_init_params ip(
+            //.mem_size =
+            6*ggml_tensor_overhead() + ggml_graph_overhead(),
+            //.mem_buffer =
+            NULL,
+            //.no_alloc = /* no_alloc */
+            true
+        );
         ggml_context * cpy_ctx = ggml_init( ip);
         ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
@@ -9155,10 +9163,13 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         const size_t elt_size = ggml_element_size(kv_self.k);
-        ggml_init_params ip {
-            .mem_size   = 6*ggml_tensor_overhead() + ggml_graph_overhead(),
-            .mem_buffer = NULL,
-            .no_alloc   = true };
+        ggml_init_params ip(
+            //.mem_size=
+            6*ggml_tensor_overhead() + ggml_graph_overhead(),
+            //.mem_buffer=
+            NULL,
+            //.no_alloc=
+            true );
         ggml_context * cpy_ctx = ggml_init(ip);
         ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

llama.h

@@ -50,7 +50,7 @@
 #endif
 #ifdef __cplusplus
-extern "C" {
+//extern "C" {
 #endif
 //

@@ -189,7 +189,7 @@ extern "C" {
         llama_seq_id all_seq_id; // used if seq_id == NULL
     } llama_batch;
-    struct llama_model_params : refl::attr::usage::type{
+    struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu;     // the GPU that is used for scratch and small tensors
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

@@ -268,7 +268,7 @@ extern "C" {
         LLAMA_GRETYPE_CHAR_ALT = 6,
     };
-    typedef struct llama_grammar_element : refl::attr::usage::type{
+    typedef struct llama_grammar_element : refl::attr::usage::type {
         llama_grammar_element( enum llama_gretype type,
                                uint32_t value // Unicode code point or rule ID
         ):type(type), value(value){}

@@ -811,7 +811,7 @@ extern "C" {
 LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
 #ifdef __cplusplus
-}
+//}
 #endif
 // Internal API to be implemented by llama.cpp and used by tests/benchmarks only

@@ -828,6 +828,6 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
 #endif // LLAMA_API_INTERNAL
+template<typename T> void print_fields(const T& obj);
 #endif // LLAMA_H
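
print_fields is only declared here; its definition is not visible in this commit's hunks. A plausible sketch of such a helper on top of refl-cpp, assuming every registered field type is streamable (the body below is an assumption, not the commit's code):

#include <iostream>
#include <refl.hpp>

template<typename T>
void print_fields(const T & obj) {
    // Walk the members registered via REFL_TYPE/REFL_FIELD (see print.hpp)
    // and print each readable field as "name = value".
    refl::util::for_each(refl::reflect<T>().members, [&](auto member) {
        if constexpr (refl::descriptor::is_readable(member)) {
            std::cout << refl::descriptor::get_display_name(member)
                      << " = " << member(obj) << "\n";
        }
    });
}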

print.hpp

@@ -4,65 +4,373 @@
//#include <refl.hpp>
#include "llama.h"
REFL_TYPE(ggml_init_params )
REFL_END
REFL_TYPE(ggml_opt_params::ggml_adam)
REFL_END
REFL_TYPE(ggml_opt_params::ggml_lbfgs)
REFL_END
REFL_TYPE(ggml_opt_context::ggml_grad )
REFL_END
REFL_TYPE(gpt_params )
REFL_END
REFL_TYPE(llama_sampling_context )
REFL_END
REFL_TYPE(llama_token_data )
REFL_END
REFL_TYPE(llama_token_data_array )
REFL_END
REFL_TYPE(llama_batch )
REFL_END
REFL_TYPE(ggml_object)
REFL_FIELD(offs)
REFL_END
REFL_TYPE(ggml_tensor)
REFL_FIELD(type)
REFL_END
REFL_TYPE(ggml_cplan)
REFL_FIELD(work_size)
REFL_END
REFL_TYPE(ggml_hash_set)
REFL_FIELD(size)
REFL_END
REFL_TYPE(ggml_cgraph)
REFL_FIELD(size)
REFL_END
REFL_TYPE(ggml_scratch)
REFL_FIELD(offs)
REFL_END
REFL_TYPE(ggml_compute_params)
REFL_FIELD(type)
REFL_END
REFL_TYPE(ggml_opt_params)
REFL_FIELD(type)
REFL_END
REFL_TYPE(ggml_opt_context)
REFL_FIELD(ctx)
REFL_END
//REFL_TYPE(gguf_context)
//REFL_END
REFL_TYPE(gguf_init_params)
REFL_END
REFL_TYPE(ggml_something)
REFL_FIELD(type_name)
REFL_END
//REFL_TYPE()
// REFL_FIELD(d)
//REFL_TYPE()
// incomplete type
// REFL_TYPE(ggml_context)
// REFL_FIELD(mem_size)
// REFL_FIELD(mem_buffer)
// REFL_END
//REFL_TYPE(ggml_context_container)
// REFL_FIELD(used)
// REFL_FIELD(context)
//REFL_END
// REFL_TYPE(ggml_numa_node)
// REFL_FIELD(cpus)
// REFL_FIELD(n_cpus)
// REFL_END
// REFL_TYPE(ggml_numa_nodes)
// REFL_FIELD(nodes)
// REFL_FIELD(n_nodes)
// REFL_END
// REFL_TYPE(ggml_state)
// REFL_FIELD(contexts)
// REFL_FIELD(numa)
// REFL_END
// REFL_TYPE(gguf_str)
// REFL_FIELD(n)
// REFL_FIELD(data)
// REFL_END
// REFL_TYPE(ggml_map_custom1_op_params)
// REFL_FIELD(fun)
// REFL_FIELD(n_tasks)
// REFL_END
// REFL_TYPE(ggml_map_custom2_op_params)
// REFL_FIELD(fun)
// REFL_FIELD(n_tasks)
// REFL_END
// REFL_TYPE(ggml_map_custom3_op_params)
// REFL_FIELD(fun)
// REFL_FIELD(n_tasks)
// REFL_END
// REFL_TYPE(hash_map)
// REFL_FIELD(set)
// REFL_FIELD(vals)
// REFL_END
// REFL_TYPE(ggml_compute_state_shared)
// REFL_FIELD(cgraph)
// REFL_FIELD(cplan)
// REFL_END
// REFL_TYPE(ggml_compute_state)
// REFL_FIELD(thrd)
// REFL_FIELD(ith)
// REFL_END
// REFL_TYPE(ggml_lbfgs_iteration_data)
// REFL_FIELD(alpha)
// REFL_FIELD(ys)
// REFL_END
//REFL_TYPE()
// REFL_FIELD(type)
//REFL_END
// REFL_TYPE(gguf_kv)
// REFL_FIELD(key)
// REFL_FIELD(type)
// REFL_END
// REFL_TYPE(gguf_header)
// REFL_FIELD(magic)
// REFL_FIELD(version)
// REFL_END
// REFL_TYPE(gguf_tensor_info)
// REFL_FIELD(name)
// REFL_FIELD(n_dims)
// REFL_END
REFL_TYPE(gguf_context)
// REFL_FIELD(header)
// REFL_FIELD(kv)
REFL_END
// REFL_TYPE(gguf_buf)
// REFL_FIELD(data)
// REFL_FIELD(size)
// REFL_END
//REFL_TYPE(llama_token_data)
//REFL_END
REFL_TYPE(llama_model_params)
REFL_FIELD(n_gpu_layers)
REFL_END
REFL_TYPE(llama_context_params)
REFL_FIELD(seed)
REFL_END
REFL_TYPE(llama_model_quantize_params)
REFL_FIELD(nthread)
REFL_END
REFL_TYPE(llama_grammar_element)
REFL_END
REFL_TYPE(llama_timings)
REFL_FIELD(t_start_ms)
REFL_END
REFL_TYPE(llama_beam_view)
REFL_FIELD(tokens)
REFL_END
REFL_TYPE(llama_beams_state)
REFL_FIELD(beam_views)
REFL_END
//REFL_TYPE(ggml_backend)
//REFL_END
REFL_TYPE(ggml_backend_buffer)
REFL_END
//REFL_TYPE(ggml_allocr)
//REFL_END
//REFL_TYPE(ggml_tallocr)
//REFL_END
//REFL_TYPE(ggml_gallocr)
//REFL_END
//REFL_TYPE(llama_buffer)
//REFL_FIELD(data)
//REFL_FIELD(size)
//REFL_END
// REFL_TYPE(llama_file)
// REFL_FIELD(fp)
// REFL_FIELD(size)
// REFL_END
// REFL_TYPE(llama_mmap)
// REFL_FIELD(addr)
// REFL_FIELD(size)
// REFL_END
// REFL_TYPE(llama_mlock)
// REFL_FIELD(addr)
// REFL_FIELD(size)
// REFL_END
//REFL_TYPE(llama_state)
// REFL_FIELD(log_callback)
// REFL_FIELD(log_callback_user_data)
// REFL_END
// REFL_TYPE(llama_hparams)
// REFL_FIELD(vocab_only)
// REFL_FIELD(n_vocab)
// REFL_END
//REFL_TYPE(llama_cparams)
// REFL_FIELD(n_ctx)
// REFL_FIELD(n_batch)
//REFL_END
//REFL_TYPE(llama_layer)
// REFL_FIELD(attn_norm)
// REFL_FIELD(attn_norm_b)
//REFL_END
// REFL_TYPE(llama_kv_cell)
// REFL_FIELD(pos)
// REFL_FIELD(delta)
// REFL_END
// REFL_TYPE(llama_kv_cache)
// REFL_FIELD(has_shift)
// REFL_FIELD(head)
// REFL_END
// REFL_TYPE(llama_vocab)
// REFL_END
REFL_TYPE(llama_model)
// REFL_FIELD(type)
// REFL_FIELD(arch)
REFL_END
REFL_TYPE(llama_context)
REFL_END
// REFL_TYPE(llama_model_loader)
// REFL_FIELD(n_kv)
// REFL_FIELD(n_tensors)
// REFL_END
// REFL_TYPE(llm_build_context)
// REFL_FIELD(model)
// REFL_FIELD(hparams)
// REFL_END
// REFL_TYPE(llm_offload_trie)
// REFL_END
// REFL_TYPE(llm_symbol)
// REFL_FIELD(prev)
// REFL_END
// REFL_TYPE(llm_bigram_spm)
// REFL_END
// REFL_TYPE(llm_tokenizer_spm)
// REFL_END
// REFL_TYPE(llm_bigram_bpe)
// REFL_END
// REFL_TYPE(llm_tokenizer_bpe)
// REFL_END
// REFL_TYPE(fragment_buffer_variant)
// REFL_END
// REFL_TYPE(llama_partial_utf8)
// REFL_FIELD(value)
// REFL_FIELD(n_remain)
// REFL_END
REFL_TYPE(llama_grammar)
// REFL_FIELD(rules)
// REFL_FIELD(stacks)
REFL_END
//REFL_TYPE(llama_grammar_candidate)
// REFL_FIELD(index)
// REFL_FIELD(code_points)
//REFL_END
// REFL_TYPE(llama_beam)
// REFL_FIELD(tokens)
// REFL_FIELD(p)
// REFL_END
// REFL_TYPE(llama_logit_info)
// REFL_FIELD(logits)
// REFL_FIELD(n_vocab)
// REFL_END
// REFL_TYPE(llama_beam_search_data)
// REFL_FIELD(ctx)
// REFL_FIELD(n_beams)
// REFL_END
// REFL_TYPE(quantize_state_internal)
// REFL_FIELD(model)
// REFL_FIELD(params)
// REFL_END
// REFL_TYPE(llama_data_context)
// REFL_END
// REFL_TYPE(llama_data_buffer_context)
// REFL_FIELD(ptr)
// REFL_END
// REFL_TYPE(llama_data_file_context)
// REFL_FIELD(file)
// REFL_END
// // A simple struct with some fields and a function
// // A custom attribute to mark some fields as hidden
struct hidden : refl::attr::usage::field {};
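
The registrations above are what make the print_fields declaration in llama.h usable on llama.cpp's own types. A hypothetical call, assuming a live llama_context * ctx and the refl-cpp-based definition sketched earlier:

// Only t_start_ms is registered for llama_timings above, so that is the
// field such a dump could show.
llama_timings timings = llama_get_timings(ctx);
print_fields(timings);   // e.g. prints: t_start_ms = <value>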


@@ -115,11 +115,11 @@ int main(int argc, char * argv[]) {
     generate_data(1.0, test_data2.size(), test_data2.data());
     // Initialize GGML, ensures float conversion tables are initialized
-    struct ggml_init_params ggml_params = {
-        /* .mem_size   = */ 1*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ true,
-    };
+    struct ggml_init_params ggml_params(
+        /* .mem_size   = */ 1*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ true
+    );
     struct ggml_context * ctx = ggml_init(ggml_params);
     int num_failed = 0;


@@ -261,11 +261,11 @@ int main(int argc, char * argv[]) {
     // Initialize GGML, ensures float conversion tables are initialized
-    struct ggml_init_params ggml_params = {
-        /* .mem_size   = */ 1*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ true,
-    };
+    struct ggml_init_params ggml_params(
+        /* .mem_size   = */ 1*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ true
+    );
     struct ggml_context * ctx = ggml_init(ggml_params);
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {


@@ -124,11 +124,11 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 }
 int main(int /*argc*/, const char ** /*argv*/) {
-    struct ggml_init_params params = {
-        /* .mem_size   = */ 128*1024*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ false,
-    };
+    struct ggml_init_params params(
+        /* .mem_size   = */ 128*1024*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ false
+    );
     std::vector<uint8_t> work_buffer;