llama : model loader
ggml-ci
This commit is contained in:
parent 736e6922ce
commit 8d117a518d

7 changed files with 1264 additions and 1164 deletions

@@ -20,6 +20,7 @@ add_library(llama
            llama-kv-cache.cpp
            llama-mmap.cpp
            llama-model.cpp
            llama-model-loader.cpp
            llama-sampling.cpp
            llama-vocab.cpp
            unicode.h

@@ -74,7 +74,8 @@ struct llama_hparams {
    float rope_freq_scale_train;
    uint32_t n_ctx_orig_yarn;
    float rope_yarn_log_mul;
    int rope_sections[4]; // TODO: actually this should be std::array (I was wrong)

    std::array<int, 4> rope_sections;

    // for State Space Models
    uint32_t ssm_d_conv = 0;

@@ -2,9 +2,11 @@

#include "llama.h"

#include <cinttypes>
#include <climits>
#include <cstdarg>
#include <vector>
#include <sstream>

struct llama_logger_state {
    ggml_log_callback log_callback = llama_log_callback_default;

@@ -89,3 +91,75 @@ std::string format(const char * fmt, ...) {
    va_end(ap);
    return std::string(buf.data(), size);
}

std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
    char buf[256];
    snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
    for (size_t i = 1; i < ne.size(); i++) {
        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
    }
    return buf;
}

std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
    char buf[256];
    snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]);
    }
    return buf;
}

static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
    switch (type) {
        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t  *)data)[i]);
        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t   *)data)[i]);
        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t  *)data)[i]);
        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t  *)data)[i]);
        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t  *)data)[i]);
        case GGUF_TYPE_FLOAT32: return std::to_string(((const float    *)data)[i]);
        case GGUF_TYPE_FLOAT64: return std::to_string(((const double   *)data)[i]);
        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
        default:                return format("unknown type %d", type);
    }
}

std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);

    switch (type) {
        case GGUF_TYPE_STRING:
            return gguf_get_val_str(ctx_gguf, i);
        case GGUF_TYPE_ARRAY:
            {
                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
                int arr_n = gguf_get_arr_n(ctx_gguf, i);
                const void * data = gguf_get_arr_data(ctx_gguf, i);
                std::stringstream ss;
                ss << "[";
                for (int j = 0; j < arr_n; j++) {
                    if (arr_type == GGUF_TYPE_STRING) {
                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
                        // escape quotes
                        replace_all(val, "\\", "\\\\");
                        replace_all(val, "\"", "\\\"");
                        ss << '"' << val << '"';
                    } else if (arr_type == GGUF_TYPE_ARRAY) {
                        ss << "???";
                    } else {
                        ss << gguf_data_to_str(arr_type, data, j);
                    }
                    if (j < arr_n - 1) {
                        ss << ", ";
                    }
                }
                ss << "]";
                return ss.str();
            }
        default:
            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
    }
}
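
Not part of the diff, but for orientation: the new gguf_kv_to_str helper is typically driven by iterating over a model's metadata. A minimal sketch, assuming a gguf_context obtained elsewhere (e.g. via gguf_init_from_file) and <cstdio> for printf; gguf_get_n_kv and gguf_get_key are existing functions of the GGUF C API in ggml:

    // sketch: dump every metadata key/value pair of a GGUF file using the helper above
    static void print_gguf_metadata(const struct gguf_context * ctx_gguf) {
        const int n_kv = (int) gguf_get_n_kv(ctx_gguf); // cast keeps the index type consistent with gguf_kv_to_str
        for (int i = 0; i < n_kv; i++) {
            const char *      key = gguf_get_key(ctx_gguf, i);
            const std::string val = gguf_kv_to_str(ctx_gguf, i);
            printf("%-40s = %s\n", key, val.c_str());
        }
    }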

@@ -3,6 +3,7 @@
#include "ggml.h" // for ggml_log_level

#include <string>
#include <vector>

#ifdef __GNUC__
#ifdef __MINGW32__

@@ -33,6 +34,12 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
// helpers
//

template <typename T>
struct no_init {
    T value;
    no_init() { /* do nothing */ }
};

struct time_meas {
    time_meas(int64_t & t_acc, bool disable = false);
    ~time_meas();
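
Worth noting (not part of the diff): the deliberately empty default constructor of no_init is what lets containers of it grow without value-initializing every element, which matters when sizing multi-megabyte read buffers. A minimal sketch of the intended use; the buffer name and size are illustrative:

    // sketch: resize() normally value-initializes elements; wrapping the element
    // type in no_init<> makes that a no-op, so the bytes stay uninitialized until
    // they are overwritten by the actual file read
    std::vector<no_init<uint8_t>> read_buf;
    read_buf.resize(16u * 1024 * 1024);                 // no 16 MiB memset here
    // file.read_raw(read_buf.data(), read_buf.size()); // fill from disk instead (llama_file from llama-mmap.h)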

@@ -47,3 +54,8 @@ void replace_all(std::string & s, const std::string & search, const std::string
// TODO: rename to llama_format ?
LLAMA_ATTRIBUTE_FORMAT(1, 2)
std::string format(const char * fmt, ...);

std::string llama_format_tensor_shape(const std::vector<int64_t> & ne);
std::string llama_format_tensor_shape(const struct ggml_tensor * t);

std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);

src/llama-model-loader.cpp (new file, 1008 lines)
File diff suppressed because it is too large

src/llama-model-loader.h (new file, 157 lines)

@@ -0,0 +1,157 @@
#pragma once

#include "llama.h"

#include "llama-impl.h"
#include "llama-arch.h"
#include "llama-mmap.h"

#include "ggml-cpp.h"

#include <cstddef>
#include <map>
#include <unordered_map>

using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;

enum llama_fver {
    GGUF_FILE_VERSION_V1 = 1,
    GGUF_FILE_VERSION_V2 = 2,
    GGUF_FILE_VERSION_V3 = 3,
};

const char * llama_file_version_name(llama_fver version);

struct llama_model_loader {
    // Holds information on a model weight
    struct llama_tensor_weight {
        uint16_t idx;  // source file index
        size_t   offs; // tensor data offset in the original file

        ggml_tensor * tensor;

        llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
            const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor));
            if (tensor_idx < 0) {
                throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));
            }

            offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
            if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) {
                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor)));
            }
        }
    };

    // custom comparator to sort weights more nicely by layer
    struct weight_name_comparer {
        bool operator()(const std::string & a, const std::string & b) const {
            int a_layer = -1;
            int b_layer = -1;
            sscanf(a.c_str(), "blk.%d.", &a_layer);
            sscanf(b.c_str(), "blk.%d.", &b_layer);
            if (a_layer != b_layer) {
                return a_layer < b_layer;
            }
            return a < b;
        }
    };

    static const int TENSOR_NOT_REQUIRED = 1;
    static const int TENSOR_DUPLICATED   = 2;

    int n_kv      = 0;
    int n_tensors = 0;
    int n_created = 0;

    uint64_t n_elements = 0;
    size_t   n_bytes    = 0;

    bool use_mmap = false;
    bool check_tensors;

    llama_files files;
    llama_ftype ftype;
    llama_fver  fver;

    llama_mmaps mappings;

    std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
    std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;

    gguf_context_ptr meta;
    std::vector<ggml_context_ptr> contexts;

    std::string arch_name;
    LLM_KV      llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

    size_t size_done = 0;
    size_t size_data = 0;
    std::vector<std::pair<size_t, size_t>> mmaps_used;

    llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p);

    template<typename T>
    typename std::enable_if<std::is_integral<T>::value, bool>::type
    get_arr_n(const std::string & key, T & result, bool required = true);

    template<typename T>
    typename std::enable_if<std::is_integral<T>::value, bool>::type
    get_arr_n(enum llm_kv kid, T & result, bool required = true);

    template<typename T>
    bool get_arr(const std::string & key, std::vector<T> & result, bool required = true);

    template<typename T, size_t N_MAX>
    bool get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required = true);

    template<typename T>
    bool get_arr(enum llm_kv kid, T & result, bool required = true);

    template<typename T>
    bool get_key(const std::string & key, T & result, bool required = true);

    template<typename T>
    bool get_key(enum llm_kv kid, T & result, bool required = true);

    template<typename T, size_t N_MAX>
    bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required = true);

    template<typename T>
    bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);

    std::string get_arch_name() const;

    enum llm_arch get_arch() const;

    const llama_tensor_weight * get_weight(const char * name) const;

    const llama_tensor_weight & require_weight(const char * name) const;

    struct ggml_tensor * get_tensor_meta(const char * name) const;

    struct ggml_tensor * require_tensor_meta(const std::string & name) const;

    const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;

    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0);

    struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);

    void done_getting_tensors() const;

    void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr);

    void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const;

    // for backwards compatibility, does not support ggml-backend
    void load_data_for(struct ggml_tensor * cur) const;

    // Returns false if cancelled by progress_callback
    bool load_all_data(
            struct ggml_context * ctx,
            llama_buf_map & bufs,
            llama_mlocks * lmlocks,
            llama_progress_callback progress_callback,
            void * progress_callback_user_data);
};
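
To give a feel for how this interface is consumed, here is a hedged usage sketch (not code from this commit; the metadata key and tensor name are examples and depend on the model architecture, and error handling is omitted):

    // sketch: open a GGUF file, read a hyperparameter, inspect tensor metadata
    llama_model_loader ml("model.gguf", /*use_mmap =*/ true, /*check_tensors =*/ false, /*param_overrides_p =*/ nullptr);

    uint32_t n_ctx_train = 0;
    ml.get_key("llama.context_length", n_ctx_train);   // string-key overload
    // ml.get_key(LLM_KV_CONTEXT_LENGTH, n_ctx_train); // or the llm_kv overload

    ggml_tensor * tok_embd = ml.get_tensor_meta("token_embd.weight"); // metadata only, no data read yet

    // after every tensor has been registered via create_tensor(), the data is
    // mapped/streamed in, optionally reporting progress:
    ml.init_mappings(/*prefetch =*/ true);
    // ml.load_all_data(ctx, bufs, /*lmlocks =*/ nullptr, progress_cb, user_data);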

src/llama.cpp (1163 lines changed)
File diff suppressed because it is too large