John 2023-06-17 14:39:28 +02:00
commit dd3d346f7a
3 changed files with 37 additions and 67 deletions

View file

@@ -175,10 +175,8 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> embd_inp;

     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
-        // Add a space in front of the first character to match OG llama tokenizer behavior
-        params.prompt.insert(0, 1, ' ');
-        embd_inp = ::falcon_tokenize(ctx, params.prompt, true);
+        // Falcon does not have a dedicated bos token (bos==eos), so don't inject it here
+        embd_inp = ::falcon_tokenize(ctx, params.prompt, false);
     } else {
         embd_inp = session_tokens;
     }
@@ -361,8 +359,7 @@ int main(int argc, char ** argv) {
             if (n_past + (int) embd.size() > n_ctx) {
                 const int n_left = n_past - params.n_keep;

-                // always keep the first token - BOS
-                n_past = std::max(1, params.n_keep);
+                n_past = params.n_keep;

                 // insert n_left/2 tokens at the start of embd from last_n_tokens
                 embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
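Note on the hunk above: once the window fills up, generation keeps the first n_keep tokens and re-feeds the most recent n_left/2 tokens, so the old std::max(1, params.n_keep) guard (which reserved slot 0 for a BOS token Falcon does not have) is no longer needed. A minimal, self-contained sketch of the swap arithmetic with made-up numbers and plain int tokens, not the real llama/falcon types:

#include <cstdio>
#include <vector>

int main() {
    // Hypothetical numbers, chosen only to illustrate the swap arithmetic.
    const int n_ctx  = 2048;                  // context window
    const int n_keep = 64;                    // prompt tokens that are always kept
    int       n_past = 2040;                  // tokens already in the KV cache

    std::vector<int> embd(16, 0);             // 16 pending tokens would overflow the window
    std::vector<int> last_n_tokens(n_ctx, 0); // rolling history of the last n_ctx token ids

    if (n_past + (int) embd.size() > n_ctx) {
        const int n_left = n_past - n_keep;   // 1976 evictable tokens

        n_past = n_keep;                      // rewind the cache to just after the kept prefix

        // re-feed the most recent n_left/2 history tokens in front of the pending batch
        embd.insert(embd.begin(),
                    last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(),
                    last_n_tokens.end()   - embd.size());
    }

    // prints: n_past=64, batch to re-evaluate=1004 tokens
    std::printf("n_past=%d, batch to re-evaluate=%zu tokens\n", n_past, embd.size());
    return 0;
}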

View file

@@ -529,7 +529,7 @@ struct falcon_context * falcon_init_from_gpt_params(const gpt_params & params) {
     lparams.main_gpu     = params.main_gpu;
     memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
     lparams.seed         = params.seed;
-    lparams.f16_kv       = params.memory_f16;
+    lparams.f16_kv       = false; //params.memory_f16; // TODO? unsupported because ggml_repeat2 currently only implemented for f32
     lparams.use_mmap     = params.use_mmap;
     lparams.use_mlock    = params.use_mlock;
     lparams.logits_all   = params.perplexity;

View file

@@ -81,16 +81,6 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
     return k_sizes;
 }

-// 2*n_embd*n_ctx*n_layer*sizeof(float16)
-static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
-{
-    static std::map<e_model, size_t> k_sizes = {
-        { FALCON_7B,  1026ull * MB },
-        { FALCON_40B, 5120ull * MB },
-    };
-    return k_sizes;
-}
-
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
@@ -118,6 +108,18 @@ struct falcon_hparams {
     }
 };

+static size_t MEM_REQ_KV_SELF(
+    const falcon_hparams & hparams, ggml_type wtype, int32_t n_ctx)
+{
+    const int n_head_kv = hparams.n_head_kv;
+    const int head_dim  = hparams.n_embd / hparams.n_head;
+    const int n_layer   = hparams.n_layer;
+
+    const int64_t ne = n_head_kv * head_dim * n_layer * n_ctx;
+
+    return 2u * (ggml_tensor_overhead() + ne * ggml_type_size(wtype));
+}
+
 struct falcon_layer {
     // normalization
     struct ggml_tensor* input_layernorm;
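
The function added above derives the per-state KV cache size from the attention geometry (n_head_kv key/value heads of head_dim elements, per layer and context position) instead of the old hard-coded per-model table. A standalone sketch of the same arithmetic; the Falcon-7B/40B hyperparameters below are the commonly published ones, not values taken from this diff, so treat them as assumptions:

#include <cstdint>
#include <cstdio>

// Rough stand-in for the new MEM_REQ_KV_SELF(): 2 tensors (K and V), each holding
// n_head_kv * head_dim * n_layer * n_ctx elements of the cache type.
// The per-tensor ggml bookkeeping overhead is omitted here for clarity.
static int64_t kv_self_bytes(int n_embd, int n_head, int n_head_kv,
                             int n_layer, int n_ctx, int type_size) {
    const int64_t head_dim = n_embd / n_head;
    const int64_t ne       = (int64_t) n_head_kv * head_dim * n_layer * n_ctx;
    return 2 * ne * type_size;
}

int main() {
    const int n_ctx = 2048;  // assumed context length
    const int f32   = 4;     // bytes per element; this commit forces the cache to f32

    // Published Falcon hyperparameters (assumption, not read from the diff):
    //   7B : n_embd=4544, n_head=71,  n_head_kv=1, n_layer=32
    //   40B: n_embd=8192, n_head=128, n_head_kv=8, n_layer=60
    const double mb = 1024.0 * 1024.0;
    std::printf("Falcon-7B : %.0f MB (old table: 1026 MB)\n",
                kv_self_bytes(4544, 71, 1, 32, n_ctx, f32) / mb);
    std::printf("Falcon-40B: %.0f MB (old table: 5120 MB)\n",
                kv_self_bytes(8192, 128, 8, 60, n_ctx, f32) / mb);
    return 0;
}

With multi-query/grouped attention only the n_head_kv heads need caching, which is why the computed sizes come out far below the old per-model constants.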
@@ -164,9 +166,6 @@ struct falcon_model {
     std::vector<falcon_layer> layers;

-    // key + value memory
-    struct ggml_tensor* memory_k;
-    struct ggml_tensor* memory_v;
-
     int n_gpu_layers;

     // context
@@ -687,8 +686,7 @@ struct llama_model_loader {
     void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
         *ctx_size_p = *mmapped_size_p = 0;
         for (const falcon_load_tensor & lt : tensors_map.tensors) {
-            *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            *ctx_size_p += 64 * MB;
+            *ctx_size_p += ggml_tensor_overhead();
             *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
         }
     }
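
ggml_tensor_overhead() already covers ggml's per-tensor bookkeeping (object header plus tensor metadata), so the hand-rolled sizeof expression and, more importantly, the flat 64 MB added per tensor are gone from the context-size estimate. For a sense of scale, a throwaway calculation; the tensor count is an illustrative assumption, not read from any model file:

#include <cstdio>

int main() {
    const unsigned long long MB = 1024ull * 1024ull;

    // Illustrative assumption: a Falcon-style checkpoint holds on the order of a few
    // hundred weight tensors (per-layer attention/MLP/norm weights plus embeddings).
    const int n_tensors = 300;

    // The padding removed in this commit, summed over all tensors.
    const unsigned long long old_padding = 64ull * MB * n_tensors;
    std::printf("64 MB per tensor x %d tensors = %llu MB of padding in the old estimate\n",
                n_tensors, old_padding / MB);
    return 0;
}
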
@@ -872,13 +870,11 @@ static bool kv_cache_init(
         ggml_type   wtype,
         int   n_ctx) {
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-
-    const int64_t n_mem      = n_layer*n_ctx;
-    const int64_t n_elements = n_embd*n_mem;
-
-    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    const int64_t head_dim = hparams.n_embd / hparams.n_head;
+    const int64_t n_elements =
+        hparams.n_layer * n_ctx * head_dim * hparams.n_head_kv;
+
+    cache.buf.resize(MEM_REQ_KV_SELF(hparams, wtype, n_ctx));

     struct ggml_init_params params;
     params.mem_size   = cache.buf.size;
@@ -908,7 +904,7 @@ struct falcon_context_params falcon_context_default_params() {
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ {0},
         /*.seed                        =*/ -1,
-        /*.f16_kv                      =*/ true,
+        /*.f16_kv                      =*/ false,
         /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
@@ -1220,41 +1216,24 @@ static void falcon_model_load_internal(
         }
     }

-    // key + value memory
-    {
-        const int n_layer   = hparams.n_layer;
-        const int n_ctx     = hparams.n_ctx;
-        const int n_head_kv = hparams.n_head_kv;
-        const int head_dim  = hparams.n_embd / hparams.n_head;
-
-        const int64_t n_mem      = n_layer*n_ctx;
-        const int64_t n_elements = head_dim*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_head_kv * n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_head_kv * n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-        printf("%s: (a) memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
-    }
-
     ml->done_getting_tensors();

     // print memory requirements
     {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
         // this is the total memory required to run the inference
-        const size_t mem_required =
+        // TODO: this calculation is still wrong
+        int64_t mem_required =
             ctx_size +
             mmapped_size - vram_weights + // weights in VRAM not in memory
             MEM_REQ_SCRATCH0().at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
             MEM_REQ_EVAL().at    (model.type);
+        if (mem_required < 0) mem_required = 0;

         // this is the memory required by one llama_state
         const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
+            MEM_REQ_KV_SELF(model.hparams, memory_type, n_ctx);

         fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
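
The size_t to int64_t change plus the clamp above guards against the subtraction going negative, which in unsigned arithmetic wraps around and reports an absurd "mem required" figure; the hunk's own TODO admits the estimate is still not right. A small demonstration of just the wraparound, with made-up byte counts:

#include <cstdint>
#include <cstdio>

int main() {
    // Made-up sizes for illustration: essentially everything offloaded to the GPU.
    const uint64_t ctx_size     =  1ull << 28;   // 256 MB of ggml context/bookkeeping
    const uint64_t mmapped_size = 14ull << 30;   // 14 GB of weights, memory-mapped
    const uint64_t vram_weights = 15ull << 30;   // 15 GB accounted to VRAM

    // Old behaviour: unsigned subtraction wraps around to a huge value.
    const size_t mem_required_unsigned = ctx_size + mmapped_size - vram_weights;

    // New behaviour: signed arithmetic, clamped at zero.
    int64_t mem_required = (int64_t) ctx_size + (int64_t) mmapped_size - (int64_t) vram_weights;
    if (mem_required < 0) mem_required = 0;

    std::printf("unsigned: %20.2f MB (nonsense)\n", mem_required_unsigned / 1024.0 / 1024.0);
    std::printf("signed  : %20.2f MB\n",            mem_required / 1024.0 / 1024.0);
    return 0;
}
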
@@ -1348,12 +1327,6 @@ static bool falcon_eval_internal(
         const int   n_threads,
         const char * cgraph_fname) {

-    // enforce that the first token is BOS
-    if (n_past == 0 && tokens[0] != falcon_token_bos()) {
-        fprintf(stderr, "%s: first token must be BOS\n", __func__);
-        return false;
-    }
-
     const int64_t t_start_us = ggml_time_us();

     const int N = n_tokens;
@@ -1488,13 +1461,13 @@ static bool falcon_eval_internal(
             // store key and value to memory
             {
                 struct ggml_tensor* k = ggml_view_1d(
-                    ctx0, model.memory_k, N * n_head_kv * head_dim,
-                    (ggml_element_size(model.memory_k) * n_head_kv * head_dim) *
+                    ctx0, kv_self.k, N * n_head_kv * head_dim,
+                    (ggml_element_size(kv_self.k) * n_head_kv * head_dim) *
                         (il * n_ctx + n_past));
                 ggml_set_name(k, "k");

                 struct ggml_tensor* v = ggml_view_1d(
-                    ctx0, model.memory_v, N * n_head_kv * head_dim,
-                    (ggml_element_size(model.memory_v) * n_head_kv * head_dim) *
+                    ctx0, kv_self.v, N * n_head_kv * head_dim,
+                    (ggml_element_size(kv_self.v) * n_head_kv * head_dim) *
                         (il * n_ctx + n_past));
                 ggml_set_name(v, "v");
@@ -1506,9 +1479,9 @@ static bool falcon_eval_internal(
                     ctx0,
                     ggml_reshape_3d(
                         ctx0,
-                        ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_head_kv * head_dim,
+                        ggml_view_1d(ctx0, kv_self.k, (n_past + N) * n_head_kv * head_dim,
                                      il * n_ctx *
-                                         ggml_element_size(model.memory_k) *
+                                         ggml_element_size(kv_self.k) *
                                          n_head_kv *
                                          head_dim),
                         head_dim, n_head_kv, n_past + N),
@@ -1545,9 +1518,9 @@ static bool falcon_eval_internal(
                     ctx0,
                     ggml_reshape_3d(
                         ctx0,
-                        ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_head_kv * head_dim,
+                        ggml_view_1d(ctx0, kv_self.v, (n_past + N) * n_head_kv * head_dim,
                                      il * n_ctx *
-                                         ggml_element_size(model.memory_v) *
+                                         ggml_element_size(model.kv_self.v) *
                                          n_head_kv *
                                          head_dim),
                         head_dim, n_head_kv, n_past + N),
@@ -3389,11 +3362,11 @@ const char * falcon_token_to_str(const struct falcon_context * ctx, llama_token
 }

 llama_token falcon_token_bos() {
-    return 1;
+    return 11;
 }

 llama_token falcon_token_eos() {
-    return 2;
+    return 11;
 }

 llama_token falcon_token_nl() {
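
After this hunk, falcon_token_bos() and falcon_token_eos() return the same id (11), matching a tokenizer that exposes a single end-of-text marker rather than separate BOS/EOS tokens; that is also why the "first token must be BOS" check was removed earlier in this diff. Code that used BOS and EOS as distinct sentinels now has to tolerate them comparing equal. A minimal sketch, with the id hard-coded from the hunk above and placeholder token ids elsewhere:

#include <cstdio>
#include <vector>

// Hard-coded to match the values returned in the hunk above.
static int falcon_bos_id() { return 11; }
static int falcon_eos_id() { return 11; }

int main() {
    // Placeholder ids standing in for a generated sequence.
    const std::vector<int> generated = { 11, 100, 200, 300, 11 };

    // With bos == eos, "starts with BOS" is no longer a meaningful validity check,
    // but "ends with EOS" still works as a stop condition.
    const bool looks_like_bos  = !generated.empty() && generated.front() == falcon_bos_id();
    const bool hit_end_of_text = !generated.empty() && generated.back()  == falcon_eos_id();

    std::printf("bos==eos: %s, starts with 'BOS': %s, stop: %s\n",
                falcon_bos_id() == falcon_eos_id() ? "yes" : "no",
                looks_like_bos  ? "yes" : "no",
                hit_end_of_text ? "yes" : "no");
    return 0;
}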