diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index c0bd6bee4..9bf951763 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -638,19 +638,20 @@ void PurgeMissingTokens(llama_context * ctx, std::vector<int> &current_context_t
 
         int found = ArrFindIndexOf(current_context_tokens,shared);
         if(found>=0 && found > trimstart)
         {
+            //extract the unwanted tokens out from context and KV
             int diff = found - trimstart;
-            llama_kv_cache_seq_rm(llama_ctx_v4, 0, trimstart + 1, trimstart + diff + 1);
-            llama_kv_cache_seq_shift(llama_ctx_v4, 0, trimstart + diff + 1, -1, -diff);
+            llama_kv_cache_seq_rm(llama_ctx_v4, 0, trimstart, trimstart + diff);
+            llama_kv_cache_seq_shift(llama_ctx_v4, 0, trimstart + diff, -1, -diff);
 
             for (size_t i = trimstart + diff; i < current_context_tokens.size() - 1; i++)
             {
                 current_context_tokens[i - diff] = current_context_tokens[i];
             }
 
-            printf("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart+1);
+            printf("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
 
-            current_context_tokens.resize(current_context_tokens.size() - diff - 1);
+            current_context_tokens.resize(current_context_tokens.size() - diff);
         }
     }
 
@@ -1446,8 +1447,16 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 
     if (embd_inp.size() + params.n_predict > nctx)
     {
+        //get bos token
+        std::vector<int> bos;
+        TokenizeString("", bos, file_format);
         int offset = embd_inp.size() - nctx + params.n_predict;
         embd_inp = std::vector<int>(embd_inp.begin() + offset, embd_inp.end());
+        //replace bos into front if exists
+        if(bos.size()>0 && embd_inp.size()>0)
+        {
+            embd_inp[0] = bos[0];
+        }
     }
 
     //determine how much npast we have to rewind from the current state
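
Note on the first hunk: the old code trimmed the KV cache off by one, removing the half-open range [trimstart + 1, trimstart + diff + 1), shifting from trimstart + diff + 1, and shrinking the token vector by one extra slot. The fix removes exactly the [trimstart, trimstart + diff) span, shifts the surviving tail from trimstart + diff, and resizes by diff alone. Below is a minimal standalone sketch of the shift-and-resize pattern on the vector side; trim_tokens is a hypothetical helper, not part of gpttype_adapter.cpp, and it is simplified in that it shifts the whole tail, whereas the loop in the hunk stops one element early.

    #include <cstdio>
    #include <vector>

    // Erase `diff` tokens starting at `trimstart` by shifting the tail left,
    // mirroring the corrected logic in PurgeMissingTokens.
    static void trim_tokens(std::vector<int> &tokens, int trimstart, int diff)
    {
        // Shift the surviving tail left by `diff`, overwriting the removed span...
        for (size_t i = trimstart + diff; i < tokens.size(); i++)
        {
            tokens[i - diff] = tokens[i];
        }
        // ...then drop exactly `diff` slots, matching the corrected resize.
        tokens.resize(tokens.size() - diff);
    }

    int main()
    {
        std::vector<int> tokens = {1, 10, 20, 30, 40, 50};
        trim_tokens(tokens, /*trimstart=*/1, /*diff=*/2); // erase {10, 20}
        for (int t : tokens) { printf("%d ", t); }        // prints: 1 30 40 50
        return 0;
    }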
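
Note on the second hunk: when the prompt plus the requested generation budget exceeds the context window, the prompt is truncated from the front, which previously also discarded the BOS token. The added code tokenizes an empty string (which yields just the BOS token when the model uses one, per the "get bos token" comment) and writes it back into slot 0 of the truncated prompt. A minimal sketch of the same idea follows, assuming a hypothetical bos_id parameter in place of the TokenizeString("") lookup.

    #include <cstdio>
    #include <vector>

    // Keep only the newest max_len tokens, but preserve a well-formed
    // start-of-sequence by overwriting the first kept slot with BOS.
    static std::vector<int> truncate_keep_bos(const std::vector<int> &inp,
                                              int max_len, int bos_id)
    {
        if ((int)inp.size() <= max_len) { return inp; }
        std::vector<int> out(inp.end() - max_len, inp.end());
        if (!out.empty()) { out[0] = bos_id; }
        return out;
    }

    int main()
    {
        std::vector<int> prompt = {1, 5, 6, 7, 8, 9};                // 1 = BOS
        std::vector<int> kept = truncate_keep_bos(prompt, 4, 1);
        for (int t : kept) { printf("%d ", t); }                     // prints: 1 7 8 9
        return 0;
    }

In the hunk itself the kept length is nctx - params.n_predict rather than a free max_len parameter, and the bos.size()>0 guard covers models whose tokenizer emits no BOS at all.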