diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index f87ac270c..dd15393c3 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -4,7 +4,9 @@ import argparse
 
 import convert
 
-parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
+parser = argparse.ArgumentParser(
+    description="""[DEPRECATED - use `convert.py` instead]
+    Convert a LLaMA model checkpoint to a ggml compatible file""")
 parser.add_argument('dir_model', help='directory containing the model checkpoint')
 parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
 args = parser.parse_args()
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 6131f5b46..57cc1e486 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -202,6 +202,13 @@ int main(int argc, char ** argv) {
         }
     }
 
+    // if we will use the cache for the full prompt without reaching the end of the cache, force
+    // reevaluation of the last token to recalculate the cached logits
+    if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() &&
+            session_tokens.size() > embd_inp.size()) {
+        session_tokens.resize(embd_inp.size() - 1);
+    }
+
     // number of tokens to keep when resetting context
     if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
         params.n_keep = (int)embd_inp.size();
@@ -360,12 +367,6 @@ int main(int argc, char ** argv) {
                     }
                 }
                 if (i > 0) {
-                    // check if we've used up all the prompt but not all cached tokens
-                    if (embd.size() == i && n_session_consumed < (int) session_tokens.size()) {
-                        // force revaluation of the last token to recalculate logits
-                        i--;
-                        n_past--;
-                    }
                     embd.erase(embd.begin(), embd.begin() + i);
                 }
             }