diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7c40b0c12..179080576 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -19,8 +19,8 @@ env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

 jobs:
-  ubuntu-latest-make:
-    runs-on: ubuntu-latest
+  ubuntu-focal-make:
+    runs-on: ubuntu-20.04

     steps:
       - name: Clone
@@ -31,12 +31,12 @@ jobs:
         id: depends
         run: |
           sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential gcc-8

       - name: Build
         id: make_build
         run: |
-          make
+          CC=gcc-8 make

   ubuntu-latest-cmake:
     runs-on: ubuntu-latest
@@ -216,7 +216,7 @@ jobs:
     runs-on: ubuntu-latest

     needs:
-      - ubuntu-latest-make
+      - ubuntu-focal-make
       - ubuntu-latest-cmake
       - macOS-latest-make
       - macOS-latest-cmake
diff --git a/Makefile b/Makefile
index 0c7b6548d..8fbb19c46 100644
--- a/Makefile
+++ b/Makefile
@@ -109,9 +109,9 @@ ifdef LLAMA_CUBLAS
 	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
 	OBJS      += ggml-cuda.o
 	NVCC      = nvcc
-	NVCCFLAGS = --forward-unknown-to-host-linker -arch=native
+	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif
 ifdef LLAMA_GPROF
 	CFLAGS   += -pg
diff --git a/README.md b/README.md
index 7bf2cc1ba..44cf72124 100644
--- a/README.md
+++ b/README.md
@@ -241,7 +241,7 @@ Here is an example of a few-shot interaction, invoked with the command
 ./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```

-Note the use of `--color` to distinguish between user input and generated text.
+Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.

 ![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
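For reference, the new CI job above can be reproduced locally on Ubuntu 20.04 with the same two steps, and the `NVCCFLAGS` change in the Makefile only comes into play when the cuBLAS path is enabled. A minimal sketch, assuming the CUDA toolkit is already installed for the optional step and that `LLAMA_CUBLAS=1` is the switch that activates the Makefile's `ifdef LLAMA_CUBLAS` block:

```bash
# CPU build, mirroring the ubuntu-focal-make job above
sudo apt-get update
sudo apt-get install build-essential gcc-8
CC=gcc-8 make

# Optional: cuBLAS build, which is what exercises the modified nvcc rule
make LLAMA_CUBLAS=1
```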
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 67a7cea54..be35363f5 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -34,4 +34,5 @@ else()
     add_subdirectory(quantize-stats)
     add_subdirectory(perplexity)
     add_subdirectory(embedding)
+    add_subdirectory(save-load-state)
 endif()
diff --git a/examples/common.cpp b/examples/common.cpp
index a0b6f10ad..c0e87eb9f 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -156,10 +156,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.interactive = true;
         } else if (arg == "--embedding") {
             params.embedding = true;
-        } else if (arg == "--interactive-start") {
-            params.interactive = true;
         } else if (arg == "--interactive-first") {
-            params.interactive_start = true;
+            params.interactive_first = true;
         } else if (arg == "-ins" || arg == "--instruct") {
             params.instruct = true;
         } else if (arg == "--color") {
diff --git a/examples/common.h b/examples/common.h
index 0470368d5..6f26b514d 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -43,7 +43,7 @@ struct gpt_params {
     bool interactive       = false; // interactive mode

     bool embedding         = false; // get only sentence embedding
-    bool interactive_start = false; // wait for user input immediately
+    bool interactive_first = false; // wait for user input immediately

     bool instruct          = false; // instruction mode (used for Alpaca models)
     bool ignore_eos        = false; // do not stop generating after eos
diff --git a/examples/main/README.md b/examples/main/README.md
index 5cbc5033b..234bf2eb5 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -21,12 +21,20 @@ To get started right away, run the following command, making sure to use the cor
 ./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
 ```

+The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
+
+```bash
+./main -m models/7B/ggml-model.bin --ignore-eos --n_predict -1 --keep -1 --prompt "Once upon a time"
+```
+
 For an interactive experience, try this command:

 ```bash
 ./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " --prompt $'User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:'
 ```

+Note that the newline characters in the prompt string above only work on Linux. On Windows, you will have to use the ``--file`` option (see below) to load a multi-line prompt from file instead.
+
 ## Common Options

 In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
@@ -84,6 +92,8 @@ Instruction mode is particularly useful when working with Alpaca models, which a

 - `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.

+Technical detail: the user's input is internally prefixed with the reverse prompt (or ``### Instruction:`` as the default), and followed by ``### Response:`` (except if you just press Return without any input, to keep generating a longer response).
+
 By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.

 ## Context Management
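To make the instruct-mode detail above concrete, here is a rough sketch of the wrapping; the invocation is illustrative, the model path is a placeholder, and the exact whitespace of the template is approximate:

```bash
# Instruct mode (-ins / --instruct); replace the model path with your own.
./main -m ./models/7B/ggml-model.bin -ins

# Each submitted input is framed for the model roughly as:
#
#   ### Instruction:
#
#   <your input>
#
#   ### Response:
```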
@@ -114,7 +124,7 @@ The following options are related to controlling the text generation process, in

 The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.

-It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value.
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the ``--ignore-eos`` parameter.

 ### RNG Seed

@@ -126,7 +136,7 @@ The RNG seed is used to initialize the random number generator that influences t

 - `--temp N`: Adjust the randomness of the generated text (default: 0.8).

-Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism.
+Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.

 Example usage: `--temp 0.8`
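As a quick illustration of the temperature-0 behaviour described above (greedy decoding, so repeated runs give the same continuation), something like the following can be used; the model path and prompt are placeholders:

```bash
# --temp 0 always picks the most likely token, so two runs of this
# command should print identical text (model path and prompt are examples)
./main -m models/7B/ggml-model.bin --temp 0 -n 64 --prompt "Once upon a time"
```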
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index decf41a9f..f9c9e9d98 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -178,12 +178,12 @@ int main(int argc, char ** argv) {

     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
-        params.interactive_start = true;
+        params.interactive_first = true;
         params.antiprompt.push_back("### Instruction:\n\n");
     }

     // enable interactive mode if reverse prompt or interactive start is specified
-    if (params.antiprompt.size() != 0 || params.interactive_start) {
+    if (params.antiprompt.size() != 0 || params.interactive_first) {
         params.interactive = true;
     }

@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
 #endif
                " - Press Return to return control to LLaMa.\n"
                " - If you want to submit another line, end your input in '\\'.\n\n");
-        is_interacting = params.interactive_start;
+        is_interacting = params.interactive_first;
     }

     bool is_antiprompt = false;
diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt
new file mode 100644
index 000000000..cff79fa1f
--- /dev/null
+++ b/examples/save-load-state/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET save-load-state)
+add_executable(${TARGET} save-load-state.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
new file mode 100644
index 000000000..39aa7f82c
--- /dev/null
+++ b/examples/save-load-state/save-load-state.cpp
@@ -0,0 +1,128 @@
+#include <vector>
+#include <cstdio>
+#include <cstdint>
+
+#include "common.h"
+#include "llama.h"
+#include "llama.cpp"
+
+using namespace std;
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+    params.model = "models/llama-7B/ggml-model.bin";
+    params.seed = 42;
+    params.n_threads = 4;
+    params.repeat_last_n = 64;
+    params.prompt = "The quick brown fox";
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    auto lparams = llama_context_default_params();
+
+    lparams.n_ctx     = params.n_ctx;
+    lparams.n_parts   = params.n_parts;
+    lparams.seed      = params.seed;
+    lparams.f16_kv    = params.memory_f16;
+    lparams.use_mmap  = params.use_mmap;
+    lparams.use_mlock = params.use_mlock;
+
+    auto n_past = 0;
+    auto last_n_tokens_data = vector<llama_token>(params.repeat_last_n, 0);
+
+    // init
+    auto ctx = llama_init_from_file(params.model.c_str(), lparams);
+    auto tokens = vector<llama_token>(params.n_ctx);
+    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
+
+    if (n_prompt_tokens < 1) {
+        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
+        return 1;
+    }
+
+    // evaluate prompt
+
+    llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
+
+    last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
+    n_past += n_prompt_tokens;
+
+    // Save state (rng, logits, embedding and kv_cache) to file
+    FILE *fp_write = fopen("dump_state.bin", "wb");
+    auto state_size = llama_get_state_size(ctx);
+    auto state_mem = new uint8_t[state_size];
+    llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
+    fwrite(state_mem, 1, state_size, fp_write);
+    fclose(fp_write);
+
+    // save state (last tokens)
+    auto last_n_tokens_data_saved = vector<llama_token>(last_n_tokens_data);
+    auto n_past_saved = n_past;
+
+    // first run
+    printf("\n%s", params.prompt.c_str());
+
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto next_token = llama_sample_top_p_top_k(
+            ctx,
+            &last_n_tokens_data.back() - params.repeat_last_n,
+            params.repeat_last_n,
+            40,
+            1.0,
+            1.0,
+            1.1);
+        auto next_token_str = llama_token_to_str(ctx, next_token);
+        last_n_tokens_data.push_back(next_token);
+        printf("%s", next_token_str);
+        if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            return 1;
+        }
+        n_past += 1;
+    }
+    printf("\n\n");
+
+    // free old model
+    llama_free(ctx);
+
+    // load new model
+
+    auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
+
+    // Load state (rng, logits, embedding and kv_cache) from file
+    FILE *fp_read = fopen("dump_state.bin", "rb");
+    auto state_size2 = llama_get_state_size(ctx2);
+    if (state_size != state_size2) {
+        fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+    }
+    fread(state_mem, 1, state_size, fp_read);
+    llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
+    fclose(fp_read);
+
+    // restore state (last tokens)
+    last_n_tokens_data = last_n_tokens_data_saved;
+    n_past = n_past_saved;
+
+    // second run
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto next_token = llama_sample_top_p_top_k(
+            ctx2,
+            &last_n_tokens_data.back() - params.repeat_last_n,
+            params.repeat_last_n,
+            40,
+            1.0,
+            1.0,
+            1.1);
+        auto next_token_str = llama_token_to_str(ctx2, next_token);
+        last_n_tokens_data.push_back(next_token);
+        printf("%s", next_token_str);
+        if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            return 1;
+        }
+        n_past += 1;
+    }
+    printf("\n\n");
+    return 0;
+}
diff --git a/ggml.c b/ggml.c
index f8f73af3e..6e46c0e5a 100644
--- a/ggml.c
+++ b/ggml.c
@@ -436,7 +436,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi)
 {
     // Load 8 bytes from memory
-    __m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
+    __m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi );

     // Expand bytes into uint16_t values
     __m128i bytes = _mm_cvtepu8_epi16( tmp );
diff --git a/llama.cpp b/llama.cpp
index bc0ef1281..28d27916a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -54,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
         { MODEL_30B,   512ull * MB },
-        { MODEL_65B,   512ull * MB },
+        { MODEL_65B,  1024ull * MB },
     };
     return _MEM_REQ_SCRATCH0;
 }
@@ -65,7 +65,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
         { MODEL_30B,   512ull * MB },
-        { MODEL_65B,   512ull * MB },
+        { MODEL_65B,  1024ull * MB },
     };
     return _MEM_REQ_SCRATCH1;
 }
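For completeness, since the new example is only wired into the CMake build via examples/CMakeLists.txt above, one way to try it is sketched below. The binary location depends on your CMake setup (it may land in build/bin or in the example's subdirectory), the model path is a placeholder, and -m / -n are handled by gpt_params_parse, which the example calls:

```bash
# Configure and build; the save-load-state target comes from the new CMakeLists.txt above
mkdir -p build && cd build
cmake ..
cmake --build . --config Release

# Run it: the program prints a continuation of the prompt twice, once before saving
# the state and once after restoring it into a fresh context; the two should match.
./bin/save-load-state -m ../models/llama-7B/ggml-model.bin -n 32
```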