Merge branch 'ggerganov:master' into master

KASR 2023-04-24 20:32:39 +02:00 committed by GitHub
commit e5bbecaf2d
12 changed files with 161 additions and 20 deletions

.github/workflows/build.yml

@@ -19,8 +19,8 @@ env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
 jobs:
-  ubuntu-latest-make:
-    runs-on: ubuntu-latest
+  ubuntu-focal-make:
+    runs-on: ubuntu-20.04
     steps:
       - name: Clone
@@ -31,12 +31,12 @@ jobs:
         id: depends
         run: |
           sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential gcc-8
       - name: Build
         id: make_build
         run: |
-          make
+          CC=gcc-8 make
   ubuntu-latest-cmake:
     runs-on: ubuntu-latest
@@ -216,7 +216,7 @@ jobs:
     runs-on: ubuntu-latest
     needs:
-      - ubuntu-latest-make
+      - ubuntu-focal-make
       - ubuntu-latest-cmake
       - macOS-latest-make
       - macOS-latest-cmake

Makefile

@@ -109,9 +109,9 @@ ifdef LLAMA_CUBLAS
     LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
     OBJS += ggml-cuda.o
     NVCC = nvcc
-    NVCCFLAGS = --forward-unknown-to-host-linker -arch=native
+    NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-    $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@
+    $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif
 ifdef LLAMA_GPROF
     CFLAGS += -pg

README.md

@@ -241,7 +241,7 @@ Here is an example of a few-shot interaction, invoked with the command
 ./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```
-Note the use of `--color` to distinguish between user input and generated text.
+Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
 ![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)

examples/CMakeLists.txt

@@ -34,4 +34,5 @@ else()
     add_subdirectory(quantize-stats)
     add_subdirectory(perplexity)
     add_subdirectory(embedding)
+    add_subdirectory(save-load-state)
 endif()

examples/common.cpp

@@ -156,10 +156,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.interactive = true;
         } else if (arg == "--embedding") {
             params.embedding = true;
-        } else if (arg == "--interactive-start") {
-            params.interactive = true;
         } else if (arg == "--interactive-first") {
-            params.interactive_start = true;
+            params.interactive_first = true;
         } else if (arg == "-ins" || arg == "--instruct") {
             params.instruct = true;
         } else if (arg == "--color") {

examples/common.h

@@ -43,7 +43,7 @@ struct gpt_params {
     bool interactive = false; // interactive mode
     bool embedding = false; // get only sentence embedding
-    bool interactive_start = false; // wait for user input immediately
+    bool interactive_first = false; // wait for user input immediately
     bool instruct = false; // instruction mode (used for Alpaca models)
     bool ignore_eos = false; // do not stop generating after eos

examples/main/README.md

@@ -21,12 +21,20 @@ To get started right away, run the following command, making sure to use the cor
 ./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
 ```
+The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
+```bash
+./main -m models/7B/ggml-model.bin --ignore-eos --n_predict -1 --keep -1 --prompt "Once upon a time"
+```
 For an interactive experience, try this command:
 ```bash
 ./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " --prompt $'User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:'
 ```
+Note that the newline characters in the prompt string above only work on Linux. On Windows, you will have to use the ``--file`` option (see below) to load a multi-line prompt from file instead.
 ## Common Options
 In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
@@ -84,6 +92,8 @@ Instruction mode is particularly useful when working with Alpaca models, which a
 - `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
+Technical detail: the user's input is internally prefixed with the reverse prompt (or ``### Instruction:`` as the default), and followed by ``### Response:`` (except if you just press Return without any input, to keep generating a longer response).
 By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
 ## Context Management
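As an aside on the "Technical detail" note added above, here is a minimal sketch of the described wrapping; the helper name and the exact prefix/suffix strings are illustrative assumptions, not the actual `main.cpp` implementation.

```cpp
#include <iostream>
#include <string>

// Illustrative only: wrap user input the way the note above describes.
// An empty input (just pressing Return) adds nothing, so the model keeps
// extending its previous response instead of starting a new instruction.
std::string wrap_instruct_input(const std::string & user_input,
                                const std::string & prefix = "### Instruction:\n\n",
                                const std::string & suffix = "\n### Response:\n\n") {
    if (user_input.empty()) {
        return "";
    }
    return prefix + user_input + suffix;
}

int main() {
    std::cout << wrap_instruct_input("Write a haiku about llamas.");
    return 0;
}
```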
@@ -114,7 +124,7 @@ The following options are related to controlling the text generation process, in
 The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
-It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value.
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the ``--ignore-eos`` parameter.
 ### RNG Seed
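A schematic sketch of the stop conditions the `--n_predict` / ``--ignore-eos`` paragraph above describes; the function and variable names are illustrative assumptions, not the actual `main.cpp` logic.

```cpp
#include <string>
#include <vector>

// Illustrative only: generation stops when the token budget is reached,
// when the model emits End-of-Sequence (unless --ignore-eos), or when the
// generated tail ends with a reverse prompt (which pauses in interactive mode).
bool should_stop(int n_generated, int n_predict,
                 bool is_eos, bool ignore_eos,
                 const std::string & tail, const std::vector<std::string> & antiprompts) {
    if (n_predict >= 0 && n_generated >= n_predict) {
        return true; // budget reached; -1 means "no limit"
    }
    if (is_eos && !ignore_eos) {
        return true; // model chose to end the sequence
    }
    for (const auto & ap : antiprompts) {
        if (tail.size() >= ap.size() &&
            tail.compare(tail.size() - ap.size(), ap.size(), ap) == 0) {
            return true; // reverse prompt encountered
        }
    }
    return false;
}
```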
@@ -126,7 +136,7 @@ The RNG seed is used to initialize the random number generator that influences t
 - `--temp N`: Adjust the randomness of the generated text (default: 0.8).
-Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism.
+Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
 Example usage: `--temp 0.8`
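To make the temperature paragraph above concrete, here is a minimal, self-contained sketch of temperature scaling; it is an illustration, not the actual llama.cpp sampler. Logits are divided by the temperature before the softmax, so values below 1.0 sharpen the distribution, values above 1.0 flatten it, and 0 degenerates to a greedy argmax.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Illustrative only: convert raw logits into a sampling distribution at a
// given temperature. Lower temperature -> sharper (more deterministic),
// higher temperature -> flatter (more random); 0 -> always the argmax token.
std::vector<float> softmax_with_temperature(std::vector<float> logits, float temp) {
    std::vector<float> probs(logits.size(), 0.0f);
    if (temp <= 0.0f) {
        probs[std::max_element(logits.begin(), logits.end()) - logits.begin()] = 1.0f;
        return probs;
    }
    const float max_logit = *std::max_element(logits.begin(), logits.end());
    float sum = 0.0f;
    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = std::exp((logits[i] - max_logit) / temp); // subtract max for numerical stability
        sum += probs[i];
    }
    for (float & p : probs) {
        p /= sum;
    }
    return probs;
}
```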

examples/main/main.cpp

@@ -178,12 +178,12 @@ int main(int argc, char ** argv) {
     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
-        params.interactive_start = true;
+        params.interactive_first = true;
         params.antiprompt.push_back("### Instruction:\n\n");
     }
     // enable interactive mode if reverse prompt or interactive start is specified
-    if (params.antiprompt.size() != 0 || params.interactive_start) {
+    if (params.antiprompt.size() != 0 || params.interactive_first) {
         params.interactive = true;
     }
@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
 #endif
                " - Press Return to return control to LLaMa.\n"
                " - If you want to submit another line, end your input in '\\'.\n\n");
-        is_interacting = params.interactive_start;
+        is_interacting = params.interactive_first;
     }
     bool is_antiprompt = false;

examples/save-load-state/CMakeLists.txt

@@ -0,0 +1,4 @@
+set(TARGET save-load-state)
+add_executable(${TARGET} save-load-state.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/save-load-state/save-load-state.cpp

@@ -0,0 +1,128 @@
+#include <vector>
+#include <cstdio>
+#include <chrono>
+
+#include "common.h"
+#include "llama.h"
+#include "llama.cpp"
+
+using namespace std;
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+    params.model = "models/llama-7B/ggml-model.bin";
+    params.seed = 42;
+    params.n_threads = 4;
+    params.repeat_last_n = 64;
+    params.prompt = "The quick brown fox";
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    auto lparams = llama_context_default_params();
+
+    lparams.n_ctx = params.n_ctx;
+    lparams.n_parts = params.n_parts;
+    lparams.seed = params.seed;
+    lparams.f16_kv = params.memory_f16;
+    lparams.use_mmap = params.use_mmap;
+    lparams.use_mlock = params.use_mlock;
+
+    auto n_past = 0;
+    auto last_n_tokens_data = vector<llama_token>(params.repeat_last_n, 0);
+
+    // init
+    auto ctx = llama_init_from_file(params.model.c_str(), lparams);
+    auto tokens = vector<llama_token>(params.n_ctx);
+    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
+
+    if (n_prompt_tokens < 1) {
+        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
+        return 1;
+    }
+
+    // evaluate prompt
+    llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
+
+    last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
+    n_past += n_prompt_tokens;
+
+    // Save state (rng, logits, embedding and kv_cache) to file
+    FILE *fp_write = fopen("dump_state.bin", "wb");
+    auto state_size = llama_get_state_size(ctx);
+    auto state_mem = new uint8_t[state_size];
+    llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
+    fwrite(state_mem, 1, state_size, fp_write);
+    fclose(fp_write);
+
+    // save state (last tokens)
+    auto last_n_tokens_data_saved = vector<llama_token>(last_n_tokens_data);
+    auto n_past_saved = n_past;
+
+    // first run
+    printf("\n%s", params.prompt.c_str());
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto next_token = llama_sample_top_p_top_k(
+            ctx,
+            &last_n_tokens_data.back() - params.repeat_last_n,
+            params.repeat_last_n,
+            40,
+            1.0,
+            1.0,
+            1.1);
+        auto next_token_str = llama_token_to_str(ctx, next_token);
+        last_n_tokens_data.push_back(next_token);
+        printf("%s", next_token_str);
+        if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            return 1;
+        }
+        n_past += 1;
+    }
+    printf("\n\n");
+
+    // free old model
+    llama_free(ctx);
+
+    // load new model
+    auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
+
+    // Load state (rng, logits, embedding and kv_cache) from file
+    FILE *fp_read = fopen("dump_state.bin", "rb");
+    auto state_size2 = llama_get_state_size(ctx2);
+    if (state_size != state_size2) {
+        fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+    }
+    fread(state_mem, 1, state_size, fp_read);
+    llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
+    fclose(fp_read);
+
+    // restore state (last tokens)
+    last_n_tokens_data = last_n_tokens_data_saved;
+    n_past = n_past_saved;
+
+    // second run
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto next_token = llama_sample_top_p_top_k(
+            ctx2,
+            &last_n_tokens_data.back() - params.repeat_last_n,
+            params.repeat_last_n,
+            40,
+            1.0,
+            1.0,
+            1.1);
+        auto next_token_str = llama_token_to_str(ctx2, next_token);
+        last_n_tokens_data.push_back(next_token);
+        printf("%s", next_token_str);
+        if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            return 1;
+        }
+        n_past += 1;
+    }
+    printf("\n\n");
+
+    return 0;
+}
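For context on the new example above, here is a condensed sketch of the same state round-trip using the `llama_get_state_size` / `llama_copy_state_data` / `llama_set_state_data` calls it introduces, with the buffer held in a `std::vector` instead of a raw `new[]` allocation. This is an illustration only and is not part of the commit; the helper names are made up.

```cpp
#include <cstdint>
#include <vector>

#include "llama.h"

// Illustrative only: snapshot a context's full state (rng, logits, embedding
// and kv_cache) into a contiguous buffer...
std::vector<uint8_t> save_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx));
    llama_copy_state_data(ctx, buf.data());
    return buf;
}

// ...and restore it into another context created with the same parameters.
bool load_state(llama_context * ctx, std::vector<uint8_t> & buf) {
    if (buf.size() != llama_get_state_size(ctx)) {
        return false; // state came from a context with different parameters
    }
    llama_set_state_data(ctx, buf.data());
    return true;
}
```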

ggml.c

@@ -436,7 +436,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi)
 {
     // Load 8 bytes from memory
-    __m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
+    __m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi );
     // Expand bytes into uint16_t values
     __m128i bytes = _mm_cvtepu8_epi16( tmp );
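For context on this one-line change: both intrinsics load 8 bytes into the low half of an XMM register, but `_mm_loadl_epi64` is the older SSE2 spelling and is available on older compilers that lack `_mm_loadu_si64`, which appears to be the motivation for pairing this change with the gcc-8 CI job above. A minimal stand-alone illustration (not taken from ggml.c):

```cpp
#include <cstdint>
#include <cstring>
#include <emmintrin.h> // SSE2

// Illustrative only: an unaligned 8-byte load into the low 64 bits of an XMM
// register using the SSE2-era intrinsic, equivalent in effect to the
// _mm_loadu_si64 call it replaces above.
static inline __m128i load_8_bytes(const uint8_t * p) {
    return _mm_loadl_epi64(reinterpret_cast<const __m128i *>(p));
}

int main() {
    uint8_t data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    __m128i v = load_8_bytes(data);
    uint8_t out[16] = {0};
    _mm_storeu_si128(reinterpret_cast<__m128i *>(out), v); // low 8 bytes now hold 'data'
    return std::memcmp(out, data, 8) == 0 ? 0 : 1;
}
```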

llama.cpp

@@ -54,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
-        { MODEL_65B, 512ull * MB },
+        { MODEL_65B, 1024ull * MB },
     };
     return _MEM_REQ_SCRATCH0;
 }
@@ -65,7 +65,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
-        { MODEL_65B, 512ull * MB },
+        { MODEL_65B, 1024ull * MB },
     };
     return _MEM_REQ_SCRATCH1;
 }