Merge branch 'ggerganov:master' into master
This commit is contained in:
commit
e5bbecaf2d
12 changed files with 161 additions and 20 deletions
10
.github/workflows/build.yml
vendored
10
.github/workflows/build.yml
vendored
|
@ -19,8 +19,8 @@ env:
|
|||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
jobs:
|
||||
ubuntu-latest-make:
|
||||
runs-on: ubuntu-latest
|
||||
ubuntu-focal-make:
|
||||
runs-on: ubuntu-20.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
@ -31,12 +31,12 @@ jobs:
|
|||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
sudo apt-get install build-essential gcc-8
|
||||
|
||||
- name: Build
|
||||
id: make_build
|
||||
run: |
|
||||
make
|
||||
CC=gcc-8 make
|
||||
|
||||
ubuntu-latest-cmake:
|
||||
runs-on: ubuntu-latest
|
||||
|
@ -216,7 +216,7 @@ jobs:
|
|||
runs-on: ubuntu-latest
|
||||
|
||||
needs:
|
||||
- ubuntu-latest-make
|
||||
- ubuntu-focal-make
|
||||
- ubuntu-latest-cmake
|
||||
- macOS-latest-make
|
||||
- macOS-latest-cmake
|
||||
|
|
4
Makefile
4
Makefile
|
@ -109,9 +109,9 @@ ifdef LLAMA_CUBLAS
|
|||
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
|
||||
OBJS += ggml-cuda.o
|
||||
NVCC = nvcc
|
||||
NVCCFLAGS = --forward-unknown-to-host-linker -arch=native
|
||||
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
|
||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@
|
||||
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||
endif
|
||||
ifdef LLAMA_GPROF
|
||||
CFLAGS += -pg
|
||||
|
|
|
@ -241,7 +241,7 @@ Here is an example of a few-shot interaction, invoked with the command
|
|||
./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
|
||||
```
|
||||
|
||||
Note the use of `--color` to distinguish between user input and generated text.
|
||||
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
|
||||
|
||||

|
||||
|
||||
|
|
|
@ -34,4 +34,5 @@ else()
|
|||
add_subdirectory(quantize-stats)
|
||||
add_subdirectory(perplexity)
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(save-load-state)
|
||||
endif()
|
||||
|
|
|
@ -156,10 +156,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|||
params.interactive = true;
|
||||
} else if (arg == "--embedding") {
|
||||
params.embedding = true;
|
||||
} else if (arg == "--interactive-start") {
|
||||
params.interactive = true;
|
||||
} else if (arg == "--interactive-first") {
|
||||
params.interactive_start = true;
|
||||
params.interactive_first = true;
|
||||
} else if (arg == "-ins" || arg == "--instruct") {
|
||||
params.instruct = true;
|
||||
} else if (arg == "--color") {
|
||||
|
|
|
@ -43,7 +43,7 @@ struct gpt_params {
|
|||
bool interactive = false; // interactive mode
|
||||
|
||||
bool embedding = false; // get only sentence embedding
|
||||
bool interactive_start = false; // wait for user input immediately
|
||||
bool interactive_first = false; // wait for user input immediately
|
||||
|
||||
bool instruct = false; // instruction mode (used for Alpaca models)
|
||||
bool ignore_eos = false; // do not stop generating after eos
|
||||
|
|
|
@ -21,12 +21,20 @@ To get started right away, run the following command, making sure to use the cor
|
|||
./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
|
||||
```
|
||||
|
||||
The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
|
||||
|
||||
```bash
|
||||
./main -m models/7B/ggml-model.bin --ignore-eos --n_predict -1 --keep -1 --prompt "Once upon a time"
|
||||
```
|
||||
|
||||
For an interactive experience, try this command:
|
||||
|
||||
```bash
|
||||
./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " --prompt $'User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:'
|
||||
```
|
||||
|
||||
Note that the newline characters in the prompt string above only work on Linux. On Windows, you will have to use the ``--file`` option (see below) to load a multi-line prompt from file instead.
|
||||
|
||||
## Common Options
|
||||
|
||||
In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
|
||||
|
@ -84,6 +92,8 @@ Instruction mode is particularly useful when working with Alpaca models, which a
|
|||
|
||||
- `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
|
||||
|
||||
Technical detail: the user's input is internally prefixed with the reverse prompt (or ``### Instruction:`` as the default), and followed by ``### Response:`` (except if you just press Return without any input, to keep generating a longer response).
|
||||
|
||||
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
|
||||
|
||||
## Context Management
|
||||
|
@ -114,7 +124,7 @@ The following options are related to controlling the text generation process, in
|
|||
|
||||
The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
|
||||
|
||||
It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value.
|
||||
It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the ``--ignore-eos`` parameter.
|
||||
|
||||
### RNG Seed
|
||||
|
||||
|
@ -126,7 +136,7 @@ The RNG seed is used to initialize the random number generator that influences t
|
|||
|
||||
- `--temp N`: Adjust the randomness of the generated text (default: 0.8).
|
||||
|
||||
Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism.
|
||||
Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
|
||||
|
||||
Example usage: `--temp 0.8`
|
||||
|
||||
|
|
|
@ -178,12 +178,12 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||
if (params.instruct) {
|
||||
params.interactive_start = true;
|
||||
params.interactive_first = true;
|
||||
params.antiprompt.push_back("### Instruction:\n\n");
|
||||
}
|
||||
|
||||
// enable interactive mode if reverse prompt or interactive start is specified
|
||||
if (params.antiprompt.size() != 0 || params.interactive_start) {
|
||||
if (params.antiprompt.size() != 0 || params.interactive_first) {
|
||||
params.interactive = true;
|
||||
}
|
||||
|
||||
|
@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
|
|||
#endif
|
||||
" - Press Return to return control to LLaMa.\n"
|
||||
" - If you want to submit another line, end your input in '\\'.\n\n");
|
||||
is_interacting = params.interactive_start;
|
||||
is_interacting = params.interactive_first;
|
||||
}
|
||||
|
||||
bool is_antiprompt = false;
|
||||
|
|
4
examples/save-load-state/CMakeLists.txt
Normal file
4
examples/save-load-state/CMakeLists.txt
Normal file
|
@ -0,0 +1,4 @@
|
|||
set(TARGET save-load-state)
|
||||
add_executable(${TARGET} save-load-state.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
128
examples/save-load-state/save-load-state.cpp
Normal file
128
examples/save-load-state/save-load-state.cpp
Normal file
|
@ -0,0 +1,128 @@
|
|||
#include <vector>
|
||||
#include <cstdio>
|
||||
#include <chrono>
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "llama.cpp"
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
params.model = "models/llama-7B/ggml-model.bin";
|
||||
params.seed = 42;
|
||||
params.n_threads = 4;
|
||||
params.repeat_last_n = 64;
|
||||
params.prompt = "The quick brown fox";
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
lparams.n_parts = params.n_parts;
|
||||
lparams.seed = params.seed;
|
||||
lparams.f16_kv = params.memory_f16;
|
||||
lparams.use_mmap = params.use_mmap;
|
||||
lparams.use_mlock = params.use_mlock;
|
||||
|
||||
auto n_past = 0;
|
||||
auto last_n_tokens_data = vector<llama_token>(params.repeat_last_n, 0);
|
||||
|
||||
// init
|
||||
auto ctx = llama_init_from_file(params.model.c_str(), lparams);
|
||||
auto tokens = vector<llama_token>(params.n_ctx);
|
||||
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
|
||||
|
||||
if (n_prompt_tokens < 1) {
|
||||
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// evaluate prompt
|
||||
|
||||
llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
|
||||
|
||||
last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
|
||||
n_past += n_prompt_tokens;
|
||||
|
||||
// Save state (rng, logits, embedding and kv_cache) to file
|
||||
FILE *fp_write = fopen("dump_state.bin", "wb");
|
||||
auto state_size = llama_get_state_size(ctx);
|
||||
auto state_mem = new uint8_t[state_size];
|
||||
llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
|
||||
fwrite(state_mem, 1, state_size, fp_write);
|
||||
fclose(fp_write);
|
||||
|
||||
// save state (last tokens)
|
||||
auto last_n_tokens_data_saved = vector<llama_token>(last_n_tokens_data);
|
||||
auto n_past_saved = n_past;
|
||||
|
||||
// first run
|
||||
printf("\n%s", params.prompt.c_str());
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto next_token = llama_sample_top_p_top_k(
|
||||
ctx,
|
||||
&last_n_tokens_data.back() - params.repeat_last_n,
|
||||
params.repeat_last_n,
|
||||
40,
|
||||
1.0,
|
||||
1.0,
|
||||
1.1);
|
||||
auto next_token_str = llama_token_to_str(ctx, next_token);
|
||||
last_n_tokens_data.push_back(next_token);
|
||||
printf("%s", next_token_str);
|
||||
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
}
|
||||
printf("\n\n");
|
||||
|
||||
// free old model
|
||||
llama_free(ctx);
|
||||
|
||||
// load new model
|
||||
|
||||
auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
|
||||
|
||||
// Load state (rng, logits, embedding and kv_cache) from file
|
||||
FILE *fp_read = fopen("dump_state.bin", "rb");
|
||||
auto state_size2 = llama_get_state_size(ctx2);
|
||||
if (state_size != state_size2) {
|
||||
fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
|
||||
}
|
||||
fread(state_mem, 1, state_size, fp_read);
|
||||
llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
|
||||
fclose(fp_read);
|
||||
|
||||
// restore state (last tokens)
|
||||
last_n_tokens_data = last_n_tokens_data_saved;
|
||||
n_past = n_past_saved;
|
||||
|
||||
// second run
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto next_token = llama_sample_top_p_top_k(
|
||||
ctx2,
|
||||
&last_n_tokens_data.back() - params.repeat_last_n,
|
||||
params.repeat_last_n,
|
||||
40,
|
||||
1.0,
|
||||
1.0,
|
||||
1.1);
|
||||
auto next_token_str = llama_token_to_str(ctx2, next_token);
|
||||
last_n_tokens_data.push_back(next_token);
|
||||
printf("%s", next_token_str);
|
||||
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
}
|
||||
printf("\n\n");
|
||||
return 0;
|
||||
}
|
2
ggml.c
2
ggml.c
|
@ -436,7 +436,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
|||
static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi)
|
||||
{
|
||||
// Load 8 bytes from memory
|
||||
__m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
|
||||
__m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi );
|
||||
|
||||
// Expand bytes into uint16_t values
|
||||
__m128i bytes = _mm_cvtepu8_epi16( tmp );
|
||||
|
|
|
@ -54,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
|||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 512ull * MB },
|
||||
{ MODEL_65B, 1024ull * MB },
|
||||
};
|
||||
return _MEM_REQ_SCRATCH0;
|
||||
}
|
||||
|
@ -65,7 +65,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
|||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 512ull * MB },
|
||||
{ MODEL_65B, 1024ull * MB },
|
||||
};
|
||||
return _MEM_REQ_SCRATCH1;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue