Merge branch 'master' into concedo
# Conflicts: # .github/workflows/build.yml # README.md
This commit is contained in:
commit
235daf4016
11 changed files with 821 additions and 656 deletions
4
Makefile
4
Makefile
|
@ -128,9 +128,9 @@ ifdef LLAMA_CUBLAS
|
||||||
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
|
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
|
||||||
OBJS += ggml-cuda.o
|
OBJS += ggml-cuda.o
|
||||||
NVCC = nvcc
|
NVCC = nvcc
|
||||||
NVCCFLAGS = --forward-unknown-to-host-linker -arch=native
|
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
|
||||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||||
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@
|
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||||
endif
|
endif
|
||||||
ifdef LLAMA_GPROF
|
ifdef LLAMA_GPROF
|
||||||
CFLAGS += -pg
|
CFLAGS += -pg
|
||||||
|
|
|
@ -34,4 +34,5 @@ else()
|
||||||
add_subdirectory(quantize-stats)
|
add_subdirectory(quantize-stats)
|
||||||
add_subdirectory(perplexity)
|
add_subdirectory(perplexity)
|
||||||
add_subdirectory(embedding)
|
add_subdirectory(embedding)
|
||||||
|
add_subdirectory(save-load-state)
|
||||||
endif()
|
endif()
|
||||||
|
|
|
@ -156,10 +156,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
params.interactive = true;
|
params.interactive = true;
|
||||||
} else if (arg == "--embedding") {
|
} else if (arg == "--embedding") {
|
||||||
params.embedding = true;
|
params.embedding = true;
|
||||||
} else if (arg == "--interactive-start") {
|
|
||||||
params.interactive = true;
|
|
||||||
} else if (arg == "--interactive-first") {
|
} else if (arg == "--interactive-first") {
|
||||||
params.interactive_start = true;
|
params.interactive_first = true;
|
||||||
} else if (arg == "-ins" || arg == "--instruct") {
|
} else if (arg == "-ins" || arg == "--instruct") {
|
||||||
params.instruct = true;
|
params.instruct = true;
|
||||||
} else if (arg == "--color") {
|
} else if (arg == "--color") {
|
||||||
|
|
|
@ -43,7 +43,7 @@ struct gpt_params {
|
||||||
bool interactive = false; // interactive mode
|
bool interactive = false; // interactive mode
|
||||||
|
|
||||||
bool embedding = false; // get only sentence embedding
|
bool embedding = false; // get only sentence embedding
|
||||||
bool interactive_start = false; // wait for user input immediately
|
bool interactive_first = false; // wait for user input immediately
|
||||||
|
|
||||||
bool instruct = false; // instruction mode (used for Alpaca models)
|
bool instruct = false; // instruction mode (used for Alpaca models)
|
||||||
bool ignore_eos = false; // do not stop generating after eos
|
bool ignore_eos = false; // do not stop generating after eos
|
||||||
|
|
|
@ -21,12 +21,20 @@ To get started right away, run the following command, making sure to use the cor
|
||||||
./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
|
./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./main -m models/7B/ggml-model.bin --ignore-eos --n_predict -1 --keep -1 --prompt "Once upon a time"
|
||||||
|
```
|
||||||
|
|
||||||
For an interactive experience, try this command:
|
For an interactive experience, try this command:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " --prompt $'User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:'
|
./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " --prompt $'User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
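Note that the newline characters in the prompt string above rely on the shell's `$'...'` quoting, which works in bash-like shells (e.g. on Linux) but not in the Windows command prompt. On Windows, you will have to use the ``--file`` option (see below) to load a multi-line prompt from a file instead.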
|
||||||
|
|
||||||
## Common Options
|
## Common Options
|
||||||
|
|
||||||
In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
|
In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
|
||||||
|
@ -84,6 +92,8 @@ Instruction mode is particularly useful when working with Alpaca models, which a
|
||||||
|
|
||||||
- `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
|
- `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
|
||||||
|
|
||||||
|
Technical detail: the user's input is internally prefixed with the reverse prompt (or ``### Instruction:`` as the default), and followed by ``### Response:`` (except if you just press Return without any input, to keep generating a longer response).
|
||||||
|
|
||||||
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
|
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
|
||||||
|
|
||||||
## Context Management
|
## Context Management
|
||||||
|
@ -114,7 +124,7 @@ The following options are related to controlling the text generation process, in
|
||||||
|
|
||||||
The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
|
The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
|
||||||
|
|
||||||
It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value.
|
It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the ``--ignore-eos`` parameter.
|
||||||
|
|
||||||
### RNG Seed
|
### RNG Seed
|
||||||
|
|
||||||
|
@ -126,7 +136,7 @@ The RNG seed is used to initialize the random number generator that influences t
|
||||||
|
|
||||||
- `--temp N`: Adjust the randomness of the generated text (default: 0.8).
|
- `--temp N`: Adjust the randomness of the generated text (default: 0.8).
|
||||||
|
|
||||||
Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism.
|
Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
|
||||||
|
|
||||||
Example usage: `--temp 0.8`
|
Example usage: `--temp 0.8`
|
||||||
|
|
||||||
|
|
|
@ -178,12 +178,12 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||||
if (params.instruct) {
|
if (params.instruct) {
|
||||||
params.interactive_start = true;
|
params.interactive_first = true;
|
||||||
params.antiprompt.push_back("### Instruction:\n\n");
|
params.antiprompt.push_back("### Instruction:\n\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
// enable interactive mode if reverse prompt or interactive start is specified
|
// enable interactive mode if reverse prompt or interactive start is specified
|
||||||
if (params.antiprompt.size() != 0 || params.interactive_start) {
|
if (params.antiprompt.size() != 0 || params.interactive_first) {
|
||||||
params.interactive = true;
|
params.interactive = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
|
||||||
#endif
|
#endif
|
||||||
" - Press Return to return control to LLaMa.\n"
|
" - Press Return to return control to LLaMa.\n"
|
||||||
" - If you want to submit another line, end your input in '\\'.\n\n");
|
" - If you want to submit another line, end your input in '\\'.\n\n");
|
||||||
is_interacting = params.interactive_start;
|
is_interacting = params.interactive_first;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_antiprompt = false;
|
bool is_antiprompt = false;
|
||||||
|
|
4
examples/save-load-state/CMakeLists.txt
Normal file
4
examples/save-load-state/CMakeLists.txt
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
# Build script for the save-load-state example.
# TARGET holds the executable name reused by the commands below.
set(TARGET save-load-state)
|
||||||
|
# The example is built from a single translation unit.
add_executable(${TARGET} save-load-state.cpp)
|
||||||
|
# Link privately against the `common` and `llama` targets and whatever
# ${CMAKE_THREAD_LIBS_INIT} expands to (presumably set by a find_package(Threads)
# call elsewhere in the project -- confirm in the top-level CMakeLists).
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
# Require at least C++11 when compiling this target.
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
128
examples/save-load-state/save-load-state.cpp
Normal file
128
examples/save-load-state/save-load-state.cpp
Normal file
|
@ -0,0 +1,128 @@
|
||||||
|
#include <vector>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <chrono>
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "llama.cpp"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
gpt_params params;
|
||||||
|
params.model = "models/llama-7B/ggml-model.bin";
|
||||||
|
params.seed = 42;
|
||||||
|
params.n_threads = 4;
|
||||||
|
params.repeat_last_n = 64;
|
||||||
|
params.prompt = "The quick brown fox";
|
||||||
|
|
||||||
|
if (gpt_params_parse(argc, argv, params) == false) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto lparams = llama_context_default_params();
|
||||||
|
|
||||||
|
lparams.n_ctx = params.n_ctx;
|
||||||
|
lparams.n_parts = params.n_parts;
|
||||||
|
lparams.seed = params.seed;
|
||||||
|
lparams.f16_kv = params.memory_f16;
|
||||||
|
lparams.use_mmap = params.use_mmap;
|
||||||
|
lparams.use_mlock = params.use_mlock;
|
||||||
|
|
||||||
|
auto n_past = 0;
|
||||||
|
auto last_n_tokens_data = vector<llama_token>(params.repeat_last_n, 0);
|
||||||
|
|
||||||
|
// init
|
||||||
|
auto ctx = llama_init_from_file(params.model.c_str(), lparams);
|
||||||
|
auto tokens = vector<llama_token>(params.n_ctx);
|
||||||
|
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
|
||||||
|
|
||||||
|
if (n_prompt_tokens < 1) {
|
||||||
|
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// evaluate prompt
|
||||||
|
|
||||||
|
llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
|
||||||
|
|
||||||
|
last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
|
||||||
|
n_past += n_prompt_tokens;
|
||||||
|
|
||||||
|
// Save state (rng, logits, embedding and kv_cache) to file
|
||||||
|
FILE *fp_write = fopen("dump_state.bin", "wb");
|
||||||
|
auto state_size = llama_get_state_size(ctx);
|
||||||
|
auto state_mem = new uint8_t[state_size];
|
||||||
|
llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
|
||||||
|
fwrite(state_mem, 1, state_size, fp_write);
|
||||||
|
fclose(fp_write);
|
||||||
|
|
||||||
|
// save state (last tokens)
|
||||||
|
auto last_n_tokens_data_saved = vector<llama_token>(last_n_tokens_data);
|
||||||
|
auto n_past_saved = n_past;
|
||||||
|
|
||||||
|
// first run
|
||||||
|
printf("\n%s", params.prompt.c_str());
|
||||||
|
for (auto i = 0; i < params.n_predict; i++) {
|
||||||
|
auto next_token = llama_sample_top_p_top_k(
|
||||||
|
ctx,
|
||||||
|
&last_n_tokens_data.back() - params.repeat_last_n,
|
||||||
|
params.repeat_last_n,
|
||||||
|
40,
|
||||||
|
1.0,
|
||||||
|
1.0,
|
||||||
|
1.1);
|
||||||
|
auto next_token_str = llama_token_to_str(ctx, next_token);
|
||||||
|
last_n_tokens_data.push_back(next_token);
|
||||||
|
printf("%s", next_token_str);
|
||||||
|
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
|
||||||
|
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
n_past += 1;
|
||||||
|
}
|
||||||
|
printf("\n\n");
|
||||||
|
|
||||||
|
// free old model
|
||||||
|
llama_free(ctx);
|
||||||
|
|
||||||
|
// load new model
|
||||||
|
|
||||||
|
auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
|
||||||
|
|
||||||
|
// Load state (rng, logits, embedding and kv_cache) from file
|
||||||
|
FILE *fp_read = fopen("dump_state.bin", "rb");
|
||||||
|
auto state_size2 = llama_get_state_size(ctx2);
|
||||||
|
if (state_size != state_size2) {
|
||||||
|
fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
|
||||||
|
}
|
||||||
|
fread(state_mem, 1, state_size, fp_read);
|
||||||
|
llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
|
||||||
|
fclose(fp_read);
|
||||||
|
|
||||||
|
// restore state (last tokens)
|
||||||
|
last_n_tokens_data = last_n_tokens_data_saved;
|
||||||
|
n_past = n_past_saved;
|
||||||
|
|
||||||
|
// second run
|
||||||
|
for (auto i = 0; i < params.n_predict; i++) {
|
||||||
|
auto next_token = llama_sample_top_p_top_k(
|
||||||
|
ctx2,
|
||||||
|
&last_n_tokens_data.back() - params.repeat_last_n,
|
||||||
|
params.repeat_last_n,
|
||||||
|
40,
|
||||||
|
1.0,
|
||||||
|
1.0,
|
||||||
|
1.1);
|
||||||
|
auto next_token_str = llama_token_to_str(ctx2, next_token);
|
||||||
|
last_n_tokens_data.push_back(next_token);
|
||||||
|
printf("%s", next_token_str);
|
||||||
|
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
|
||||||
|
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
n_past += 1;
|
||||||
|
}
|
||||||
|
printf("\n\n");
|
||||||
|
return 0;
|
||||||
|
}
|
9
ggml.c
9
ggml.c
|
@ -438,7 +438,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
||||||
static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi)
|
static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi)
|
||||||
{
|
{
|
||||||
// Load 8 bytes from memory
|
// Load 8 bytes from memory
|
||||||
__m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
|
__m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi );
|
||||||
|
|
||||||
// Expand bytes into uint16_t values
|
// Expand bytes into uint16_t values
|
||||||
__m128i bytes = _mm_cvtepu8_epi16( tmp );
|
__m128i bytes = _mm_cvtepu8_epi16( tmp );
|
||||||
|
@ -6781,15 +6781,20 @@ static void ggml_compute_forward_sum_f32(
|
||||||
const size_t nb02 = src0->nb[2];
|
const size_t nb02 = src0->nb[2];
|
||||||
const size_t nb03 = src0->nb[3];
|
const size_t nb03 = src0->nb[3];
|
||||||
|
|
||||||
|
ggml_float sum = 0;
|
||||||
|
float row_sum = 0;
|
||||||
|
|
||||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||||
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
||||||
ggml_vec_sum_f32(ne00,
|
ggml_vec_sum_f32(ne00,
|
||||||
(float *) (dst->data),
|
&row_sum,
|
||||||
(float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
|
(float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
|
||||||
|
sum += row_sum;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
((float *) dst->data)[0] = sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_compute_forward_sum(
|
static void ggml_compute_forward_sum(
|
||||||
|
|
|
@ -54,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
||||||
{ MODEL_7B, 512ull * MB },
|
{ MODEL_7B, 512ull * MB },
|
||||||
{ MODEL_13B, 512ull * MB },
|
{ MODEL_13B, 512ull * MB },
|
||||||
{ MODEL_30B, 512ull * MB },
|
{ MODEL_30B, 512ull * MB },
|
||||||
{ MODEL_65B, 512ull * MB },
|
{ MODEL_65B, 1024ull * MB },
|
||||||
};
|
};
|
||||||
return _MEM_REQ_SCRATCH0;
|
return _MEM_REQ_SCRATCH0;
|
||||||
}
|
}
|
||||||
|
@ -65,7 +65,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
||||||
{ MODEL_7B, 512ull * MB },
|
{ MODEL_7B, 512ull * MB },
|
||||||
{ MODEL_13B, 512ull * MB },
|
{ MODEL_13B, 512ull * MB },
|
||||||
{ MODEL_30B, 512ull * MB },
|
{ MODEL_30B, 512ull * MB },
|
||||||
{ MODEL_65B, 512ull * MB },
|
{ MODEL_65B, 1024ull * MB },
|
||||||
};
|
};
|
||||||
return _MEM_REQ_SCRATCH1;
|
return _MEM_REQ_SCRATCH1;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue