merge
This commit is contained in:
commit
84ab887349
12 changed files with 180 additions and 125 deletions
10
.github/ISSUE_TEMPLATE/custom.md
vendored
10
.github/ISSUE_TEMPLATE/custom.md
vendored
|
@ -1,7 +1,7 @@
|
|||
---
|
||||
name: Custom issue template
|
||||
about: Used to report user-related issues with the software
|
||||
title: "[User] I encountered a problem .."
|
||||
name: Issue and enhancement template
|
||||
about: Used to report issues and request enhancements for llama.cpp
|
||||
title: "[User] Insert summary of your issue or enhancement.."
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
|
@ -18,11 +18,11 @@ Please answer the following questions for yourself before submitting an issue.
|
|||
|
||||
# Expected Behavior
|
||||
|
||||
Please provide a detailed written description of what you were trying to do, and what you expected `lamma.cpp` to do.
|
||||
Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do.
|
||||
|
||||
# Current Behavior
|
||||
|
||||
Please provide a detailed written description of what `lamma.cpp` did, instead.
|
||||
Please provide a detailed written description of what `llama.cpp` did, instead.
|
||||
|
||||
# Environment and Context
|
||||
|
||||
|
|
2
.github/workflows/build.yml
vendored
2
.github/workflows/build.yml
vendored
|
@ -89,7 +89,7 @@ jobs:
|
|||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
cmake -DLLAMA_AVX2=OFF ..
|
||||
cmake --build . --config Release
|
||||
ctest --output-on-failure
|
||||
|
||||
|
|
|
@ -217,6 +217,7 @@ add_library(utils OBJECT
|
|||
|
||||
target_include_directories(utils PUBLIC .)
|
||||
target_compile_features(utils PUBLIC cxx_std_11) # don't bump
|
||||
target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS})
|
||||
|
||||
add_library(ggml OBJECT
|
||||
ggml.c
|
||||
|
@ -226,12 +227,13 @@ target_include_directories(ggml PUBLIC .)
|
|||
target_compile_features(ggml PUBLIC c_std_11) # don't bump
|
||||
target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})
|
||||
|
||||
add_library(llama OBJECT
|
||||
add_library(llama
|
||||
llama.cpp
|
||||
llama.h)
|
||||
|
||||
target_include_directories(llama PUBLIC .)
|
||||
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
|
||||
target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS})
|
||||
|
||||
#
|
||||
# Executables
|
||||
|
|
35
README.md
35
README.md
|
@ -240,6 +240,40 @@ or
|
|||
|
||||
`shasum -a 256 --ignore-missing -c SHA256SUMS` on macOS
|
||||
|
||||
### Perplexity (Measuring model quality)
|
||||
|
||||
You can pass `--perplexity` as a command line option to measure perplexity over the given prompt. For more background,
|
||||
see https://huggingface.co/docs/transformers/perplexity. However, in general, lower perplexity is better for LLMs.
|
||||
|
||||
#### Measurements
|
||||
|
||||
https://github.com/ggerganov/llama.cpp/pull/270 is the unofficial tracking page for now. llama.cpp is measuring very well
|
||||
compared to the baseline implementations. Quantization has a small negative impact to quality, but, as you can see, running
|
||||
13B at q4_0 beats the 7B f16 model by a significant amount.
|
||||
|
||||
All measurements are done against wikitext2 test dataset (https://paperswithcode.com/dataset/wikitext-2), with default options (512 length context).
|
||||
Note that the changing the context length will have a significant impact on perplexity (longer context = better perplexity).
|
||||
```
|
||||
Perplexity - model options
|
||||
5.5985 - 13B, q4_0
|
||||
5.9565 - 7B, f16
|
||||
6.3001 - 7B, q4_1
|
||||
6.5949 - 7B, q4_0
|
||||
6.5995 - 7B, q4_0, --memory_f16
|
||||
```
|
||||
|
||||
#### How to run
|
||||
|
||||
1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
2. Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
3. Output:
|
||||
```
|
||||
Calculating perplexity over 655 chunks
|
||||
24.43 seconds per pass - ETA 4.45 hours
|
||||
[1]4.5970,[2]5.1807,[3]6.0382,...
|
||||
```
|
||||
And after 4.45 hours, you will have the final perplexity.
|
||||
|
||||
### Android
|
||||
|
||||
You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux).
|
||||
|
@ -290,7 +324,6 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models
|
|||
|
||||
## Limitations
|
||||
|
||||
- We don't know yet how much the quantization affects the quality of the generated text
|
||||
- Probably the token sampling can be improved
|
||||
- The Accelerate framework is actually currently unused since I found that for tensor shapes typical for the Decoder,
|
||||
there is no benefit compared to the ARM_NEON intrinsics implementation. Of course, it's possible that I simply don't
|
||||
|
|
158
ggml.c
158
ggml.c
|
@ -1,3 +1,6 @@
|
|||
// Defines CLOCK_MONOTONIC on Linux
|
||||
#define _POSIX_C_SOURCE 199309L
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
|
@ -400,9 +403,55 @@ static inline __m128i packNibbles( __m256i bytes )
|
|||
// method 5
|
||||
// blocks of QK elements
|
||||
// represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors)
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
static void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) {
|
||||
assert(k % QK == 0);
|
||||
const int nb = k / QK;
|
||||
|
||||
const size_t bs = sizeof(float) + QK/2;
|
||||
|
||||
uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
|
||||
uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float));
|
||||
|
||||
uint8_t pp[QK/2];
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int l = 0; l < QK; l++) {
|
||||
const float v = x[i*QK + l];
|
||||
amax = MAX(amax, fabsf(v));
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 3) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
*(float *)pd = d;
|
||||
pd += bs;
|
||||
|
||||
for (int l = 0; l < QK; l += 2) {
|
||||
const float v0 = x[i*QK + l + 0]*id;
|
||||
const float v1 = x[i*QK + l + 1]*id;
|
||||
|
||||
const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
|
||||
const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
|
||||
|
||||
assert(vi0 >= 0 && vi0 < 16);
|
||||
assert(vi1 >= 0 && vi1 < 16);
|
||||
|
||||
pp[l/2] = vi0 | (vi1 << 4);
|
||||
}
|
||||
|
||||
memcpy(pb, pp, sizeof(pp));
|
||||
pb += bs;
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
||||
assert(k % QK == 0);
|
||||
|
||||
#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__)
|
||||
const int nb = k / QK;
|
||||
const size_t bs = sizeof(float) + QK/2;
|
||||
|
||||
|
@ -410,6 +459,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
|||
uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float));
|
||||
|
||||
uint8_t pp[QK/2];
|
||||
#endif
|
||||
|
||||
#if __ARM_NEON
|
||||
#if QK == 32
|
||||
|
@ -566,36 +616,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
|||
#endif
|
||||
#else
|
||||
// scalar
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int l = 0; l < QK; l++) {
|
||||
const float v = x[i*QK + l];
|
||||
amax = MAX(amax, fabsf(v));
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 3) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
*(float *)pd = d;
|
||||
pd += bs;
|
||||
|
||||
for (int l = 0; l < QK; l += 2) {
|
||||
const float v0 = x[i*QK + l + 0]*id;
|
||||
const float v1 = x[i*QK + l + 1]*id;
|
||||
|
||||
const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
|
||||
const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
|
||||
|
||||
assert(vi0 >= 0 && vi0 < 16);
|
||||
assert(vi1 >= 0 && vi1 < 16);
|
||||
|
||||
pp[l/2] = vi0 | (vi1 << 4);
|
||||
}
|
||||
|
||||
memcpy(pb, pp, sizeof(pp));
|
||||
pb += bs;
|
||||
}
|
||||
quantize_row_q4_0_reference(x, y, k);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -10702,121 +10723,62 @@ enum ggml_opt_result ggml_opt(
|
|||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
|
||||
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist) {
|
||||
const int nb = k / qk;
|
||||
const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
|
||||
const size_t row_size = nb*bs;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
||||
const size_t pp_size = qk / 2;
|
||||
uint8_t * pp = (uint8_t *) alloca(pp_size);
|
||||
|
||||
char * pdst = (char *) dst;
|
||||
|
||||
for (int j = 0; j < n; j += k) {
|
||||
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
|
||||
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
|
||||
|
||||
quantize_row_q4_0_reference(src + j, pd, k);
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
{
|
||||
for (int l = 0; l < qk; l++) {
|
||||
const float v = src[j + i*qk + l];
|
||||
amax = MAX(amax, fabsf(v));
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 3) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
*(float *) pd = d;
|
||||
pd += bs;
|
||||
|
||||
for (int l = 0; l < qk; l += 2) {
|
||||
const float v0 = (src[j + i*qk + l + 0])*id;
|
||||
const float v1 = (src[j + i*qk + l + 1])*id;
|
||||
|
||||
const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
|
||||
const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
|
||||
|
||||
assert(vi0 >= 0 && vi0 < 16);
|
||||
assert(vi1 >= 0 && vi1 < 16);
|
||||
const uint8_t vi0 = pb[l/2] & 0xF;
|
||||
const uint8_t vi1 = pb[l/2] >> 4;
|
||||
|
||||
hist[vi0]++;
|
||||
hist[vi1]++;
|
||||
|
||||
pp[l/2] = vi0 | (vi1 << 4);
|
||||
}
|
||||
|
||||
memcpy(pb, pp, pp_size);
|
||||
pb += bs;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (n/k)*row_size;
|
||||
}
|
||||
|
||||
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
|
||||
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist) {
|
||||
const int nb = k / qk;
|
||||
const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2);
|
||||
const size_t row_size = nb*bs;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
||||
const size_t pp_size = qk / 2;
|
||||
uint8_t * pp = (uint8_t *) alloca(pp_size);
|
||||
|
||||
char * pdst = (char *) dst;
|
||||
|
||||
for (int j = 0; j < n; j += k) {
|
||||
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
|
||||
uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
|
||||
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));
|
||||
|
||||
//printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
|
||||
quantize_row_q4_1(src + j, pd, k);
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float min = FLT_MAX;
|
||||
float max = -FLT_MAX;
|
||||
|
||||
{
|
||||
for (int l = 0; l < qk; l++) {
|
||||
const float v = src[j + i*qk + l];
|
||||
if (v < min) min = v;
|
||||
if (v > max) max = v;
|
||||
}
|
||||
|
||||
const float d = (max - min) / ((1 << 4) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
*(float *) pd = d;
|
||||
*(float *) pm = min;
|
||||
pd += bs;
|
||||
pm += bs;
|
||||
|
||||
for (int l = 0; l < qk; l += 2) {
|
||||
const float v0 = (src[j + i*qk + l + 0] - min)*id;
|
||||
const float v1 = (src[j + i*qk + l + 1] - min)*id;
|
||||
|
||||
const uint8_t vi0 = round(v0);
|
||||
const uint8_t vi1 = round(v1);
|
||||
|
||||
assert(vi0 >= 0 && vi0 < 16);
|
||||
assert(vi1 >= 0 && vi1 < 16);
|
||||
const uint8_t vi0 = pb[l/2] & 0xF;
|
||||
const uint8_t vi1 = pb[l/2] >> 4;
|
||||
|
||||
hist[vi0]++;
|
||||
hist[vi1]++;
|
||||
|
||||
pp[l/2] = vi0 | (vi1 << 4);
|
||||
}
|
||||
|
||||
memcpy(pb, pp, pp_size);
|
||||
pb += bs;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (n/k)*row_size;
|
||||
}
|
||||
|
|
4
ggml.h
4
ggml.h
|
@ -745,8 +745,8 @@ enum ggml_opt_result ggml_opt(
|
|||
// quantization
|
||||
//
|
||||
|
||||
size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
|
||||
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
|
||||
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
|
||||
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
|
||||
|
||||
//
|
||||
// system info
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
#include <queue>
|
||||
#include <regex>
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
|
||||
// determine number of model parts based on the dimension
|
||||
static const std::unordered_map<int, int> LLAMA_N_PARTS = {
|
||||
|
|
8
main.cpp
8
main.cpp
|
@ -85,7 +85,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
|||
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
// Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
// Output: `perplexity: 13.5106 [114/114]`
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt.c_str(), true);
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
int count = 0;
|
||||
double nll = 0.0;
|
||||
|
@ -254,6 +254,10 @@ int main(int argc, char ** argv) {
|
|||
params.interactive = true;
|
||||
}
|
||||
|
||||
if (params.interactive_start) {
|
||||
params.interactive = true;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
|
@ -297,7 +301,7 @@ int main(int argc, char ** argv) {
|
|||
#endif
|
||||
" - Press Return to return control to LLaMa.\n"
|
||||
" - If you want to submit another line, end your input in '\\'.\n\n");
|
||||
is_interacting = true;
|
||||
is_interacting = params.interactive_start;
|
||||
}
|
||||
|
||||
int input_consumed = 0;
|
||||
|
|
|
@ -1,4 +1,9 @@
|
|||
set(TEST_TARGET test-tokenizer-0)
|
||||
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
|
||||
function(llama_add_test source)
|
||||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||
add_executable(${TEST_TARGET} ${source})
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
|
||||
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
|
||||
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
|
||||
endfunction()
|
||||
|
||||
llama_add_test(test-quantize.c)
|
||||
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
|
||||
|
|
42
tests/test-quantize.c
Normal file
42
tests/test-quantize.c
Normal file
|
@ -0,0 +1,42 @@
|
|||
#include "ggml.h"
|
||||
#undef NDEBUG
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
|
||||
int main(void) {
|
||||
#define QK 32
|
||||
float src[QK];
|
||||
uint8_t dst[24];
|
||||
int64_t hist[16];
|
||||
|
||||
for (int i = 0; i < QK; i++) {
|
||||
src[i] = (float)(i + 1);
|
||||
}
|
||||
|
||||
size_t size = ggml_quantize_q4_0(src, dst, QK, QK, QK, hist);
|
||||
assert(size == 20);
|
||||
float max_result = ((float *)dst)[0];
|
||||
float max_expected = src[31] / ((1 << 3) - 1);
|
||||
assert(max_result == max_expected);
|
||||
for (int i = 0; i < QK; i++) {
|
||||
uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF);
|
||||
uint8_t q4_expected = roundf(src[i] / max_expected) + 8;
|
||||
assert(q4_result == q4_expected);
|
||||
}
|
||||
|
||||
size = ggml_quantize_q4_1(src, dst, QK, QK, QK, hist);
|
||||
assert(size == 24);
|
||||
float delta_result = ((float *)dst)[0];
|
||||
float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);
|
||||
assert(delta_result == delta_expected);
|
||||
float min_result = ((float *)dst)[1];
|
||||
float min_expected = src[0];
|
||||
assert(min_result == min_expected);
|
||||
for (int i = 0; i < QK; i++) {
|
||||
uint8_t q4_result = (i % 2) ? (dst[sizeof(float)*2 + i/2] >> 4) : (dst[sizeof(float)*2 + i/2] & 0xF);
|
||||
uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected);
|
||||
assert(q4_result == q4_expected);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -67,6 +67,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|||
params.embedding = true;
|
||||
} else if (arg == "--interactive-start") {
|
||||
params.interactive = true;
|
||||
} else if (arg == "--interactive-first") {
|
||||
params.interactive_start = true;
|
||||
} else if (arg == "-ins" || arg == "--instruct") {
|
||||
params.instruct = true;
|
||||
|
@ -101,9 +102,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -i, --interactive run in interactive mode\n");
|
||||
fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n");
|
||||
fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
|
||||
fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
|
||||
fprintf(stderr, " in interactive mode, poll user input upon seeing PROMPT (can be\n");
|
||||
fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n");
|
||||
fprintf(stderr, " specified more than once for multiple prompts).\n");
|
||||
fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
|
||||
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for <= 0)\n");
|
||||
|
@ -151,8 +153,10 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
|
|||
|
||||
// TODO: not great allocating this every time
|
||||
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
|
||||
std::vector<llama_token> res(8096);
|
||||
// initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
|
||||
std::vector<llama_token> res(text.size() + (int)add_bos);
|
||||
int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
|
||||
assert(n >= 0);
|
||||
res.resize(n);
|
||||
|
||||
return res;
|
||||
|
|
4
utils.h
4
utils.h
|
@ -39,8 +39,10 @@ struct gpt_params {
|
|||
bool random_prompt = false; // do not randomize prompt if none provided
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
bool interactive = false; // interactive mode
|
||||
|
||||
bool embedding = false; // get only sentence embedding
|
||||
bool interactive_start = false; // reverse prompt immediately
|
||||
bool interactive_start = false; // wait for user input immediately
|
||||
|
||||
bool instruct = false; // instruction mode (used for Alpaca models)
|
||||
bool ignore_eos = false; // do not stop generating after eos
|
||||
bool perplexity = false; // compute perplexity over the prompt
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue