commit 84ab887349
12 changed files with 180 additions and 125 deletions
.github/ISSUE_TEMPLATE/custom.md (10 changes, vendored)

@@ -1,7 +1,7 @@
 ---
-name: Custom issue template
-about: Used to report user-related issues with the software
-title: "[User] I encountered a problem .."
+name: Issue and enhancement template
+about: Used to report issues and request enhancements for llama.cpp
+title: "[User] Insert summary of your issue or enhancement.."
 labels: ''
 assignees: ''

@@ -18,11 +18,11 @@ Please answer the following questions for yourself before submitting an issue.

 # Expected Behavior

-Please provide a detailed written description of what you were trying to do, and what you expected `lamma.cpp` to do.
+Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do.

 # Current Behavior

-Please provide a detailed written description of what `lamma.cpp` did, instead.
+Please provide a detailed written description of what `llama.cpp` did, instead.

 # Environment and Context
.github/workflows/build.yml (2 changes, vendored)

@@ -89,7 +89,7 @@ jobs:
       run: |
         mkdir build
         cd build
-        cmake ..
+        cmake -DLLAMA_AVX2=OFF ..
         cmake --build . --config Release
         ctest --output-on-failure
CMakeLists.txt

@@ -217,6 +217,7 @@ add_library(utils OBJECT

 target_include_directories(utils PUBLIC .)
 target_compile_features(utils PUBLIC cxx_std_11) # don't bump
+target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS})

 add_library(ggml OBJECT
             ggml.c
@@ -226,12 +227,13 @@ target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})

-add_library(llama OBJECT
+add_library(llama
             llama.cpp
             llama.h)

 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
+target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS})

 #
 # Executables
README.md (35 changes)

@@ -240,6 +240,40 @@ or

 `shasum -a 256 --ignore-missing -c SHA256SUMS` on macOS

+### Perplexity (Measuring model quality)
+
+You can pass `--perplexity` as a command line option to measure perplexity over the given prompt. For more background,
+see https://huggingface.co/docs/transformers/perplexity. In general, lower perplexity is better for LLMs.
+
+#### Measurements
+
+https://github.com/ggerganov/llama.cpp/pull/270 is the unofficial tracking page for now. llama.cpp is measuring very well
+compared to the baseline implementations. Quantization has a small negative impact on quality, but, as you can see, running
+13B at q4_0 beats the 7B f16 model by a significant amount.
+
+All measurements are done against the wikitext2 test dataset (https://paperswithcode.com/dataset/wikitext-2), with default options (512-token context).
+Note that changing the context length will have a significant impact on perplexity (longer context = better perplexity).
+
+```
+Perplexity - model options
+5.5985 - 13B, q4_0
+5.9565 - 7B, f16
+6.3001 - 7B, q4_1
+6.5949 - 7B, q4_0
+6.5995 - 7B, q4_0, --memory_f16
+```
+
+#### How to run
+
+1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+2. Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+3. Output:
+```
+Calculating perplexity over 655 chunks
+24.43 seconds per pass - ETA 4.45 hours
+[1]4.5970,[2]5.1807,[3]6.0382,...
+```
+And after 4.45 hours, you will have the final perplexity.
+
 ### Android

 You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux).
@@ -290,7 +324,6 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models

 ## Limitations

-- We don't know yet how much the quantization affects the quality of the generated text
 - Probably the token sampling can be improved
 - The Accelerate framework is actually currently unused since I found that for tensor shapes typical for the Decoder,
   there is no benefit compared to the ARM_NEON intrinsics implementation. Of course, it's possible that I simply don't
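A note on the Perplexity section added to the README above: the reported number is the standard perplexity, i.e. the exponentiated mean negative log-likelihood over the evaluated tokens (this is what the `nll` accumulation in `perplexity()` in main.cpp computes):

    PPL = \exp\left( -\frac{1}{N} \sum_{i=1}^{N} \log p(x_i \mid x_{<i}) \right)

This is why lower is better, and why a longer context (more conditioning per token) tends to reduce it.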
ggml.c (174 changes)

@@ -1,3 +1,6 @@
+// Defines CLOCK_MONOTONIC on Linux
+#define _POSIX_C_SOURCE 199309L
+
 #include "ggml.h"

 #if defined(_MSC_VER) || defined(__MINGW32__)
@@ -400,9 +403,55 @@ static inline __m128i packNibbles( __m256i bytes )
 // method 5
 // blocks of QK elements
 // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors)

+// reference implementation for deterministic creation of model files
+static void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) {
+    assert(k % QK == 0);
+    const int nb = k / QK;
+
+    const size_t bs = sizeof(float) + QK/2;
+
+    uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
+    uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float));
+
+    uint8_t pp[QK/2];
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        for (int l = 0; l < QK; l++) {
+            const float v = x[i*QK + l];
+            amax = MAX(amax, fabsf(v));
+        }
+
+        const float d = amax / ((1 << 3) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        *(float *)pd = d;
+        pd += bs;
+
+        for (int l = 0; l < QK; l += 2) {
+            const float v0 = x[i*QK + l + 0]*id;
+            const float v1 = x[i*QK + l + 1]*id;
+
+            const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
+            const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
+
+            assert(vi0 >= 0 && vi0 < 16);
+            assert(vi1 >= 0 && vi1 < 16);
+
+            pp[l/2] = vi0 | (vi1 << 4);
+        }
+
+        memcpy(pb, pp, sizeof(pp));
+        pb += bs;
+    }
+}
+
 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
     assert(k % QK == 0);

+#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__)
     const int nb = k / QK;
     const size_t bs = sizeof(float) + QK/2;

@@ -410,6 +459,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
     uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float));

     uint8_t pp[QK/2];
+#endif

 #if __ARM_NEON
 #if QK == 32
@@ -566,36 +616,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
 #endif
 #else
     // scalar
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
-        for (int l = 0; l < QK; l++) {
-            const float v = x[i*QK + l];
-            amax = MAX(amax, fabsf(v));
-        }
-
-        const float d = amax / ((1 << 3) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        *(float *)pd = d;
-        pd += bs;
-
-        for (int l = 0; l < QK; l += 2) {
-            const float v0 = x[i*QK + l + 0]*id;
-            const float v1 = x[i*QK + l + 1]*id;
-
-            const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
-            const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
-
-            assert(vi0 >= 0 && vi0 < 16);
-            assert(vi1 >= 0 && vi1 < 16);
-
-            pp[l/2] = vi0 | (vi1 << 4);
-        }
-
-        memcpy(pb, pp, sizeof(pp));
-        pb += bs;
-    }
+    quantize_row_q4_0_reference(x, y, k);
 #endif
 }
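As a reading aid for the q4_0 layout used above (one float delta `d` per block of QK values, followed by QK/2 bytes holding two 4-bit codes each), here is a minimal dequantization sketch. It is illustrative only and not part of this commit; it inverts `quantize_row_q4_0_reference` for a single row, assuming the same `bs = sizeof(float) + QK/2` block stride and the includes/`QK` already present in ggml.c.

    // Illustrative sketch (not part of this commit): decode one q4_0 row written
    // by quantize_row_q4_0_reference. A 4-bit code v in [0, 16) decodes to (v - 8)*d.
    static void dequantize_row_q4_0_sketch(const void * x, float * y, int k) {
        assert(k % QK == 0);
        const int nb = k / QK;
        const size_t bs = sizeof(float) + QK/2;

        for (int i = 0; i < nb; i++) {
            const uint8_t * block = (const uint8_t *)x + i*bs;
            const float d = *(const float *)block;       // per-block scale (delta)
            const uint8_t * pb = block + sizeof(float);  // QK/2 packed bytes

            for (int l = 0; l < QK; l += 2) {
                const uint8_t vi = pb[l/2];
                y[i*QK + l + 0] = ((int8_t)(vi & 0xF) - 8)*d; // low nibble: even element
                y[i*QK + l + 1] = ((int8_t)(vi >> 4) - 8)*d;  // high nibble: odd element
            }
        }
    }

Round-tripping a row through the reference quantizer and this decoder reproduces each value up to the per-block step `d`.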
@@ -10702,119 +10723,60 @@ enum ggml_opt_result ggml_opt(

 ////////////////////////////////////////////////////////////////////////////////

-size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
+size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist) {
     const int nb = k / qk;
     const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
     const size_t row_size = nb*bs;

     assert(k % qk == 0);

-    const size_t pp_size = qk / 2;
-    uint8_t * pp = (uint8_t *) alloca(pp_size);
-
     char * pdst = (char *) dst;

     for (int j = 0; j < n; j += k) {
         uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
         uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));

+        quantize_row_q4_0_reference(src + j, pd, k);
+
         for (int i = 0; i < nb; i++) {
-            float amax = 0.0f; // absolute max
-
-            {
-                for (int l = 0; l < qk; l++) {
-                    const float v = src[j + i*qk + l];
-                    amax = MAX(amax, fabsf(v));
-                }
-
-                const float d = amax / ((1 << 3) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                *(float *) pd = d;
-                pd += bs;
-
-                for (int l = 0; l < qk; l += 2) {
-                    const float v0 = (src[j + i*qk + l + 0])*id;
-                    const float v1 = (src[j + i*qk + l + 1])*id;
-
-                    const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
-                    const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    hist[vi0]++;
-                    hist[vi1]++;
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb, pp, pp_size);
-                pb += bs;
-            }
+            for (int l = 0; l < qk; l += 2) {
+                const uint8_t vi0 = pb[l/2] & 0xF;
+                const uint8_t vi1 = pb[l/2] >> 4;
+
+                hist[vi0]++;
+                hist[vi1]++;
+            }
+            pb += bs;
         }
     }

     return (n/k)*row_size;
 }

-size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
+size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist) {
     const int nb = k / qk;
     const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2);
     const size_t row_size = nb*bs;

     assert(k % qk == 0);

-    const size_t pp_size = qk / 2;
-    uint8_t * pp = (uint8_t *) alloca(pp_size);
-
     char * pdst = (char *) dst;

     for (int j = 0; j < n; j += k) {
         uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
-        uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
         uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));

-        //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
+        quantize_row_q4_1(src + j, pd, k);

         for (int i = 0; i < nb; i++) {
-            float min = FLT_MAX;
-            float max = -FLT_MAX;
-
-            {
-                for (int l = 0; l < qk; l++) {
-                    const float v = src[j + i*qk + l];
-                    if (v < min) min = v;
-                    if (v > max) max = v;
-                }
-
-                const float d = (max - min) / ((1 << 4) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                *(float *) pd = d;
-                *(float *) pm = min;
-                pd += bs;
-                pm += bs;
-
-                for (int l = 0; l < qk; l += 2) {
-                    const float v0 = (src[j + i*qk + l + 0] - min)*id;
-                    const float v1 = (src[j + i*qk + l + 1] - min)*id;
-
-                    const uint8_t vi0 = round(v0);
-                    const uint8_t vi1 = round(v1);
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    hist[vi0]++;
-                    hist[vi1]++;
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb, pp, pp_size);
-                pb += bs;
-            }
+            for (int l = 0; l < qk; l += 2) {
+                const uint8_t vi0 = pb[l/2] & 0xF;
+                const uint8_t vi1 = pb[l/2] >> 4;
+
+                hist[vi0]++;
+                hist[vi1]++;
+            }
+            pb += bs;
         }
     }
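The sizes these functions return follow directly from the block layouts: a quick standalone check (illustrative only, mirroring the asserts in the new tests/test-quantize.c below) for qk = 32, where a q4_0 block is the 4-byte delta plus 32/2 packed bytes and a q4_1 block carries an extra 4-byte minimum.

    // Illustrative size check (not part of this commit): per-block byte counts
    // for qk = 32, matching the row_size arithmetic in ggml_quantize_q4_0/q4_1.
    #include <assert.h>
    #include <stddef.h>

    int main(void) {
        const size_t qk = 32;
        const size_t q4_0_block = sizeof(float) + qk/2;   // delta + nibbles      -> 20
        const size_t q4_1_block = 2*sizeof(float) + qk/2; // delta + min + nibbles -> 24
        assert(q4_0_block == 20);
        assert(q4_1_block == 24);
        return 0;
    }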
ggml.h (4 changes)

@@ -745,8 +745,8 @@ enum ggml_opt_result ggml_opt(
 // quantization
 //

-size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
-size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
+size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
+size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);

 //
 // system info
llama.cpp

@@ -9,6 +9,7 @@
 #include <queue>
 #include <regex>
 #include <cassert>
+#include <cstring>

 // determine number of model parts based on the dimension
 static const std::unordered_map<int, int> LLAMA_N_PARTS = {
main.cpp (8 changes)

@@ -85,7 +85,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
     // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
     // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
     // Output: `perplexity: 13.5106 [114/114]`
-    auto tokens = ::llama_tokenize(ctx, params.prompt.c_str(), true);
+    auto tokens = ::llama_tokenize(ctx, params.prompt, true);

     int count = 0;
     double nll = 0.0;
@@ -254,6 +254,10 @@ int main(int argc, char ** argv) {
         params.interactive = true;
     }

+    if (params.interactive_start) {
+        params.interactive = true;
+    }
+
     fprintf(stderr, "\n");
     fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
     fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
@@ -297,7 +301,7 @@ int main(int argc, char ** argv) {
 #endif
                " - Press Return to return control to LLaMa.\n"
                " - If you want to submit another line, end your input in '\\'.\n\n");
-        is_interacting = true;
+        is_interacting = params.interactive_start;
     }

     int input_consumed = 0;
tests/CMakeLists.txt

@@ -1,4 +1,9 @@
-set(TEST_TARGET test-tokenizer-0)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
+function(llama_add_test source)
+    get_filename_component(TEST_TARGET ${source} NAME_WE)
+    add_executable(${TEST_TARGET} ${source})
+    target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
+    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
+endfunction()
+
+llama_add_test(test-quantize.c)
+llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
tests/test-quantize.c (new file, 42 additions)

@@ -0,0 +1,42 @@
+#include "ggml.h"
+#undef NDEBUG
+#include <assert.h>
+#include <math.h>
+
+int main(void) {
+    #define QK 32
+    float src[QK];
+    uint8_t dst[24];
+    int64_t hist[16];
+
+    for (int i = 0; i < QK; i++) {
+        src[i] = (float)(i + 1);
+    }
+
+    size_t size = ggml_quantize_q4_0(src, dst, QK, QK, QK, hist);
+    assert(size == 20);
+    float max_result = ((float *)dst)[0];
+    float max_expected = src[31] / ((1 << 3) - 1);
+    assert(max_result == max_expected);
+    for (int i = 0; i < QK; i++) {
+        uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF);
+        uint8_t q4_expected = roundf(src[i] / max_expected) + 8;
+        assert(q4_result == q4_expected);
+    }
+
+    size = ggml_quantize_q4_1(src, dst, QK, QK, QK, hist);
+    assert(size == 24);
+    float delta_result = ((float *)dst)[0];
+    float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);
+    assert(delta_result == delta_expected);
+    float min_result = ((float *)dst)[1];
+    float min_expected = src[0];
+    assert(min_result == min_expected);
+    for (int i = 0; i < QK; i++) {
+        uint8_t q4_result = (i % 2) ? (dst[sizeof(float)*2 + i/2] >> 4) : (dst[sizeof(float)*2 + i/2] & 0xF);
+        uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected);
+        assert(q4_result == q4_expected);
+    }
+
+    return 0;
+}
utils.cpp

@@ -67,6 +67,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.embedding = true;
         } else if (arg == "--interactive-start") {
             params.interactive = true;
+        } else if (arg == "--interactive-first") {
             params.interactive_start = true;
         } else if (arg == "-ins" || arg == "--instruct") {
             params.instruct = true;
@@ -101,9 +102,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help            show this help message and exit\n");
     fprintf(stderr, "  -i, --interactive     run in interactive mode\n");
+    fprintf(stderr, "  --interactive-first   run in interactive mode and wait for input right away\n");
     fprintf(stderr, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
     fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
-    fprintf(stderr, "                        in interactive mode, poll user input upon seeing PROMPT (can be\n");
+    fprintf(stderr, "                        run in interactive mode and poll user input upon seeing PROMPT (can be\n");
     fprintf(stderr, "                        specified more than once for multiple prompts).\n");
     fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
     fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for <= 0)\n");
@@ -151,8 +153,10 @@ std::string gpt_random_prompt(std::mt19937 & rng) {

 // TODO: not great allocating this every time
 std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-    std::vector<llama_token> res(8096);
+    // initialize to prompt number of chars, since n_tokens <= n_prompt_chars
+    std::vector<llama_token> res(text.size() + (int)add_bos);
     int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
+    assert(n >= 0);
     res.resize(n);

     return res;
utils.h (4 changes)

@@ -39,8 +39,10 @@ struct gpt_params {
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
+
     bool embedding = false; // get only sentence embedding
-    bool interactive_start = false; // reverse prompt immediately
+    bool interactive_start = false; // wait for user input immediately
+
     bool instruct = false; // instruction mode (used for Alpaca models)
     bool ignore_eos = false; // do not stop generating after eos
     bool perplexity = false; // compute perplexity over the prompt