Merge branch 'ggerganov:master' into master
This commit is contained in:
commit
926e49e78e
10 changed files with 34 additions and 39 deletions
6
.github/workflows/build.yml
vendored
6
.github/workflows/build.yml
vendored
|
@ -72,6 +72,8 @@ jobs:
|
|||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
build_type: [Debug, Release]
|
||||
accelerate: [ON, OFF]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
@ -89,8 +91,8 @@ jobs:
|
|||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
cmake --build . --config Release
|
||||
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_ACCELERATE=${{ matrix.accelerate }}
|
||||
cmake --build . --config ${{ matrix.build_type }}
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
|
|
2
.github/workflows/docker.yml
vendored
2
.github/workflows/docker.yml
vendored
|
@ -49,6 +49,7 @@ jobs:
|
|||
with:
|
||||
context: .
|
||||
push: true
|
||||
platforms: linux/amd64,linux/arm64
|
||||
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
|
||||
|
@ -57,5 +58,6 @@ jobs:
|
|||
with:
|
||||
context: .
|
||||
push: ${{ github.event_name == 'push' }}
|
||||
platforms: linux/amd64,linux/arm64
|
||||
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
|
@ -248,7 +248,7 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
|
|||
|
||||
### Perplexity (Measuring model quality)
|
||||
|
||||
You can pass `--perplexity` as a command line option to measure perplexity over the given prompt. For more background,
|
||||
You can use the `perplexity` example to measure perplexity over the given prompt. For more background,
|
||||
see https://huggingface.co/docs/transformers/perplexity. However, in general, lower perplexity is better for LLMs.
|
||||
|
||||
#### Latest measurements
|
||||
|
@ -271,10 +271,10 @@ Perplexity - model options
|
|||
#### How to run
|
||||
|
||||
1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
2. Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
2. Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
3. Output:
|
||||
```
|
||||
Calculating perplexity over 655 chunks
|
||||
perplexity : calculating perplexity over 655 chunks
|
||||
24.43 seconds per pass - ETA 4.45 hours
|
||||
[1]4.5970,[2]5.1807,[3]6.0382,...
|
||||
```
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET embedding)
|
||||
add_executable(${TARGET} embedding.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET main)
|
||||
add_executable(${TARGET} main.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
|
|
@ -493,7 +493,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
|
||||
if (params.interactive && n_remain <= 0) {
|
||||
if (params.interactive && n_remain <= 0 && params.n_predict != -1) {
|
||||
n_remain = params.n_predict;
|
||||
is_interacting = true;
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET perplexity)
|
||||
add_executable(${TARGET} perplexity.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
|
|
@ -19,7 +19,7 @@ std::vector<double> softmax(const std::vector<float>& logits) {
|
|||
|
||||
void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
// Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
// Output: `perplexity: 13.5106 [114/114]`
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET quantize)
|
||||
add_executable(${TARGET} quantize.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama ggml ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
|
47
ggml.c
47
ggml.c
|
@ -1698,8 +1698,6 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
|
|||
// Horizontal sum of all lanes of the accumulator
|
||||
sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 );
|
||||
#elif defined(__AVX2__)
|
||||
const size_t countBlocks = nb;
|
||||
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
|
@ -5806,23 +5804,28 @@ static void ggml_compute_forward_mul_mat_f32(
|
|||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
const int ne10 = src1->ne[0];
|
||||
#endif
|
||||
const int ne11 = src1->ne[1];
|
||||
//const int ne12 = src1->ne[2];
|
||||
//const int ne13 = src1->ne[3];
|
||||
#ifndef NDEBUG
|
||||
const int ne12 = src1->ne[2];
|
||||
const int ne13 = src1->ne[3];
|
||||
|
||||
//const int ne0 = dst->ne[0];
|
||||
//const int ne1 = dst->ne[1];
|
||||
//const int ne2 = dst->ne[2];
|
||||
//const int ne3 = dst->ne[3];
|
||||
//const int ne = ne0*ne1*ne2*ne3;
|
||||
const int ne0 = dst->ne[0];
|
||||
const int ne1 = dst->ne[1];
|
||||
const int ne2 = dst->ne[2];
|
||||
const int ne3 = dst->ne[3];
|
||||
|
||||
//const int nb00 = src0->nb[0];
|
||||
const int nb00 = src0->nb[0];
|
||||
#endif
|
||||
const int nb01 = src0->nb[1];
|
||||
const int nb02 = src0->nb[2];
|
||||
const int nb03 = src0->nb[3];
|
||||
|
||||
#ifndef NDEBUG
|
||||
const int nb10 = src1->nb[0];
|
||||
#endif
|
||||
const int nb11 = src1->nb[1];
|
||||
const int nb12 = src1->nb[2];
|
||||
const int nb13 = src1->nb[3];
|
||||
|
@ -5840,8 +5843,9 @@ static void ggml_compute_forward_mul_mat_f32(
|
|||
assert(ne2 == ne12);
|
||||
assert(ne3 == ne13);
|
||||
|
||||
// TODO: we don't support permuted src0
|
||||
// we don't support permuted src0 or src1
|
||||
assert(nb00 == sizeof(float));
|
||||
assert(nb10 == sizeof(float));
|
||||
|
||||
// dst cannot be transposed or permuted
|
||||
assert(nb0 == sizeof(float));
|
||||
|
@ -5859,8 +5863,6 @@ static void ggml_compute_forward_mul_mat_f32(
|
|||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
|
@ -5903,9 +5905,6 @@ static void ggml_compute_forward_mul_mat_f32(
|
|||
return;
|
||||
}
|
||||
|
||||
// TODO: do not support transposed src1
|
||||
assert(nb10 == sizeof(float));
|
||||
|
||||
// parallelize by src0 rows using ggml_vec_dot_f32
|
||||
|
||||
// total rows in src0
|
||||
|
@ -6169,7 +6168,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
|
|||
const int ne1 = dst->ne[1];
|
||||
const int ne2 = dst->ne[2];
|
||||
const int ne3 = dst->ne[3];
|
||||
//const int ne = ne0*ne1*ne2*ne3;
|
||||
|
||||
const int nb00 = src0->nb[0];
|
||||
const int nb01 = src0->nb[1];
|
||||
|
@ -6194,8 +6192,9 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
|
|||
GGML_ASSERT(ne2 == ne12);
|
||||
GGML_ASSERT(ne3 == ne13);
|
||||
|
||||
// TODO: we don't support permuted src0
|
||||
// we don't support permuted src0 or src1
|
||||
GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_0]);
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
// dst cannot be transposed or permuted
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
|
@ -6213,8 +6212,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
|
|||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
|
@ -6278,8 +6275,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
|
|||
return;
|
||||
}
|
||||
|
||||
// TODO: do not support transposed src1
|
||||
|
||||
// parallelize by src0 rows using ggml_vec_dot_q4_0
|
||||
|
||||
// total rows in src0
|
||||
|
@ -6354,7 +6349,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
|
|||
const int ne1 = dst->ne[1];
|
||||
const int ne2 = dst->ne[2];
|
||||
const int ne3 = dst->ne[3];
|
||||
//const int ne = ne0*ne1*ne2*ne3;
|
||||
|
||||
const int nb00 = src0->nb[0];
|
||||
const int nb01 = src0->nb[1];
|
||||
|
@ -6379,8 +6373,9 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
|
|||
GGML_ASSERT(ne2 == ne12);
|
||||
GGML_ASSERT(ne3 == ne13);
|
||||
|
||||
// TODO: we don't support permuted src0
|
||||
// we don't support permuted src0 or src1
|
||||
GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_1]);
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
// dst cannot be transposed or permuted
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
|
@ -6398,8 +6393,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
|
|||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
|
@ -6466,8 +6459,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
|
|||
return;
|
||||
}
|
||||
|
||||
// TODO: do not support transposed src1
|
||||
|
||||
// parallelize by src0 rows using ggml_vec_dot_q4_1
|
||||
|
||||
// total rows in src0
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue