diff --git a/Makefile b/Makefile index eba07670a..e0e5b96f1 100644 --- a/Makefile +++ b/Makefile @@ -235,7 +235,7 @@ extra.o: extra.cpp extra.h $(CXX) $(CXXFLAGS) -c extra.cpp -o extra.o clean: - rm -vf *.o main quantize perplexity + rm -vf *.o main quantize perplexity embedding main: examples/main/main.cpp ggml.o llama.o common.o $(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS) @@ -252,6 +252,9 @@ quantize: examples/quantize/quantize.cpp ggml.o llama.o perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS) +embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o + $(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS) + # # Tests # diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 88c425d4a..def5b831b 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -1,4 +1,4 @@ set(TARGET embedding) add_executable(${TARGET} embedding.cpp) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index b2dcc2910..aa1f79406 100644 --- a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -1,4 +1,4 @@ set(TARGET main) add_executable(${TARGET} main.cpp) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index e9478d541..66b7c2d5d 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -493,7 +493,7 @@ int main(int argc, char ** argv) { } // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. - if (params.interactive && n_remain <= 0) { + if (params.interactive && n_remain <= 0 && params.n_predict != -1) { n_remain = params.n_predict; is_interacting = true; } diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt index 5836df8b2..9bd8e376f 100644 --- a/examples/perplexity/CMakeLists.txt +++ b/examples/perplexity/CMakeLists.txt @@ -1,4 +1,4 @@ set(TARGET perplexity) add_executable(${TARGET} perplexity.cpp) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index f617ba365..75d526d3d 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -19,7 +19,7 @@ std::vector softmax(const std::vector& logits) { void perplexity(llama_context * ctx, const gpt_params & params) { // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research - // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` + // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Output: `perplexity: 13.5106 [114/114]` auto tokens = ::llama_tokenize(ctx, params.prompt, true); diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index fb27d4517..17a995bbd 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -1,4 +1,4 @@ set(TARGET quantize) add_executable(${TARGET} quantize.cpp) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama ggml ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/ggml.c b/ggml.c index c9a4e8675..ec00e6317 100644 --- a/ggml.c +++ b/ggml.c @@ -1698,8 +1698,6 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void // Horizontal sum of all lanes of the accumulator sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 ); #elif defined(__AVX2__) - const size_t countBlocks = nb; - // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); @@ -5806,23 +5804,28 @@ static void ggml_compute_forward_mul_mat_f32( const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3]; +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) const int ne10 = src1->ne[0]; +#endif const int ne11 = src1->ne[1]; - //const int ne12 = src1->ne[2]; - //const int ne13 = src1->ne[3]; +#ifndef NDEBUG + const int ne12 = src1->ne[2]; + const int ne13 = src1->ne[3]; - //const int ne0 = dst->ne[0]; - //const int ne1 = dst->ne[1]; - //const int ne2 = dst->ne[2]; - //const int ne3 = dst->ne[3]; - //const int ne = ne0*ne1*ne2*ne3; + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + const int ne2 = dst->ne[2]; + const int ne3 = dst->ne[3]; - //const int nb00 = src0->nb[0]; + const int nb00 = src0->nb[0]; +#endif const int nb01 = src0->nb[1]; const int nb02 = src0->nb[2]; const int nb03 = src0->nb[3]; +#ifndef NDEBUG const int nb10 = src1->nb[0]; +#endif const int nb11 = src1->nb[1]; const int nb12 = src1->nb[2]; const int nb13 = src1->nb[3]; @@ -5840,8 +5843,9 @@ static void ggml_compute_forward_mul_mat_f32( assert(ne2 == ne12); assert(ne3 == ne13); - // TODO: we don't support permuted src0 + // we don't support permuted src0 or src1 assert(nb00 == sizeof(float)); + assert(nb10 == sizeof(float)); // dst cannot be transposed or permuted assert(nb0 == sizeof(float)); @@ -5859,8 +5863,6 @@ static void ggml_compute_forward_mul_mat_f32( #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - GGML_ASSERT(nb10 == sizeof(float)); - if (params->ith != 0) { return; } @@ -5903,9 +5905,6 @@ static void ggml_compute_forward_mul_mat_f32( return; } - // TODO: do not support transposed src1 - assert(nb10 == sizeof(float)); - // parallelize by src0 rows using ggml_vec_dot_f32 // total rows in src0 @@ -6169,7 +6168,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32( const int ne1 = dst->ne[1]; const int ne2 = dst->ne[2]; const int ne3 = dst->ne[3]; - //const int ne = ne0*ne1*ne2*ne3; const int nb00 = src0->nb[0]; const int nb01 = src0->nb[1]; @@ -6194,8 +6192,9 @@ static void ggml_compute_forward_mul_mat_q4_0_f32( GGML_ASSERT(ne2 == ne12); GGML_ASSERT(ne3 == ne13); - // TODO: we don't support permuted src0 + // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_0]); + GGML_ASSERT(nb10 == sizeof(float)); // dst cannot be transposed or permuted GGML_ASSERT(nb0 == sizeof(float)); @@ -6213,8 +6212,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32( #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - GGML_ASSERT(nb10 == sizeof(float)); - if (params->ith != 0) { return; } @@ -6278,8 +6275,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32( return; } - // TODO: do not support transposed src1 - // parallelize by src0 rows using ggml_vec_dot_q4_0 // total rows in src0 @@ -6354,7 +6349,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32( const int ne1 = dst->ne[1]; const int ne2 = dst->ne[2]; const int ne3 = dst->ne[3]; - //const int ne = ne0*ne1*ne2*ne3; const int nb00 = src0->nb[0]; const int nb01 = src0->nb[1]; @@ -6379,8 +6373,9 @@ static void ggml_compute_forward_mul_mat_q4_1_f32( GGML_ASSERT(ne2 == ne12); GGML_ASSERT(ne3 == ne13); - // TODO: we don't support permuted src0 + // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_1]); + GGML_ASSERT(nb10 == sizeof(float)); // dst cannot be transposed or permuted GGML_ASSERT(nb0 == sizeof(float)); @@ -6398,8 +6393,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32( #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - GGML_ASSERT(nb10 == sizeof(float)); - if (params->ith != 0) { return; } @@ -6466,8 +6459,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32( return; } - // TODO: do not support transposed src1 - // parallelize by src0 rows using ggml_vec_dot_q4_1 // total rows in src0 diff --git a/llamacpp.dll b/llamacpp.dll index 11ecb4fa2..c5dc1bfff 100644 Binary files a/llamacpp.dll and b/llamacpp.dll differ diff --git a/main.exe b/main.exe index 6eeeeff11..3c5e6beb4 100644 Binary files a/main.exe and b/main.exe differ diff --git a/quantize.exe b/quantize.exe index 8ca9ddfbc..7f4f3f54b 100644 Binary files a/quantize.exe and b/quantize.exe differ