From 97ab2b257897bfe7e2ae72876a3e50ed41b8c7ce Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 14 Mar 2023 09:43:52 +0200
Subject: [PATCH 1/3] Add Misc section + update hot topics + minor fixes

---
 README.md | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index e936282f4..085f19e03 100644
--- a/README.md
+++ b/README.md
@@ -5,12 +5,17 @@
 
 Inference of [Facebook's LLaMA](https://github.com/facebookresearch/llama) model in pure C/C++
 
+**Hot topics:**
+
+- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
+- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
+
 ## Description
 
 The main goal is to run the model using 4-bit quantization on a MacBook
 
 - Plain C/C++ implementation without dependencies
-- Apple silicon first-class citizen - optimized via Arm Neon and Accelerate framework
+- Apple silicon first-class citizen - optimized via ARM NEON
 - AVX2 support for x86 architectures
 - Mixed F16 / F32 precision
 - 4-bit quantization support
@@ -174,7 +179,7 @@ Note the use of `--color` to distinguish between user input and generated text.
 
 ## Limitations
 
-- I don't know yet how much the quantization affects the quality of the generated text
+- We don't know yet how much the quantization affects the quality of the generated text
 - Probably the token sampling can be improved
 - The Accelerate framework is actually currently unused since I found that for tensor shapes typical for the Decoder,
   there is no benefit compared to the ARM_NEON intrinsics implementation. Of course, it's possible that I simply don't
@@ -187,11 +192,15 @@
 - Collaborators can push to branches in the `llama.cpp` repo
 - Collaborators will be invited based on contributions
 
-### Coding guide-lines
+### Coding guidelines
 
 - Avoid adding third-party dependencies, extra files, extra headers, etc.
 - Always consider cross-compatibility with other operating systems and architectures
-- Avoid fancy looking modern STL constructs, use basic for loops, avoid templates, keep it simple
+- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
 - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.).
   Vertical alignment makes things more readable and easier to batch edit
-- Clean-up any tailing whitespaces, use 4 spaces indentation, brackets on same line, `int * var`
-- Look at the [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks
+- Clean up any trailing whitespace, use 4-space indentation, brackets on the same line, `void * ptr`, `int & a`
+- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
+
+### Misc
+
+- Practice your C++ typing skills: https://typing-battles.ggerganov.com
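
Editor's note: to make the coding-guideline bullets above concrete, here is a tiny illustrative snippet, not part of the patch, written in the described style: a basic `for` loop, 4-space indentation, brackets on the same line, and `type * ptr` declaration spacing.

```c
// illustrative only: plain loop, no fancy STL constructs
static float vec_sum(const float * vals, int n) {
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        sum += vals[i];
    }
    return sum;
}
```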
From 60f819a2b10475055a36415bc489e5b55df2d052 Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov
Date: Tue, 14 Mar 2023 15:30:08 +0200
Subject: [PATCH 2/3] Add section to README on how to run the project on Android (#130)

---
 README.md         | 17 +++++++++++++++++
 models/.gitignore |  0
 2 files changed, 17 insertions(+)
 delete mode 100644 models/.gitignore

diff --git a/README.md b/README.md
index 085f19e03..5d8b3b6db 100644
--- a/README.md
+++ b/README.md
@@ -177,6 +177,23 @@ Note the use of `--color` to distinguish between user input and generated text.
 
 ![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
 
+### Android
+
+You can easily run `llama.cpp` on an Android device with [Termux](https://play.google.com/store/apps/details?id=com.termux).
+First, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
+```
+$ mkdir build-android
+$ cd build-android
+$ export NDK=<your_ndk_directory>
+$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
+$ make
+```
+Install [Termux](https://play.google.com/store/apps/details?id=com.termux) on your device and run `termux-setup-storage` to get access to your SD card.
+Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on a Pixel 5 phone:
+
+https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
+
+
 ## Limitations
 
 - We don't know yet how much the quantization affects the quality of the generated text
diff --git a/models/.gitignore b/models/.gitignore
deleted file mode 100644
index e69de29bb..000000000

From 47857e564c218a2c38346d0cdd94314632878fcb Mon Sep 17 00:00:00 2001
From: Ronsor
Date: Tue, 14 Mar 2023 12:34:37 -0700
Subject: [PATCH 3/3] Don't use vdotq_s32 if it's not available (#139)

* Don't use vdotq_s32 if it's not available

`dotprod` extensions aren't available on some ARM CPUs (e.g. the
Raspberry Pi 4), so check for them and only use them if they're
available.

Reintroduces the code removed in 84d9015 if `__ARM_FEATURE_DOTPROD`
isn't defined.
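
For illustration, the compile-time dispatch this patch introduces boils down to the following sketch. This is an editor's sketch, not code from ggml.c: the helper name and the single 16-byte vector shape are assumptions, and the `vaddvq_*` reductions assume an AArch64 target.

```c
#include <arm_neon.h>
#include <stdint.h>

// Sketch: dot product of two int8x16_t vectors, using the dotprod
// extension when the compiler advertises it via __ARM_FEATURE_DOTPROD.
static inline int32_t dot_s8x16(int8x16_t a, int8x16_t b) {
#if defined(__ARM_FEATURE_DOTPROD)
    // vdotq_s32 folds 4 adjacent int8 products into each int32 lane
    return vaddvq_s32(vdotq_s32(vdupq_n_s32(0), a, b));
#else
    // fallback: widen to int16 products, then pairwise-add up to int32
    const int16x8_t pl = vmull_s8(vget_low_s8(a),  vget_low_s8(b));
    const int16x8_t ph = vmull_s8(vget_high_s8(a), vget_high_s8(b));
    return vaddvq_s32(vaddq_s32(vpaddlq_s16(pl), vpaddlq_s16(ph)));
#endif
}
```

Compilers only define `__ARM_FEATURE_DOTPROD` when targeting a CPU that has the extension (e.g. with `-march=armv8.2-a+dotprod`), which is why the Raspberry Pi 4's Cortex-A72 (plain ARMv8-A) takes the fallback path.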
* Update ggml.c

---------

Co-authored-by: Georgi Gerganov
---
 ggml.c | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index 58a4c9b6d..42621267b 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1359,8 +1359,8 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
         const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
         const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);
 
+#if defined(__ARM_FEATURE_DOTPROD)
         // dot product into int32x4_t
-        // assume that vdotq_s32 is always available, if not, should check for __ARM_FEATURE_DOTPROD
         int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls);
         int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls);
 
@@ -1374,6 +1374,37 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
 #else
         sum0 += d0_0*d1_0*(vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3));
         sum1 += d0_1*d1_1*(vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3));
+#endif
+#else
+        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
+        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
+
+        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
+        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
+
+        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
+        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
+
+        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
+        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
+
+        const int16x8_t pl_0 = vaddq_s16(pl0l, pl0h);
+        const int16x8_t ph_0 = vaddq_s16(ph0l, ph0h);
+
+        const int16x8_t pl_1 = vaddq_s16(pl1l, pl1h);
+        const int16x8_t ph_1 = vaddq_s16(ph1l, ph1h);
+
+        const int16x8_t p_0 = vaddq_s16(pl_0, ph_0);
+        const int16x8_t p_1 = vaddq_s16(pl_1, ph_1);
+
+        // scalar
+#if defined(__ARM_FEATURE_QRDMX)
+        sum0 += d0_0*d1_0*vaddvq_s16(p_0);
+        sum1 += d0_1*d1_1*vaddvq_s16(p_1);
+#else
+        sum0 += d0_0*d1_0*(vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7));
+        sum1 += d0_1*d1_1*(vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7));
+#endif
 #endif
     }
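
Editor's note: to sanity-check the fallback path added above, a small standalone harness along these lines can compare the NEON reduction against a scalar reference. This is hypothetical test code, not part of the patch; `vaddvq_s16` assumes an AArch64 target, and the test values are chosen small enough that the int16 accumulation cannot overflow.

```c
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    int8_t a[16], b[16];
    for (int i = 0; i < 16; i++) {
        a[i] = (int8_t)(i - 8);       // values in [-8, 7], like the 4-bit quants
        b[i] = (int8_t)(3 * i - 20);
    }

    // scalar reference dot product
    int32_t ref = 0;
    for (int i = 0; i < 16; i++) {
        ref += (int32_t)a[i] * (int32_t)b[i];
    }

    // NEON path as in the patch: widen with vmull_s8, add, then reduce
    const int8x16_t va = vld1q_s8(a);
    const int8x16_t vb = vld1q_s8(b);
    const int16x8_t pl = vmull_s8(vget_low_s8(va),  vget_low_s8(vb));
    const int16x8_t ph = vmull_s8(vget_high_s8(va), vget_high_s8(vb));
    const int32_t neon = vaddvq_s16(vaddq_s16(pl, ph));

    printf("ref=%d neon=%d %s\n", ref, neon, ref == neon ? "OK" : "MISMATCH");
    return ref == neon ? 0 : 1;
}
```

On an AArch64 machine this builds with a plain `cc -O2`, since `vmull_s8` and the `vaddvq_*` reductions are baseline ARMv8 NEON and need no extra `-march` flags.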