Improvements for Windows with Snapdragon X

AndreasKunar 2024-07-17 11:54:53 +02:00
parent 820665f3a1
commit 1289e3516e
2 changed files with 14 additions and 7 deletions

@@ -16,7 +16,7 @@ In order to build llama.cpp you have four different options.
       make
       ```
-  - On Windows:
+  - On Windows (x86/x64 only, arm64 requires cmake):
     1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
     2. Extract `w64devkit` on your pc.
@@ -31,6 +31,13 @@ In order to build llama.cpp you have four different options.
   - For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
   - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
   - For faster repeated compilation, install [ccache](https://ccache.dev/).
+  - For Windows:
+    - Install CMake, e.g. via `winget install cmake`:
+      - As an alternative to the w64devkit mentioned in "using make" above, install MSVC (e.g. via the Visual Studio 2022 Community Edition).
+    - For Windows on ARM you need MSVC installed and _additionally_:
+      - Install [clang via LLVM for woa64](https://releases.llvm.org) to enable better ARM optimizations (clang needs the MSVC backend).
+      - When using clang, the first build step needs to be `cmake --preset arm64-windows-llvm-release` (instead of the `cmake -B ...` invocation, which defaults to MSVC); see the command sketch after this diff.
+      - Note: Building for ARM can also be done with MSVC alone (without installing clang or using the preset), but this does not support e.g. the accelerated Q4_0_4_4/Q4_0_4_8 kernels (a 2-2.5x prompt-processing speedup on the CPU), because the MSVC frontend cannot inline ARM assembly code.
   - For debug builds, run `make LLAMA_DEBUG=1`
 - Using `CMake`:
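
For reference, the clang/LLVM path on Windows on ARM boils down to roughly the following two commands; a minimal sketch, assuming MSVC, CMake and clang/LLVM for woa64 are already installed, and that the preset writes its output to `build-arm64-windows-llvm-release` (the directory name is an assumption, check `CMakePresets.json` for the preset's actual `binaryDir`):

```bash
# configure with the clang/LLVM preset instead of the MSVC default
cmake --preset arm64-windows-llvm-release
# build; the directory name below is assumed from the preset name
cmake --build build-arm64-windows-llvm-release
```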

@@ -392,7 +392,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
+#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -501,7 +501,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
                 "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -613,7 +613,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     if (svcntw() == 8) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
@@ -753,7 +753,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
+#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -1271,7 +1271,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
                 "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -1727,7 +1727,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     if (svcntw() == 8) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
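
The guard added throughout this file, `&& ! ((defined(_MSC_VER)) && ! defined(__clang__))`, reads as "not (pure MSVC)": clang-cl also defines `_MSC_VER`, so additionally checking `__clang__` keeps clang builds on the optimized path and only compiles the inline-assembly kernels out under the plain MSVC front end, which cannot handle GCC-style ARM inline assembly. A self-contained sketch of the same detection idiom (illustrative only, not code from the commit):

```c
#include <stdio.h>

int main(void) {
#if defined(_MSC_VER) && !defined(__clang__)
    /* Pure MSVC front end: GCC-style ARM inline asm is unsupported,
       so the Q4_0_4_4 / Q4_0_4_8 kernels would be compiled out here. */
    puts("pure MSVC: inline-asm kernels disabled");
#elif defined(__clang__)
    /* clang, including clang-cl (which also defines _MSC_VER). */
    puts("clang: inline-asm kernels enabled");
#else
    /* GCC and other compilers. */
    puts("other compiler: inline-asm kernels enabled");
#endif
    return 0;
}
```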