From d7e852c1bc8e85bf62a6f1aede08cd2de723404a Mon Sep 17 00:00:00 2001 From: jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> Date: Tue, 21 May 2024 14:39:48 +0200 Subject: [PATCH 01/34] Tokenizer SPM fixes for phi-3 and llama-spm (bugfix) (#7425) * Update brute force test: add_special * Update brute force test: default values for add_bos_token and add_eos_token * Enable rtrim when pre-inserting BOS Co-authored-by: Georgi Gerganov * Revert "server : fix test regexes" --- convert-hf-to-gguf.py | 4 ++-- examples/server/tests/features/server.feature | 10 ++++----- .../server/tests/features/slotsave.feature | 4 ++-- llama.cpp | 11 +++++----- tests/test-tokenizer-random.py | 22 +++++++++++-------- 5 files changed, 28 insertions(+), 23 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 8937a4981..1acf45bf2 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1749,7 +1749,7 @@ class Phi3MiniModel(Model): token_id = int(token_id) token = foken_data["content"].encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN: - assert(tokens[token_id] == token) + assert tokens[token_id] == token tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -1765,7 +1765,7 @@ class Phi3MiniModel(Model): token_id = int(foken_data["id"]) token = foken_data["content"].encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN: - assert(tokens[token_id] == token) + assert tokens[token_id] == token tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index 048cfad06..d21c09135 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -37,8 +37,8 @@ Feature: llama.cpp server Examples: Prompts | prompt | n_predict | re_content | n_prompt | n_predicted | truncated | - | I believe the meaning of life is | 8 | (read\|going\|pretty)+ | 18 | 8 | not | - | Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids\|Anna\|forest)+ | 45 | 64 | not | + | I believe the meaning of life is | 8 | (read\|going)+ | 18 | 8 | not | + | Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids\|Anna\|forest)+ | 46 | 64 | not | Scenario: Completion prompt truncated Given a prompt: @@ -67,8 +67,8 @@ Feature: llama.cpp server Examples: Prompts | model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated | - | llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 76 | 8 | disabled | not | - | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|fireplace)+ | -1 | 64 | enabled | | + | llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled | not | + | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|Annabyear)+ | -1 | 64 | enabled | | Scenario Outline: OAI Compatibility w/ response format @@ -84,7 +84,7 @@ Feature: llama.cpp server | response_format | n_predicted | re_content | | {"type": "json_object", "schema": {"const": "42"}} | 5 | "42" | | {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] | - | {"type": "json_object"} | 10 | \{ " Saragine. | + | {"type": "json_object"} | 10 | \{ " Jacky. | Scenario: Tokenize / Detokenize diff --git a/examples/server/tests/features/slotsave.feature b/examples/server/tests/features/slotsave.feature index ba4ecb6f5..1c281c074 100644 --- a/examples/server/tests/features/slotsave.feature +++ b/examples/server/tests/features/slotsave.feature @@ -26,7 +26,7 @@ Feature: llama.cpp server slot management # Since we have cache, this should only process the last tokens Given a user prompt "What is the capital of Germany?" And a completion request with no api error - Then 24 tokens are predicted matching (Thank|special|Lily) + Then 24 tokens are predicted matching (Thank|special) And 7 prompt tokens are processed # Loading the original cache into slot 0, # we should only be processing 1 prompt token and get the same output @@ -41,7 +41,7 @@ Feature: llama.cpp server slot management Given a user prompt "What is the capital of Germany?" And using slot id 1 And a completion request with no api error - Then 24 tokens are predicted matching (Thank|special|Lily) + Then 24 tokens are predicted matching (Thank|special) And 1 prompt tokens are processed Scenario: Erase Slot diff --git a/llama.cpp b/llama.cpp index e2ebe1752..d26fe559a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12498,15 +12498,16 @@ static std::vector llama_tokenize_internal(const llama_vocab & // tokenizer.encode('', add_special_tokens=True) returns [1] // tokenizer.encode('', add_special_tokens=False) returns [] - if (add_special && vocab.special_add_bos != 0) { - GGML_ASSERT(vocab.special_bos_id != -1); - output.push_back(vocab.special_bos_id); - } - static const bool rtrim = true; //TODO: as param bool is_prev_special = false; bool special_token_rtrim = false; + if (add_special && vocab.special_add_bos != 0) { + GGML_ASSERT(vocab.special_bos_id != -1); + output.push_back(vocab.special_bos_id); + is_prev_special = true; + } + for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { // without adding this leading whitespace, we do not get the same results as the original tokenizer diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 1166ac1e4..7e1b656e5 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -154,19 +154,22 @@ def generator_custom_text_edge_cases() -> Iterator[str]: '\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM) 'Cửa Việt', # llama-3, ignore_merges = true 'a', # Phi-3 fail - '<|endoftext|>' # Phi-3 fail + '<|endoftext|>', # Phi-3 fail 'a\na', # TODO: Bert fail ] -def generator_random_special_tokens(special_tokens:list[str], iterations=100) -> Iterator[str]: - special_tokens = set(special_tokens) +def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]: + special_tokens = set(tokenizer.all_special_tokens) special_tokens.update([" ", "\n", "\t", "-", "!", "one", "1", "", ""]) special_tokens = list(sorted(special_tokens)) rand = random.Random() for m in range(iterations): rand.seed(m) words = rand.choices(special_tokens, k=500) + if tokenizer.add_bos_token: # skip spam warning of double BOS + while words and words[0] == tokenizer.bos_token: + words.pop(0) yield "".join(words) @@ -290,18 +293,19 @@ def main(argv: list[str] = None): model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096)) tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer) - def func_tokenize2(text: str): - return tokenizer.encode(text, add_special_tokens=False) - - parse_special = all(len(func_tokenize2(t)) == 1 for t in tokenizer.all_special_tokens) + tokenizer.add_bos_token = getattr(tokenizer, "add_bos_token", True) + tokenizer.add_eos_token = getattr(tokenizer, "add_eos_token", False) def func_tokenize1(text: str): - return model.tokenize(text, add_special=False, parse_special=parse_special) + return model.tokenize(text, add_special=True, parse_special=True) + + def func_tokenize2(text: str): + return tokenizer.encode(text, add_special_tokens=True) vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True))) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text()) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases()) - test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer.all_special_tokens, 10_000)) + test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer, 10_000)) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab)) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000)) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000)) From d8ee90222791afff2ab666ded4cb6195fd94cced Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 21 May 2024 16:02:12 +0200 Subject: [PATCH 02/34] CUDA: deduplicate mmq code (#7397) --- ggml-cuda/mmq.cu | 1237 ++++++++++------------------------------------ 1 file changed, 271 insertions(+), 966 deletions(-) diff --git a/ggml-cuda/mmq.cu b/ggml-cuda/mmq.cu index 7948f1b12..5b540d375 100644 --- a/ggml-cuda/mmq.cu +++ b/ggml-cuda/mmq.cu @@ -9,6 +9,135 @@ typedef float (*vec_dot_q_mul_mat_cuda_t)( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k); typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); +typedef void (mul_mat_q_t)( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst); + +struct mmq_arch_config_t { + int x; + int y; + int nwarps; +}; + +struct mmq_config_t { + mmq_arch_config_t rdna2; + mmq_arch_config_t rdna1; + mmq_arch_config_t ampere; + mmq_arch_config_t pascal; +}; + +constexpr mmq_config_t MMQ_CONFIG_Q4_0 = { +// x y nwarps + { 64, 128, 8}, + { 64, 64, 8}, +#ifdef CUDA_USE_TENSOR_CORES + { 4, 32, 4}, +#else + { 64, 128, 4}, +#endif // CUDA_USE_TENSOR_CORES + { 64, 64, 8}, +}; +constexpr mmq_config_t MMQ_CONFIG_Q4_1 = { +// x y nwarps + { 64, 128, 8}, + { 64, 64, 8}, +#ifdef CUDA_USE_TENSOR_CORES + { 4, 32, 4}, +#else + { 64, 128, 4}, +#endif // CUDA_USE_TENSOR_CORES + { 64, 64, 8}, +}; +constexpr mmq_config_t MMQ_CONFIG_Q5_0 = { +// x y nwarps + { 64, 128, 8}, + { 64, 64, 8}, +#ifdef CUDA_USE_TENSOR_CORES + { 4, 32, 4}, +#else + {128, 64, 4}, +#endif // CUDA_USE_TENSOR_CORES + { 64, 64, 8}, +}; +constexpr mmq_config_t MMQ_CONFIG_Q5_1 = { +// x y nwarps + { 64, 128, 8}, + { 64, 64, 8}, +#ifdef CUDA_USE_TENSOR_CORES + { 4, 32, 4}, +#else + {128, 64, 4}, +#endif // CUDA_USE_TENSOR_CORES + { 64, 64, 8}, +}; +constexpr mmq_config_t MMQ_CONFIG_Q8_0 = { +// x y nwarps + { 64, 128, 8}, + { 64, 64, 8}, +#ifdef CUDA_USE_TENSOR_CORES + { 4, 32, 4}, +#else + {128, 64, 4}, +#endif // CUDA_USE_TENSOR_CORES + { 64, 64, 8}, +}; +constexpr mmq_config_t MMQ_CONFIG_Q2_K = { +// x y nwarps + { 64, 128, 8}, + {128, 32, 8}, +#ifdef CUDA_USE_TENSOR_CORES + { 4, 32, 4}, +#else + { 64, 128, 4}, +#endif // CUDA_USE_TENSOR_CORES + { 64, 64, 8}, +}; +constexpr mmq_config_t MMQ_CONFIG_Q3_K = { +// x y nwarps + {128, 64, 8}, + { 32, 128, 8}, +#ifdef CUDA_USE_TENSOR_CORES + { 4, 32, 4}, +#else + {128, 128, 4}, +#endif // CUDA_USE_TENSOR_CORES + { 64, 64, 8}, +}; +constexpr mmq_config_t MMQ_CONFIG_Q4_K = { +// x y nwarps + { 64, 128, 8}, + { 32, 64, 8}, +#ifdef CUDA_USE_TENSOR_CORES + { 4, 32, 4}, +#else + { 64, 128, 4}, +#endif // CUDA_USE_TENSOR_CORES + { 64, 64, 8}, +}; +constexpr mmq_config_t MMQ_CONFIG_Q5_K = { +// x y nwarps + { 64, 128, 8}, + { 32, 64, 8}, +#ifdef CUDA_USE_TENSOR_CORES + { 4, 32, 4}, +#else + { 64, 128, 4}, +#endif // CUDA_USE_TENSOR_CORES + { 64, 64, 8}, +}; +constexpr mmq_config_t MMQ_CONFIG_Q6_K = { +// x y nwarps + { 64, 128, 8}, + { 32, 64, 8}, +#ifdef CUDA_USE_TENSOR_CORES + { 4, 32, 4}, +#else + { 64, 64, 4}, +#endif // CUDA_USE_TENSOR_CORES + { 64, 64, 8}, +}; + +// ------------------------------------------------------------ template static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { GGML_UNUSED(x_qh); @@ -943,25 +1072,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); } -#define MMQ_X_Q4_0_RDNA2 64 -#define MMQ_Y_Q4_0_RDNA2 128 -#define NWARPS_Q4_0_RDNA2 8 -#define MMQ_X_Q4_0_RDNA1 64 -#define MMQ_Y_Q4_0_RDNA1 64 -#define NWARPS_Q4_0_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q4_0_AMPERE 4 -#define MMQ_Y_Q4_0_AMPERE 32 -#define NWARPS_Q4_0_AMPERE 4 -#else -#define MMQ_X_Q4_0_AMPERE 64 -#define MMQ_Y_Q4_0_AMPERE 128 -#define NWARPS_Q4_0_AMPERE 4 -#endif -#define MMQ_X_Q4_0_PASCAL 64 -#define MMQ_Y_Q4_0_PASCAL 64 -#define NWARPS_Q4_0_PASCAL 8 - template static __device__ __forceinline__ void mul_mat_q( @@ -1072,1107 +1182,265 @@ static __device__ __forceinline__ void mul_mat_q( } } +static constexpr __device__ mmq_arch_config_t get_arch_config_device(mmq_config_t mmq_config) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + +#if defined(RDNA3) || defined(RDNA2) + return mmq_config.rdna2; +#else + return mmq_config.rdna1; +#endif // defined(RDNA3) || defined(RDNA2) + +#else + +#if __CUDA_ARCH__ >= CC_VOLTA + return mmq_config.ampere; +#else + return mmq_config.pascal; +#endif // __CUDA_ARCH__ >= CC_VOLTA + +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +} + template static __global__ void #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2) + __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_0.rdna2.nwarps, 2) #endif // defined(RDNA3) || defined(RDNA2) #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q4_0( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q4_0_RDNA2; - const int mmq_y = MMQ_Y_Q4_0_RDNA2; - const int nwarps = NWARPS_Q4_0_RDNA2; -#else - const int mmq_x = MMQ_X_Q4_0_RDNA1; - const int mmq_y = MMQ_Y_Q4_0_RDNA1; - const int nwarps = NWARPS_Q4_0_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) +#if __CUDA_ARCH__ >= MIN_CC_DP4A + constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_0); - mul_mat_q, - load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q4_0_AMPERE; - const int mmq_y = MMQ_Y_Q4_0_AMPERE; - const int nwarps = NWARPS_Q4_0_AMPERE; - - mul_mat_q, - load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q4_0_PASCAL; - const int mmq_y = MMQ_Y_Q4_0_PASCAL; - const int nwarps = NWARPS_Q4_0_PASCAL; - - mul_mat_q, - load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat); NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -#define MMQ_X_Q4_1_RDNA2 64 -#define MMQ_Y_Q4_1_RDNA2 128 -#define NWARPS_Q4_1_RDNA2 8 -#define MMQ_X_Q4_1_RDNA1 64 -#define MMQ_Y_Q4_1_RDNA1 64 -#define NWARPS_Q4_1_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q4_1_AMPERE 4 -#define MMQ_Y_Q4_1_AMPERE 32 -#define NWARPS_Q4_1_AMPERE 4 -#else -#define MMQ_X_Q4_1_AMPERE 64 -#define MMQ_Y_Q4_1_AMPERE 128 -#define NWARPS_Q4_1_AMPERE 4 -#endif -#define MMQ_X_Q4_1_PASCAL 64 -#define MMQ_Y_Q4_1_PASCAL 64 -#define NWARPS_Q4_1_PASCAL 8 - template static __global__ void #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2) + __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.rdna2.nwarps, 2) #endif // defined(RDNA3) || defined(RDNA2) #elif __CUDA_ARCH__ < CC_VOLTA - __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2) + __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.pascal.nwarps, 2) #endif // __CUDA_ARCH__ < CC_VOLTA mul_mat_q4_1( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q4_1_RDNA2; - const int mmq_y = MMQ_Y_Q4_1_RDNA2; - const int nwarps = NWARPS_Q4_1_RDNA2; -#else - const int mmq_x = MMQ_X_Q4_1_RDNA1; - const int mmq_y = MMQ_Y_Q4_1_RDNA1; - const int nwarps = NWARPS_Q4_1_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) +#if __CUDA_ARCH__ >= MIN_CC_DP4A + constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_1); - mul_mat_q, - load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q4_1_AMPERE; - const int mmq_y = MMQ_Y_Q4_1_AMPERE; - const int nwarps = NWARPS_Q4_1_AMPERE; - - mul_mat_q, - load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q4_1_PASCAL; - const int mmq_y = MMQ_Y_Q4_1_PASCAL; - const int nwarps = NWARPS_Q4_1_PASCAL; - - mul_mat_q, - load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else GGML_UNUSED(vec_dot_q4_1_q8_1_mul_mat); NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -#define MMQ_X_Q5_0_RDNA2 64 -#define MMQ_Y_Q5_0_RDNA2 128 -#define NWARPS_Q5_0_RDNA2 8 -#define MMQ_X_Q5_0_RDNA1 64 -#define MMQ_Y_Q5_0_RDNA1 64 -#define NWARPS_Q5_0_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q5_0_AMPERE 4 -#define MMQ_Y_Q5_0_AMPERE 32 -#define NWARPS_Q5_0_AMPERE 4 -#else -#define MMQ_X_Q5_0_AMPERE 128 -#define MMQ_Y_Q5_0_AMPERE 64 -#define NWARPS_Q5_0_AMPERE 4 -#endif -#define MMQ_X_Q5_0_PASCAL 64 -#define MMQ_Y_Q5_0_PASCAL 64 -#define NWARPS_Q5_0_PASCAL 8 - template static __global__ void #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2) + __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_0.rdna2.nwarps, 2) #endif // defined(RDNA3) || defined(RDNA2) #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q5_0( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q5_0_RDNA2; - const int mmq_y = MMQ_Y_Q5_0_RDNA2; - const int nwarps = NWARPS_Q5_0_RDNA2; -#else - const int mmq_x = MMQ_X_Q5_0_RDNA1; - const int mmq_y = MMQ_Y_Q5_0_RDNA1; - const int nwarps = NWARPS_Q5_0_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) +#if __CUDA_ARCH__ >= MIN_CC_DP4A + constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_0); - mul_mat_q, - load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q5_0_AMPERE; - const int mmq_y = MMQ_Y_Q5_0_AMPERE; - const int nwarps = NWARPS_Q5_0_AMPERE; - - mul_mat_q, - load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q5_0_PASCAL; - const int mmq_y = MMQ_Y_Q5_0_PASCAL; - const int nwarps = NWARPS_Q5_0_PASCAL; - - mul_mat_q, - load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else GGML_UNUSED(vec_dot_q5_0_q8_1_mul_mat); NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -#define MMQ_X_Q5_1_RDNA2 64 -#define MMQ_Y_Q5_1_RDNA2 128 -#define NWARPS_Q5_1_RDNA2 8 -#define MMQ_X_Q5_1_RDNA1 64 -#define MMQ_Y_Q5_1_RDNA1 64 -#define NWARPS_Q5_1_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q5_1_AMPERE 4 -#define MMQ_Y_Q5_1_AMPERE 32 -#define NWARPS_Q5_1_AMPERE 4 -#else -#define MMQ_X_Q5_1_AMPERE 128 -#define MMQ_Y_Q5_1_AMPERE 64 -#define NWARPS_Q5_1_AMPERE 4 -#endif -#define MMQ_X_Q5_1_PASCAL 64 -#define MMQ_Y_Q5_1_PASCAL 64 -#define NWARPS_Q5_1_PASCAL 8 - template static __global__ void #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2) + __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_1.rdna2.nwarps, 2) #endif // defined(RDNA3) || defined(RDNA2) #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q5_1( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q5_1_RDNA2; - const int mmq_y = MMQ_Y_Q5_1_RDNA2; - const int nwarps = NWARPS_Q5_1_RDNA2; -#else - const int mmq_x = MMQ_X_Q5_1_RDNA1; - const int mmq_y = MMQ_Y_Q5_1_RDNA1; - const int nwarps = NWARPS_Q5_1_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) +#if __CUDA_ARCH__ >= MIN_CC_DP4A + constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_1); - mul_mat_q, - load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q5_1_AMPERE; - const int mmq_y = MMQ_Y_Q5_1_AMPERE; - const int nwarps = NWARPS_Q5_1_AMPERE; - - mul_mat_q, - load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q5_1_PASCAL; - const int mmq_y = MMQ_Y_Q5_1_PASCAL; - const int nwarps = NWARPS_Q5_1_PASCAL; - - mul_mat_q, - load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else GGML_UNUSED(vec_dot_q5_1_q8_1_mul_mat); NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -#define MMQ_X_Q8_0_RDNA2 64 -#define MMQ_Y_Q8_0_RDNA2 128 -#define NWARPS_Q8_0_RDNA2 8 -#define MMQ_X_Q8_0_RDNA1 64 -#define MMQ_Y_Q8_0_RDNA1 64 -#define NWARPS_Q8_0_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q8_0_AMPERE 4 -#define MMQ_Y_Q8_0_AMPERE 32 -#define NWARPS_Q8_0_AMPERE 4 -#else -#define MMQ_X_Q8_0_AMPERE 128 -#define MMQ_Y_Q8_0_AMPERE 64 -#define NWARPS_Q8_0_AMPERE 4 -#endif -#define MMQ_X_Q8_0_PASCAL 64 -#define MMQ_Y_Q8_0_PASCAL 64 -#define NWARPS_Q8_0_PASCAL 8 - template static __global__ void #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2) + __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q8_0.rdna2.nwarps, 2) #endif // defined(RDNA3) || defined(RDNA2) #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q8_0( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q8_0_RDNA2; - const int mmq_y = MMQ_Y_Q8_0_RDNA2; - const int nwarps = NWARPS_Q8_0_RDNA2; -#else - const int mmq_x = MMQ_X_Q8_0_RDNA1; - const int mmq_y = MMQ_Y_Q8_0_RDNA1; - const int nwarps = NWARPS_Q8_0_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) +#if __CUDA_ARCH__ >= MIN_CC_DP4A + constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q8_0); - mul_mat_q, - load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q8_0_AMPERE; - const int mmq_y = MMQ_Y_Q8_0_AMPERE; - const int nwarps = NWARPS_Q8_0_AMPERE; - - mul_mat_q, - load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q8_0_PASCAL; - const int mmq_y = MMQ_Y_Q8_0_PASCAL; - const int nwarps = NWARPS_Q8_0_PASCAL; - - mul_mat_q, - load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else GGML_UNUSED(vec_dot_q8_0_q8_1_mul_mat); NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -#define MMQ_X_Q2_K_RDNA2 64 -#define MMQ_Y_Q2_K_RDNA2 128 -#define NWARPS_Q2_K_RDNA2 8 -#define MMQ_X_Q2_K_RDNA1 128 -#define MMQ_Y_Q2_K_RDNA1 32 -#define NWARPS_Q2_K_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q2_K_AMPERE 4 -#define MMQ_Y_Q2_K_AMPERE 32 -#define NWARPS_Q2_K_AMPERE 4 -#else -#define MMQ_X_Q2_K_AMPERE 64 -#define MMQ_Y_Q2_K_AMPERE 128 -#define NWARPS_Q2_K_AMPERE 4 -#endif -#define MMQ_X_Q2_K_PASCAL 64 -#define MMQ_Y_Q2_K_PASCAL 64 -#define NWARPS_Q2_K_PASCAL 8 - template static __global__ void #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2) + __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q2_K.rdna2.nwarps, 2) #endif // defined(RDNA3) || defined(RDNA2) #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q2_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q2_K_RDNA2; - const int mmq_y = MMQ_Y_Q2_K_RDNA2; - const int nwarps = NWARPS_Q2_K_RDNA2; -#else - const int mmq_x = MMQ_X_Q2_K_RDNA1; - const int mmq_y = MMQ_Y_Q2_K_RDNA1; - const int nwarps = NWARPS_Q2_K_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) +#if __CUDA_ARCH__ >= MIN_CC_DP4A + constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q2_K); - mul_mat_q, - load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q2_K_AMPERE; - const int mmq_y = MMQ_Y_Q2_K_AMPERE; - const int nwarps = NWARPS_Q2_K_AMPERE; - - mul_mat_q, - load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q2_K_PASCAL; - const int mmq_y = MMQ_Y_Q2_K_PASCAL; - const int nwarps = NWARPS_Q2_K_PASCAL; - - mul_mat_q, - load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else GGML_UNUSED(vec_dot_q2_K_q8_1_mul_mat); NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -#define MMQ_X_Q3_K_RDNA2 128 -#define MMQ_Y_Q3_K_RDNA2 64 -#define NWARPS_Q3_K_RDNA2 8 -#define MMQ_X_Q3_K_RDNA1 32 -#define MMQ_Y_Q3_K_RDNA1 128 -#define NWARPS_Q3_K_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q3_K_AMPERE 4 -#define MMQ_Y_Q3_K_AMPERE 32 -#define NWARPS_Q3_K_AMPERE 4 -#else -#define MMQ_X_Q3_K_AMPERE 128 -#define MMQ_Y_Q3_K_AMPERE 128 -#define NWARPS_Q3_K_AMPERE 4 -#endif -#define MMQ_X_Q3_K_PASCAL 64 -#define MMQ_Y_Q3_K_PASCAL 64 -#define NWARPS_Q3_K_PASCAL 8 - template static __global__ void #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2) + __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.rdna2.nwarps, 2) #endif // defined(RDNA3) || defined(RDNA2) #elif __CUDA_ARCH__ < CC_VOLTA - __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2) + __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.pascal.nwarps, 2) #endif // __CUDA_ARCH__ < CC_VOLTA mul_mat_q3_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q3_K_RDNA2; - const int mmq_y = MMQ_Y_Q3_K_RDNA2; - const int nwarps = NWARPS_Q3_K_RDNA2; -#else - const int mmq_x = MMQ_X_Q3_K_RDNA1; - const int mmq_y = MMQ_Y_Q3_K_RDNA1; - const int nwarps = NWARPS_Q3_K_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) +#if __CUDA_ARCH__ >= MIN_CC_DP4A + constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q3_K); - mul_mat_q, - load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q3_K_AMPERE; - const int mmq_y = MMQ_Y_Q3_K_AMPERE; - const int nwarps = NWARPS_Q3_K_AMPERE; - - mul_mat_q, - load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q3_K_PASCAL; - const int mmq_y = MMQ_Y_Q3_K_PASCAL; - const int nwarps = NWARPS_Q3_K_PASCAL; - - mul_mat_q, - load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else GGML_UNUSED(vec_dot_q3_K_q8_1_mul_mat); NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -#define MMQ_X_Q4_K_RDNA2 64 -#define MMQ_Y_Q4_K_RDNA2 128 -#define NWARPS_Q4_K_RDNA2 8 -#define MMQ_X_Q4_K_RDNA1 32 -#define MMQ_Y_Q4_K_RDNA1 64 -#define NWARPS_Q4_K_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q4_K_AMPERE 4 -#define MMQ_Y_Q4_K_AMPERE 32 -#define NWARPS_Q4_K_AMPERE 4 -#else -#define MMQ_X_Q4_K_AMPERE 64 -#define MMQ_Y_Q4_K_AMPERE 128 -#define NWARPS_Q4_K_AMPERE 4 -#endif -#define MMQ_X_Q4_K_PASCAL 64 -#define MMQ_Y_Q4_K_PASCAL 64 -#define NWARPS_Q4_K_PASCAL 8 - template static __global__ void #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2) + __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.rdna2.nwarps, 2) #endif // defined(RDNA3) || defined(RDNA2) #elif __CUDA_ARCH__ < CC_VOLTA - __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2) + __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.pascal.nwarps, 2) #endif // __CUDA_ARCH__ < CC_VOLTA mul_mat_q4_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q4_K_RDNA2; - const int mmq_y = MMQ_Y_Q4_K_RDNA2; - const int nwarps = NWARPS_Q4_K_RDNA2; -#else - const int mmq_x = MMQ_X_Q4_K_RDNA1; - const int mmq_y = MMQ_Y_Q4_K_RDNA1; - const int nwarps = NWARPS_Q4_K_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) +#if __CUDA_ARCH__ >= MIN_CC_DP4A + constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_K); - mul_mat_q, - load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q4_K_AMPERE; - const int mmq_y = MMQ_Y_Q4_K_AMPERE; - const int nwarps = NWARPS_Q4_K_AMPERE; - - mul_mat_q, - load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q4_K_PASCAL; - const int mmq_y = MMQ_Y_Q4_K_PASCAL; - const int nwarps = NWARPS_Q4_K_PASCAL; - - mul_mat_q, - load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else GGML_UNUSED(vec_dot_q4_K_q8_1_mul_mat); NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -#define MMQ_X_Q5_K_RDNA2 64 -#define MMQ_Y_Q5_K_RDNA2 128 -#define NWARPS_Q5_K_RDNA2 8 -#define MMQ_X_Q5_K_RDNA1 32 -#define MMQ_Y_Q5_K_RDNA1 64 -#define NWARPS_Q5_K_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q5_K_AMPERE 4 -#define MMQ_Y_Q5_K_AMPERE 32 -#define NWARPS_Q5_K_AMPERE 4 -#else -#define MMQ_X_Q5_K_AMPERE 64 -#define MMQ_Y_Q5_K_AMPERE 128 -#define NWARPS_Q5_K_AMPERE 4 -#endif -#define MMQ_X_Q5_K_PASCAL 64 -#define MMQ_Y_Q5_K_PASCAL 64 -#define NWARPS_Q5_K_PASCAL 8 - template static __global__ void #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2) + __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_K.rdna2.nwarps, 2) #endif // defined(RDNA3) || defined(RDNA2) #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q5_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q5_K_RDNA2; - const int mmq_y = MMQ_Y_Q5_K_RDNA2; - const int nwarps = NWARPS_Q5_K_RDNA2; -#else - const int mmq_x = MMQ_X_Q5_K_RDNA1; - const int mmq_y = MMQ_Y_Q5_K_RDNA1; - const int nwarps = NWARPS_Q5_K_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) +#if __CUDA_ARCH__ >= MIN_CC_DP4A + constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_K); - mul_mat_q, - load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q5_K_AMPERE; - const int mmq_y = MMQ_Y_Q5_K_AMPERE; - const int nwarps = NWARPS_Q5_K_AMPERE; - - mul_mat_q, - load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q5_K_PASCAL; - const int mmq_y = MMQ_Y_Q5_K_PASCAL; - const int nwarps = NWARPS_Q5_K_PASCAL; - - mul_mat_q, - load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else GGML_UNUSED(vec_dot_q5_K_q8_1_mul_mat); NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -#define MMQ_X_Q6_K_RDNA2 64 -#define MMQ_Y_Q6_K_RDNA2 128 -#define NWARPS_Q6_K_RDNA2 8 -#define MMQ_X_Q6_K_RDNA1 32 -#define MMQ_Y_Q6_K_RDNA1 64 -#define NWARPS_Q6_K_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q6_K_AMPERE 4 -#define MMQ_Y_Q6_K_AMPERE 32 -#define NWARPS_Q6_K_AMPERE 4 -#else -#define MMQ_X_Q6_K_AMPERE 64 -#define MMQ_Y_Q6_K_AMPERE 64 -#define NWARPS_Q6_K_AMPERE 4 -#endif -#define MMQ_X_Q6_K_PASCAL 64 -#define MMQ_Y_Q6_K_PASCAL 64 -#define NWARPS_Q6_K_PASCAL 8 - template static __global__ void #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2) + __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q6_K.rdna2.nwarps, 2) #endif // defined(RDNA3) || defined(RDNA2) #elif __CUDA_ARCH__ < CC_VOLTA - __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2) + __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.pascal.nwarps, 2) #endif // __CUDA_ARCH__ < CC_VOLTA mul_mat_q6_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q6_K_RDNA2; - const int mmq_y = MMQ_Y_Q6_K_RDNA2; - const int nwarps = NWARPS_Q6_K_RDNA2; -#else - const int mmq_x = MMQ_X_Q6_K_RDNA1; - const int mmq_y = MMQ_Y_Q6_K_RDNA1; - const int nwarps = NWARPS_Q6_K_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) +#if __CUDA_ARCH__ >= MIN_CC_DP4A + constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q6_K); - mul_mat_q, - load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q6_K_AMPERE; - const int mmq_y = MMQ_Y_Q6_K_AMPERE; - const int nwarps = NWARPS_Q6_K_AMPERE; - - mul_mat_q, - load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q6_K_PASCAL; - const int mmq_y = MMQ_Y_Q6_K_PASCAL; - const int nwarps = NWARPS_Q6_K_PASCAL; - - mul_mat_q, - load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else GGML_UNUSED(vec_dot_q6_K_q8_1_mul_mat); NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -static void ggml_mul_mat_q4_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id = ggml_cuda_get_device(); - const int compute_capability = ggml_cuda_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q4_0_RDNA2; - mmq_y = MMQ_Y_Q4_0_RDNA2; - nwarps = NWARPS_Q4_0_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q4_0_RDNA1; - mmq_y = MMQ_Y_Q4_0_RDNA1; - nwarps = NWARPS_Q4_0_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q4_0_AMPERE; - mmq_y = MMQ_Y_Q4_0_AMPERE; - nwarps = NWARPS_Q4_0_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q4_0_PASCAL; - mmq_y = MMQ_Y_Q4_0_PASCAL; - nwarps = NWARPS_Q4_0_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q4_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q4_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -static void ggml_mul_mat_q4_1_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id = ggml_cuda_get_device(); - const int compute_capability = ggml_cuda_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q4_1_RDNA2; - mmq_y = MMQ_Y_Q4_1_RDNA2; - nwarps = NWARPS_Q4_1_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q4_1_RDNA1; - mmq_y = MMQ_Y_Q4_1_RDNA1; - nwarps = NWARPS_Q4_1_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q4_1_AMPERE; - mmq_y = MMQ_Y_Q4_1_AMPERE; - nwarps = NWARPS_Q4_1_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q4_1_PASCAL; - mmq_y = MMQ_Y_Q4_1_PASCAL; - nwarps = NWARPS_Q4_1_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q4_1<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q4_1<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -static void ggml_mul_mat_q5_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id = ggml_cuda_get_device(); - const int compute_capability = ggml_cuda_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q5_0_RDNA2; - mmq_y = MMQ_Y_Q5_0_RDNA2; - nwarps = NWARPS_Q5_0_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q5_0_RDNA1; - mmq_y = MMQ_Y_Q5_0_RDNA1; - nwarps = NWARPS_Q5_0_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q5_0_AMPERE; - mmq_y = MMQ_Y_Q5_0_AMPERE; - nwarps = NWARPS_Q5_0_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q5_0_PASCAL; - mmq_y = MMQ_Y_Q5_0_PASCAL; - nwarps = NWARPS_Q5_0_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q5_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q5_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -static void ggml_mul_mat_q5_1_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id = ggml_cuda_get_device(); - const int compute_capability = ggml_cuda_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q5_1_RDNA2; - mmq_y = MMQ_Y_Q5_1_RDNA2; - nwarps = NWARPS_Q5_1_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q5_1_RDNA1; - mmq_y = MMQ_Y_Q5_1_RDNA1; - nwarps = NWARPS_Q5_1_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q5_1_AMPERE; - mmq_y = MMQ_Y_Q5_1_AMPERE; - nwarps = NWARPS_Q5_1_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q5_1_PASCAL; - mmq_y = MMQ_Y_Q5_1_PASCAL; - nwarps = NWARPS_Q5_1_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q5_1<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q5_1<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -static void ggml_mul_mat_q8_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id = ggml_cuda_get_device(); - const int compute_capability = ggml_cuda_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q8_0_RDNA2; - mmq_y = MMQ_Y_Q8_0_RDNA2; - nwarps = NWARPS_Q8_0_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q8_0_RDNA1; - mmq_y = MMQ_Y_Q8_0_RDNA1; - nwarps = NWARPS_Q8_0_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q8_0_AMPERE; - mmq_y = MMQ_Y_Q8_0_AMPERE; - nwarps = NWARPS_Q8_0_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q8_0_PASCAL; - mmq_y = MMQ_Y_Q8_0_PASCAL; - nwarps = NWARPS_Q8_0_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q8_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q8_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -static void ggml_mul_mat_q2_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id = ggml_cuda_get_device(); - const int compute_capability = ggml_cuda_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q2_K_RDNA2; - mmq_y = MMQ_Y_Q2_K_RDNA2; - nwarps = NWARPS_Q2_K_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q2_K_RDNA1; - mmq_y = MMQ_Y_Q2_K_RDNA1; - nwarps = NWARPS_Q2_K_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q2_K_AMPERE; - mmq_y = MMQ_Y_Q2_K_AMPERE; - nwarps = NWARPS_Q2_K_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q2_K_PASCAL; - mmq_y = MMQ_Y_Q2_K_PASCAL; - nwarps = NWARPS_Q2_K_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q2_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q2_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -static void ggml_mul_mat_q3_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - -#if QK_K == 256 - - int id = ggml_cuda_get_device(); - const int compute_capability = ggml_cuda_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q3_K_RDNA2; - mmq_y = MMQ_Y_Q3_K_RDNA2; - nwarps = NWARPS_Q3_K_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q3_K_RDNA1; - mmq_y = MMQ_Y_Q3_K_RDNA1; - nwarps = NWARPS_Q3_K_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q3_K_AMPERE; - mmq_y = MMQ_Y_Q3_K_AMPERE; - nwarps = NWARPS_Q3_K_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q3_K_PASCAL; - mmq_y = MMQ_Y_Q3_K_PASCAL; - nwarps = NWARPS_Q3_K_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q3_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q3_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -#endif -} - -static void ggml_mul_mat_q4_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id = ggml_cuda_get_device(); - const int compute_capability = ggml_cuda_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q4_K_RDNA2; - mmq_y = MMQ_Y_Q4_K_RDNA2; - nwarps = NWARPS_Q4_K_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q4_K_RDNA1; - mmq_y = MMQ_Y_Q4_K_RDNA1; - nwarps = NWARPS_Q4_K_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q4_K_AMPERE; - mmq_y = MMQ_Y_Q4_K_AMPERE; - nwarps = NWARPS_Q4_K_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q4_K_PASCAL; - mmq_y = MMQ_Y_Q4_K_PASCAL; - nwarps = NWARPS_Q4_K_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q4_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q4_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -static void ggml_mul_mat_q5_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id = ggml_cuda_get_device(); - const int compute_capability = ggml_cuda_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q5_K_RDNA2; - mmq_y = MMQ_Y_Q5_K_RDNA2; - nwarps = NWARPS_Q5_K_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q5_K_RDNA1; - mmq_y = MMQ_Y_Q5_K_RDNA1; - nwarps = NWARPS_Q5_K_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q5_K_AMPERE; - mmq_y = MMQ_Y_Q5_K_AMPERE; - nwarps = NWARPS_Q5_K_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q5_K_PASCAL; - mmq_y = MMQ_Y_Q5_K_PASCAL; - nwarps = NWARPS_Q5_K_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q5_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q5_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -static void ggml_mul_mat_q6_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id = ggml_cuda_get_device(); - const int compute_capability = ggml_cuda_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q6_K_RDNA2; - mmq_y = MMQ_Y_Q6_K_RDNA2; - nwarps = NWARPS_Q6_K_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q6_K_RDNA1; - mmq_y = MMQ_Y_Q6_K_RDNA1; - nwarps = NWARPS_Q6_K_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q6_K_AMPERE; - mmq_y = MMQ_Y_Q6_K_AMPERE; - nwarps = NWARPS_Q6_K_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q6_K_PASCAL; - mmq_y = MMQ_Y_Q6_K_PASCAL; - nwarps = NWARPS_Q6_K_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q6_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q6_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} +#define MMQ_SWITCH_CASE(type_suffix) \ + case GGML_TYPE_Q##type_suffix: if (row_diff % arch_config.y == 0) { \ + const bool need_check = false; \ + mul_mat_q##type_suffix<<>> \ + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \ + } else { \ + const bool need_check = true; \ + mul_mat_q##type_suffix<<>> \ + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \ + } break; \ void ggml_cuda_op_mul_mat_q( ggml_backend_cuda_context & ctx, @@ -2190,47 +1458,84 @@ void ggml_cuda_op_mul_mat_q( const int64_t row_diff = row_high - row_low; int id = ggml_cuda_get_device(); + const int compute_capability = ggml_cuda_info().devices[id].cc; // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the kernel writes into const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; + mmq_config_t mmq_config; + switch (src0->type) { case GGML_TYPE_Q4_0: - ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + mmq_config = MMQ_CONFIG_Q4_0; break; case GGML_TYPE_Q4_1: - ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + mmq_config = MMQ_CONFIG_Q4_1; break; case GGML_TYPE_Q5_0: - ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + mmq_config = MMQ_CONFIG_Q5_0; break; case GGML_TYPE_Q5_1: - ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + mmq_config = MMQ_CONFIG_Q5_1; break; case GGML_TYPE_Q8_0: - ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + mmq_config = MMQ_CONFIG_Q8_0; break; case GGML_TYPE_Q2_K: - ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + mmq_config = MMQ_CONFIG_Q2_K; break; case GGML_TYPE_Q3_K: - ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + mmq_config = MMQ_CONFIG_Q3_K; break; case GGML_TYPE_Q4_K: - ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + mmq_config = MMQ_CONFIG_Q4_K; break; case GGML_TYPE_Q5_K: - ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + mmq_config = MMQ_CONFIG_Q5_K; break; case GGML_TYPE_Q6_K: - ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + mmq_config = MMQ_CONFIG_Q6_K; break; default: GGML_ASSERT(false); break; } + mmq_arch_config_t arch_config; + if (compute_capability >= CC_RDNA2) { + arch_config = mmq_config.rdna2; + } else if (compute_capability >= CC_OFFSET_AMD) { + arch_config = mmq_config.rdna1; + } else if (compute_capability >= CC_VOLTA) { + arch_config = mmq_config.ampere; + } else if (compute_capability >= MIN_CC_DP4A) { + arch_config = mmq_config.pascal; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (row_diff + arch_config.y - 1) / arch_config.y; + const int block_num_y = (src1_ncols + arch_config.x - 1) / arch_config.x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, arch_config.nwarps, 1); + + switch (src0->type) { + MMQ_SWITCH_CASE(4_0) + MMQ_SWITCH_CASE(4_1) + MMQ_SWITCH_CASE(5_0) + MMQ_SWITCH_CASE(5_1) + MMQ_SWITCH_CASE(8_0) + MMQ_SWITCH_CASE(2_K) + MMQ_SWITCH_CASE(3_K) + MMQ_SWITCH_CASE(4_K) + MMQ_SWITCH_CASE(5_K) + MMQ_SWITCH_CASE(6_K) + default: + GGML_ASSERT(false); + break; + } + GGML_UNUSED(src1); GGML_UNUSED(dst); GGML_UNUSED(src1_ddf_i); From 11474e756de3f56b760986e73086d40e787e52f8 Mon Sep 17 00:00:00 2001 From: Amir Date: Tue, 21 May 2024 17:13:12 +0300 Subject: [PATCH 03/34] examples: cache hf model when --model not provided (#7353) * examples: cache hf model when --model not provided * examples: cache hf model when --model not provided * examples: cache hf model when --model not provided * examples: cache hf model when --model not provided * examples: cache hf model when --model not provided --- common/common.cpp | 32 +++++++++++++++++++++++++++++++- common/common.h | 1 + examples/main/README.md | 2 ++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index e624fc7f3..ae11650b4 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1354,7 +1354,12 @@ void gpt_params_handle_model_default(gpt_params & params) { } params.hf_file = params.model; } else if (params.model.empty()) { - params.model = "models/" + string_split(params.hf_file, '/').back(); + std::string cache_directory = get_cache_directory(); + const bool success = create_directory_with_parents(cache_directory); + if (!success) { + throw std::runtime_error("failed to create cache directory: " + cache_directory); + } + params.model = cache_directory + string_split(params.hf_file, '/').back(); } } else if (!params.model_url.empty()) { if (params.model.empty()) { @@ -2516,6 +2521,31 @@ bool create_directory_with_parents(const std::string & path) { #endif // _WIN32 } +std::string get_cache_directory() { + std::string cache_directory = ""; + if (getenv("LLAMA_CACHE")) { + cache_directory = std::getenv("LLAMA_CACHE"); + if (cache_directory.back() != DIRECTORY_SEPARATOR) { + cache_directory += DIRECTORY_SEPARATOR; + } + } else { +#ifdef __linux__ + if (std::getenv("XDG_CACHE_HOME")) { + cache_directory = std::getenv("XDG_CACHE_HOME"); + } else { + cache_directory = std::getenv("HOME") + std::string("/.cache/"); + } +#elif defined(__APPLE__) + cache_directory = std::getenv("HOME") + std::string("/Library/Caches/"); +#elif defined(_WIN32) + cache_directory = std::getenv("APPDATA"); +#endif // __linux__ + cache_directory += "llama.cpp"; + cache_directory += DIRECTORY_SEPARATOR; + } + return cache_directory; +} + void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector & data) { if (data.empty()) { fprintf(stream, "%s:\n", prop_name); diff --git a/common/common.h b/common/common.h index 566490e2f..a8e5e50e6 100644 --- a/common/common.h +++ b/common/common.h @@ -281,6 +281,7 @@ bool llama_should_add_bos_token(const llama_model * model); // bool create_directory_with_parents(const std::string & path); +std::string get_cache_directory(); void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector & data); void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector & data); void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data); diff --git a/examples/main/README.md b/examples/main/README.md index 97e2ae4c2..ee930f4e7 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -325,3 +325,5 @@ These options provide extra functionality and customization when running the LLa - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. + +- `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache. From c3f8d583560b4f261fa21c976793e538c60cd66c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 21 May 2024 19:53:48 +0300 Subject: [PATCH 04/34] tests : test-tokenizer-0.sh print more info (#7402) --- convert-hf-to-gguf-update.py | 2 +- convert-hf-to-gguf.py | 2 +- tests/test-tokenizer-0.sh | 9 +++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index 45404b32b..1923b88ba 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -72,7 +72,7 @@ models = [ {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, - {"name": "stablelm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", }, + {"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", }, {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", }, {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", }, {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", }, diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 1acf45bf2..6357d4034 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -447,7 +447,7 @@ class Model: # ref: https://huggingface.co/openai-community/gpt2 res = "gpt-2" if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3": - # ref: https://huggingface.co/stabilityai/stablelm-2-1_6b + # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b res = "stablelm2" if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": # ref: https://huggingface.co/smallcloudai/Refact-1_6-base diff --git a/tests/test-tokenizer-0.sh b/tests/test-tokenizer-0.sh index 2fb8632d8..1fec8bbf1 100755 --- a/tests/test-tokenizer-0.sh +++ b/tests/test-tokenizer-0.sh @@ -17,10 +17,15 @@ make -j tests/test-tokenizer-0 printf "Testing %s on %s ...\n" $name $input -python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1 -cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in" +set -e +printf "Tokenizing using (py) Python AutoTokenizer ...\n" +python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1 + +printf "Tokenizing using (cpp) llama.cpp ...\n" ./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1 + +cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in" cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in" diff $input.tok $input.tokcpp > /dev/null 2>&1 From fcf6538ba6702c55eaec70da9a75c81d04900a72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 21 May 2024 19:27:12 +0200 Subject: [PATCH 05/34] CUDA: fix unused warning in mmq.cu (#7442) --- ggml-cuda/mmq.cu | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ggml-cuda/mmq.cu b/ggml-cuda/mmq.cu index 5b540d375..933d799ce 100644 --- a/ggml-cuda/mmq.cu +++ b/ggml-cuda/mmq.cu @@ -1220,6 +1220,7 @@ template static __global__ void load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else + GGML_UNUSED(get_arch_config_device); GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A @@ -1244,6 +1245,7 @@ template static __global__ void load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else + GGML_UNUSED(get_arch_config_device); GGML_UNUSED(vec_dot_q4_1_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A @@ -1266,6 +1268,7 @@ template static __global__ void load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else + GGML_UNUSED(get_arch_config_device); GGML_UNUSED(vec_dot_q5_0_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A @@ -1288,6 +1291,7 @@ mul_mat_q5_1( load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else + GGML_UNUSED(get_arch_config_device); GGML_UNUSED(vec_dot_q5_1_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A @@ -1310,6 +1314,7 @@ template static __global__ void load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else + GGML_UNUSED(get_arch_config_device); GGML_UNUSED(vec_dot_q8_0_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A @@ -1332,6 +1337,7 @@ mul_mat_q2_K( load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else + GGML_UNUSED(get_arch_config_device); GGML_UNUSED(vec_dot_q2_K_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A @@ -1356,6 +1362,7 @@ template static __global__ void load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else + GGML_UNUSED(get_arch_config_device); GGML_UNUSED(vec_dot_q3_K_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A @@ -1380,6 +1387,7 @@ template static __global__ void load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else + GGML_UNUSED(get_arch_config_device); GGML_UNUSED(vec_dot_q4_K_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A @@ -1402,6 +1410,7 @@ mul_mat_q5_K( load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else + GGML_UNUSED(get_arch_config_device); GGML_UNUSED(vec_dot_q5_K_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A @@ -1426,6 +1435,7 @@ template static __global__ void load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else + GGML_UNUSED(get_arch_config_device); GGML_UNUSED(vec_dot_q6_K_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A From e402de364b643cb89ea9f43057733b5d36298670 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Tue, 21 May 2024 20:40:00 +0100 Subject: [PATCH 06/34] `grammars`: fix resampling logic regression (#7424) --- common/sampling.cpp | 13 +++++++------ examples/main/main.cpp | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index f0f1b92d3..7fc2e2158 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -179,7 +179,7 @@ static llama_token llama_sampling_sample_impl( struct llama_context * ctx_main, struct llama_context * ctx_cfg, const int idx, - bool is_resampling) { // Add a parameter to indicate if we are resampling + bool is_resampling) { const llama_sampling_params & params = ctx_sampling->params; const float temp = params.temp; @@ -188,8 +188,8 @@ static llama_token llama_sampling_sample_impl( const float mirostat_eta = params.mirostat_eta; std::vector original_logits; - auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits); - if (!is_resampling) { + auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits); + if (ctx_sampling->grammar != NULL && !is_resampling) { GGML_ASSERT(!original_logits.empty()); } llama_token id = 0; @@ -252,7 +252,7 @@ static llama_token llama_sampling_sample_impl( // Restore logits from the copy std::copy(original_logits.begin(), original_logits.end(), logits); - return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling + return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true); } } @@ -285,7 +285,8 @@ static llama_token_data_array llama_sampling_prepare_impl( // Get a pointer to the logits float * logits = llama_get_logits_ith(ctx_main, idx); - if (apply_grammar && original_logits != NULL) { + if (ctx_sampling->grammar != NULL && !apply_grammar) { + GGML_ASSERT(original_logits != NULL); // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this. *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))}; } @@ -342,7 +343,7 @@ llama_token llama_sampling_sample( struct llama_context * ctx_cfg, const int idx) { // Call the implementation function with is_resampling set to false by default - return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false); + return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false); } llama_token_data_array llama_sampling_prepare( diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 9dee41001..832b51ee0 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -707,7 +707,7 @@ int main(int argc, char ** argv) { const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); - llama_sampling_accept(ctx_sampling, ctx, id, true); + llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true); LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); @@ -728,7 +728,7 @@ int main(int argc, char ** argv) { // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); + llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { From 6369bf04336ab60e5c892dd77a3246df91015147 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 21 May 2024 23:03:42 +0300 Subject: [PATCH 07/34] metal : handle F16 inf values, fix FA partial offload (#7434) ggml-ci --- ggml-metal.metal | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/ggml-metal.metal b/ggml-metal.metal index 386e9195f..cf262e834 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -2204,11 +2204,7 @@ kernel void kernel_flash_attn_ext_f16( // pointer to the mask device const half * mp = (device const half *) (mask + iq1*nb31); - // prepare diagonal scale matrix - simdgroup_float8x8 mscale(scale); - - // prepare diagonal slope matrix - simdgroup_float8x8 mslope(1.0f); + float slope = 1.0f; // ALiBi if (max_bias > 0.0f) { @@ -2217,7 +2213,7 @@ kernel void kernel_flash_attn_ext_f16( const float base = h < n_head_log2 ? m0 : m1; const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; - mslope = simdgroup_float8x8(pow(base, exph)); + slope = pow(base, exph); } // loop over the KV cache @@ -2242,18 +2238,20 @@ kernel void kernel_flash_attn_ext_f16( simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk); } + simdgroup_store(mqk, ss + 8*cc, TF, 0, false); + + const short tx = tiisg%4; + const short ty = tiisg/4; + if (mask != q) { // mqk = mqk*scale + mask*slope - simdgroup_half8x8 mm; - simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false); - simdgroup_multiply(mm, mslope, mm); - simdgroup_multiply_accumulate(mqk, mqk, mscale, mm); + ss[8*cc + ty*TF + 2*tx + 0] = scale*ss[8*cc + ty*TF + 2*tx + 0] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 0]; + ss[8*cc + ty*TF + 2*tx + 1] = scale*ss[8*cc + ty*TF + 2*tx + 1] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 1]; } else { // mqk = mqk*scale - simdgroup_multiply(mqk, mscale, mqk); + ss[8*cc + ty*TF + 2*tx + 0] *= scale; + ss[8*cc + ty*TF + 2*tx + 1] *= scale; } - - simdgroup_store(mqk, ss + 8*cc, TF, 0, false); } } @@ -2816,8 +2814,7 @@ kernel void kernel_cpy_f32_f16( for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); - // TODO: is there a better way to handle -INFINITY? - dst_data[i00] = src[0] == -INFINITY ? -MAXHALF : src[0]; + dst_data[i00] = src[0]; } } From 201cc11afa0a1950e1f632390b2ac6c937a0d8f0 Mon Sep 17 00:00:00 2001 From: liuwei-git <14815172+liuwei-git@users.noreply.github.com> Date: Wed, 22 May 2024 04:28:32 +0800 Subject: [PATCH 08/34] llama : add phi3 128K model support (#7225) * add phi3 128k support in convert-hf-to-gguf * add phi3 128k support in cuda * address build warnings on llama.cpp * adjust index value in cuda long rope freq factors * add long rope support in ggml cpu backend * make freq factors only depend on ctx size * remove unused rope scaling type 'su' frin gguf converter * fix flint warnings on convert-hf-to-gguf.py * set to the short freq factor when context size is small than trained context size * add one line of comments * metal : support rope freq_factors * ggml : update ggml_rope_ext API to support freq. factors * backends : add dev messages to support rope freq. factors * minor : style * tests : update to use new rope API * backends : fix pragma semicolons * minor : cleanup * llama : move rope factors from KV header to tensors * llama : remove tmp assert * cuda : fix compile warning * convert : read/write n_head_kv * llama : fix uninitialized tensors --------- Co-authored-by: Georgi Gerganov --- convert-hf-to-gguf.py | 49 +++- examples/finetune/finetune.cpp | 4 +- .../train-text-from-scratch.cpp | 4 +- ggml-cuda/rope.cu | 72 +++-- ggml-kompute.cpp | 4 + ggml-metal.m | 121 ++++---- ggml-metal.metal | 6 +- ggml-sycl.cpp | 3 + ggml-vulkan.cpp | 4 + ggml.c | 88 +++++- ggml.h | 49 +++- gguf-py/gguf/constants.py | 17 +- gguf-py/gguf/gguf_writer.py | 3 + llama.cpp | 277 +++++++++++------- tests/test-backend-ops.cpp | 16 +- 15 files changed, 484 insertions(+), 233 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 6357d4034..daad1c4fc 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -14,6 +14,7 @@ from pathlib import Path from hashlib import sha256 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast +import math import numpy as np import torch @@ -1784,23 +1785,59 @@ class Phi3MiniModel(Model): def set_gguf_parameters(self): block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) - rot_pct = 1.0 n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_head = self.find_hparam(["num_attention_heads", "n_head"]) + n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) rms_eps = self.find_hparam(["rms_norm_eps"]) + max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) + orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) + rope_dims = n_embd // n_head self.gguf_writer.add_name("Phi3") - self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) - + self.gguf_writer.add_context_length(max_pos_embds) + self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(8192) + self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"])) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) self.gguf_writer.add_layer_norm_rms_eps(rms_eps) - self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + self.gguf_writer.add_rope_dimension_count(rope_dims) + self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) self.gguf_writer.add_file_type(self.ftype) + # write rope scaling for long context (128k) model + rope_scaling = self.find_hparam(['rope_scaling'], True) + if (rope_scaling is None): + return + + scale = max_pos_embds / orig_max_pos_embds + + rope_scaling_type = rope_scaling.get('type', '').lower() + if len(rope_scaling_type) == 0: + raise KeyError('Missing the required key rope_scaling.type') + + if rope_scaling_type == 'su': + attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 + elif rope_scaling_type == 'yarn': + attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0 + else: + raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet') + + self.gguf_writer.add_rope_scaling_attn_factors(attn_factor) + + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + + self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32)) + self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32)) + @Model.register("PlamoForCausalLM") class PlamoModel(Model): diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 22743b1bf..992426c1b 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -563,8 +563,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( // not capturing these, to silcence warnings const int rope_mode = 0; - return ggml_rope_custom(ctx, - t, KQ_pos, n_rot, rope_mode, n_ctx, 0, + return ggml_rope_ext(ctx, + t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f ); }; diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 587418cc7..45bdfa8f5 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -301,8 +301,8 @@ static struct ggml_tensor * llama_build_train_graphs( // not capturing these, to silcence warnings const int rope_mode = 0; - return ggml_rope_custom( - ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f + return ggml_rope_ext( + ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f ); }; diff --git a/ggml-cuda/rope.cu b/ggml-cuda/rope.cu index 4b0d2e5ad..4a558f4b3 100644 --- a/ggml-cuda/rope.cu +++ b/ggml-cuda/rope.cu @@ -58,10 +58,10 @@ static __global__ void rope( dst[i + 1] = x0*sin_theta + x1*cos_theta; } -template +template static __global__ void rope_neox( const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, - float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims + float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims, const float * freq_factors ) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); @@ -88,7 +88,9 @@ static __global__ void rope_neox( float cur_rot = inv_ndims * ic - ib; const int p = has_pos ? pos[i2] : 0; - const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f); + const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f; + + const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f)/freq_factor; float cos_theta, sin_theta; rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); @@ -164,7 +166,7 @@ static void rope_cuda( template static void rope_neox_cuda( const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream ) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); @@ -175,15 +177,29 @@ static void rope_neox_cuda( const float inv_ndims = -1.0f / n_dims; if (pos == nullptr) { - rope_neox<<>>( - x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, - theta_scale, inv_ndims - ); + if (freq_factors == nullptr) { + rope_neox<<>>( + x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, + theta_scale, inv_ndims, freq_factors + ); + } else { + rope_neox<<>>( + x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, + theta_scale, inv_ndims, freq_factors + ); + } } else { - rope_neox<<>>( - x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, - theta_scale, inv_ndims - ); + if (freq_factors == nullptr) { + rope_neox<<>>( + x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, + theta_scale, inv_ndims, freq_factors + ); + } else { + rope_neox<<>>( + x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, + theta_scale, inv_ndims, freq_factors + ); + } } } @@ -214,24 +230,27 @@ static void rope_cuda_f32( static void rope_neox_cuda_f16( const half * x, half * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) { + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) { - rope_neox_cuda(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream); + rope_neox_cuda(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); } static void rope_neox_cuda_f32( const float * x, float * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream ) { - rope_neox_cuda(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream); + rope_neox_cuda(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); } void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; + const ggml_tensor * src2 = dst->src[2]; + const float * src0_d = (const float *)src0->data; const float * src1_d = (const float *)src1->data; + float * dst_d = (float *)dst->data; cudaStream_t stream = ctx.stream(); @@ -241,7 +260,6 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; - const int64_t ne2 = dst->ne[2]; const int64_t nrows = ggml_nrows(src0); //const int n_past = ((int32_t *) dst->op_params)[0]; @@ -259,16 +277,22 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + const float * freq_factors = nullptr; const int32_t * pos = nullptr; - if ((mode & 1) == 0) { - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(src1->ne[0] == ne2); - pos = (const int32_t *) src1_d; - } const bool is_neox = mode & 2; const bool is_glm = mode & 4; + if (is_neox) { + pos = (const int32_t *) src1_d; + + if (src2 != nullptr) { + freq_factors = (const float *) src2->data; + } + } else { + GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox"); + } + rope_corr_dims corr_dims; ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); @@ -280,12 +304,12 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { if (src0->type == GGML_TYPE_F32) { rope_neox_cuda_f32( (const float *)src0_d, (float *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, stream + attn_factor, corr_dims, freq_factors, stream ); } else if (src0->type == GGML_TYPE_F16) { rope_neox_cuda_f16( (const half *)src0_d, (half *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, stream + attn_factor, corr_dims, freq_factors, stream ); } else { GGML_ASSERT(false); diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 3f033d58b..6c6058b2a 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1677,6 +1677,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml } break; case GGML_OP_ROPE: { +#pragma message("TODO: implement phi3 frequency factors support") +#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225") + GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet"); + GGML_ASSERT(ne10 == ne02); GGML_ASSERT(src0t == dstt); // const int n_past = ((int32_t *) dst->op_params)[0]; diff --git a/ggml-metal.m b/ggml-metal.m index b0b16dbf7..5d5ad20ad 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -927,22 +927,32 @@ static enum ggml_status ggml_metal_graph_compute( const int64_t ne10 = src1 ? src1->ne[0] : 0; const int64_t ne11 = src1 ? src1->ne[1] : 0; const int64_t ne12 = src1 ? src1->ne[2] : 0; - const int64_t ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13); + const int64_t ne13 = src1 ? src1->ne[3] : 0; const uint64_t nb10 = src1 ? src1->nb[0] : 0; const uint64_t nb11 = src1 ? src1->nb[1] : 0; const uint64_t nb12 = src1 ? src1->nb[2] : 0; - const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13); + const uint64_t nb13 = src1 ? src1->nb[3] : 0; - const int64_t ne0 = dst ? dst->ne[0] : 0; - const int64_t ne1 = dst ? dst->ne[1] : 0; - const int64_t ne2 = dst ? dst->ne[2] : 0; - const int64_t ne3 = dst ? dst->ne[3] : 0; + const int64_t ne20 = src2 ? src2->ne[0] : 0; + const int64_t ne21 = src2 ? src2->ne[1] : 0; + const int64_t ne22 = src2 ? src2->ne[2] : 0; GGML_UNUSED(ne22); + const int64_t ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23); - const uint64_t nb0 = dst ? dst->nb[0] : 0; - const uint64_t nb1 = dst ? dst->nb[1] : 0; - const uint64_t nb2 = dst ? dst->nb[2] : 0; - const uint64_t nb3 = dst ? dst->nb[3] : 0; + const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20); + const uint64_t nb21 = src2 ? src2->nb[1] : 0; + const uint64_t nb22 = src2 ? src2->nb[2] : 0; + const uint64_t nb23 = src2 ? src2->nb[3] : 0; + + const int64_t ne0 = dst ? dst->ne[0] : 0; + const int64_t ne1 = dst ? dst->ne[1] : 0; + const int64_t ne2 = dst ? dst->ne[2] : 0; + const int64_t ne3 = dst ? dst->ne[3] : 0; + + const uint64_t nb0 = dst ? dst->nb[0] : 0; + const uint64_t nb1 = dst ? dst->nb[1] : 0; + const uint64_t nb2 = dst ? dst->nb[2] : 0; + const uint64_t nb3 = dst ? dst->nb[3] : 0; const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; @@ -1785,16 +1795,6 @@ static enum ggml_status ggml_metal_graph_compute( const int n_as = src0->ne[2]; // src2 = ids - const int64_t ne20 = src2->ne[0]; - const int64_t ne21 = src2->ne[1]; - const int64_t ne22 = src2->ne[2]; GGML_UNUSED(ne22); - const int64_t ne23 = src2->ne[3]; GGML_UNUSED(ne23); - - const uint64_t nb20 = src2->nb[0]; GGML_UNUSED(nb20); - const uint64_t nb21 = src2->nb[1]; - const uint64_t nb22 = src2->nb[2]; GGML_UNUSED(nb22); - const uint64_t nb23 = src2->nb[3]; GGML_UNUSED(nb23); - const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t); GGML_ASSERT(src2t == GGML_TYPE_I32); @@ -2244,7 +2244,13 @@ static enum ggml_status ggml_metal_graph_compute( // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; - float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); @@ -2252,6 +2258,15 @@ static enum ggml_status ggml_metal_graph_compute( memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; + + GGML_ASSERT(!is_glm && "GLM RoPE not implemented in Metal"); + + if (!is_neox) { + GGML_ASSERT(id_src2 == nil && "TODO: freq_factors not implemented for !is_neox"); + } + id pipeline = nil; switch (src0->type) { @@ -2263,33 +2278,38 @@ static enum ggml_status ggml_metal_graph_compute( [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:19]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; - [encoder setBytes:&mode length:sizeof( int) atIndex:21]; - [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22]; - [encoder setBytes:&freq_base length:sizeof( float) atIndex:23]; - [encoder setBytes:&freq_scale length:sizeof( float) atIndex:24]; - [encoder setBytes:&ext_factor length:sizeof( float) atIndex:25]; - [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26]; - [encoder setBytes:&beta_fast length:sizeof( float) atIndex:27]; - [encoder setBytes:&beta_slow length:sizeof( float) atIndex:28]; + if (id_src2 != nil) { + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; + } else { + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:2]; + } + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:10]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:11]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:14]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:15]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:18]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:19]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:20]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:21]; + [encoder setBytes:&mode length:sizeof( int) atIndex:22]; + [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:23]; + [encoder setBytes:&freq_base length:sizeof( float) atIndex:24]; + [encoder setBytes:&freq_scale length:sizeof( float) atIndex:25]; + [encoder setBytes:&ext_factor length:sizeof( float) atIndex:26]; + [encoder setBytes:&attn_factor length:sizeof( float) atIndex:27]; + [encoder setBytes:&beta_fast length:sizeof( float) atIndex:28]; + [encoder setBytes:&beta_slow length:sizeof( float) atIndex:29]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; @@ -2535,11 +2555,6 @@ static enum ggml_status ggml_metal_graph_compute( GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) && "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big"); - const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20); - const uint64_t nb21 = src2 ? src2->nb[1] : 0; - const uint64_t nb22 = src2 ? src2->nb[2] : 0; - const uint64_t nb23 = src2 ? src2->nb[3] : 0; - const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30); //const int64_t ne31 = src3 ? src3->ne[1] : 0; const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32); diff --git a/ggml-metal.metal b/ggml-metal.metal index cf262e834..c5eb25280 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -1640,6 +1640,7 @@ static void rope_yarn_corr_dims( typedef void (rope_t)( device const void * src0, device const int32_t * src1, + device const float * src2, device float * dst, constant int64_t & ne00, constant int64_t & ne01, @@ -1675,6 +1676,7 @@ template kernel void kernel_rope( device const void * src0, device const int32_t * src1, + device const float * src2, device float * dst, constant int64_t & ne00, constant int64_t & ne01, @@ -1744,8 +1746,10 @@ kernel void kernel_rope( // simplified from `(ib * n_dims + ic) * inv_ndims` const float cur_rot = inv_ndims*ic - ib; + const float freq_factor = src2 != src0 ? src2[ic/2] : 1.0f; + + const float theta = theta_0 * pow(freq_base, cur_rot) / freq_factor; - const float theta = theta_0 * pow(freq_base, cur_rot); float cos_theta, sin_theta; rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index eac8f5579..f486b6c0a 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -14454,6 +14454,9 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, const float *src0_dd, const float *src1_dd, float *dst_dd, const dpct::queue_ptr &main_stream) { +#pragma message("TODO: implement phi3 frequency factors support") +#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225") + GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet"); GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index aff451b63..16287a280 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -4238,6 +4238,10 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, } static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +#pragma message("TODO: implement phi3 frequency factors support") +#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225") + GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet"); + const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; // const int n_ctx = ((int32_t *) dst->op_params)[3]; diff --git a/ggml.c b/ggml.c index 4bd911528..37b16b7a9 100644 --- a/ggml.c +++ b/ggml.c @@ -6231,6 +6231,7 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, + struct ggml_tensor * c, int n_dims, int mode, int n_ctx, @@ -6248,6 +6249,11 @@ static struct ggml_tensor * ggml_rope_impl( GGML_ASSERT(b->type == GGML_TYPE_I32); GGML_ASSERT(a->ne[2] == b->ne[0]); + if (c) { + GGML_ASSERT(c->type == GGML_TYPE_F32); + GGML_ASSERT(c->ne[0] >= n_dims / 2); + } + bool is_node = false; if (a->grad) { @@ -6271,6 +6277,7 @@ static struct ggml_tensor * ggml_rope_impl( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; + result->src[2] = c; return result; } @@ -6283,7 +6290,7 @@ struct ggml_tensor * ggml_rope( int mode, int n_ctx) { return ggml_rope_impl( - ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false + ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false ); } @@ -6295,7 +6302,49 @@ struct ggml_tensor * ggml_rope_inplace( int mode, int n_ctx) { return ggml_rope_impl( - ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true + ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true + ); +} + +struct ggml_tensor * ggml_rope_ext( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + int n_dims, + int mode, + int n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return ggml_rope_impl( + ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false + ); +} + +struct ggml_tensor * ggml_rope_ext_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + int n_dims, + int mode, + int n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return ggml_rope_impl( + ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true ); } @@ -6314,7 +6363,7 @@ struct ggml_tensor * ggml_rope_custom( float beta_fast, float beta_slow) { return ggml_rope_impl( - ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false ); } @@ -6334,27 +6383,18 @@ struct ggml_tensor * ggml_rope_custom_inplace( float beta_fast, float beta_slow) { return ggml_rope_impl( - ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true ); } -struct ggml_tensor * ggml_rope_xpos_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int n_dims, - float base, - bool down) { - return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true); -} - // ggml_rope_back struct ggml_tensor * ggml_rope_back( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, + struct ggml_tensor * c, int n_dims, int mode, int n_ctx, @@ -6370,6 +6410,7 @@ struct ggml_tensor * ggml_rope_back( GGML_ASSERT(ggml_is_vector(b)); GGML_ASSERT(b->type == GGML_TYPE_I32); GGML_ASSERT(a->ne[2] == b->ne[0]); + GGML_ASSERT(c == NULL && "freq factors not implemented yet"); GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet"); @@ -14304,6 +14345,7 @@ static void ggml_compute_forward_rope_f32( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src2 = dst->src[2]; if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; @@ -14363,6 +14405,17 @@ static void ggml_compute_forward_rope_f32( const bool is_neox = mode & 2; const bool is_glm = mode & 4; + const float * freq_factors = NULL; + if (is_neox) { + if (src2 != NULL) { + GGML_ASSERT(src2->type == GGML_TYPE_F32); + GGML_ASSERT(src2->ne[0] >= n_dims / 2); + freq_factors = (const float *) src2->data; + } + } else { + GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for mode 1"); + } + // backward process uses inverse rotation by cos and sin. // cos and sin build a rotation matrix, where the inverse is the transpose. // this essentially just switches the sign of sin. @@ -14439,10 +14492,11 @@ static void ggml_compute_forward_rope_f32( // simplified from `(ib * n_dims + ic) * inv_ndims` float cur_rot = inv_ndims * ic - ib; + float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f; float cos_theta, sin_theta; rope_yarn( - theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta ); sin_theta *= sin_sign; @@ -18387,6 +18441,7 @@ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct gg static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) { struct ggml_tensor * src0 = tensor->src[0]; struct ggml_tensor * src1 = tensor->src[1]; + struct ggml_tensor * src2 = tensor->src[2]; switch (tensor->op) { case GGML_OP_DUP: @@ -18918,6 +18973,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_rope_back(ctx, tensor->grad, src1, + src2, n_dims, mode, n_ctx, @@ -18957,6 +19013,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_rope_impl(ctx, tensor->grad, src1, + src2, n_dims, mode, n_ctx, @@ -19038,7 +19095,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor masked); } - struct ggml_tensor * src2 = tensor->src[2]; const int64_t elem_q = ggml_nelements(src0); const int64_t elem_k = ggml_nelements(src1); const int64_t elem_v = ggml_nelements(src2); diff --git a/ggml.h b/ggml.h index 774757101..35ac9110c 100644 --- a/ggml.h +++ b/ggml.h @@ -1465,6 +1465,7 @@ extern "C" { // if mode & 4 == 1, ChatGLM style // // b is an int32 vector with size a->ne[2], it contains the positions + // c is freq factors (e.g. phi3-128k), (optional) GGML_API struct ggml_tensor * ggml_rope( struct ggml_context * ctx, struct ggml_tensor * a, @@ -1483,10 +1484,11 @@ extern "C" { int n_ctx); // custom RoPE - GGML_API struct ggml_tensor * ggml_rope_custom( + GGML_API struct ggml_tensor * ggml_rope_ext( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, + struct ggml_tensor * c, int n_dims, int mode, int n_ctx, @@ -1499,7 +1501,23 @@ extern "C" { float beta_slow); // in-place, returns view(a) - GGML_API struct ggml_tensor * ggml_rope_custom_inplace( + GGML_API struct ggml_tensor * ggml_rope_ext_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + int n_dims, + int mode, + int n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -1512,20 +1530,28 @@ extern "C" { float ext_factor, float attn_factor, float beta_fast, - float beta_slow); + float beta_slow), + "use ggml_rope_ext instead"); - // compute correction dims for YaRN RoPE scaling - GGML_CALL void ggml_rope_yarn_corr_dims( - int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); - - // xPos RoPE, in-place, returns view(a) - GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, int n_dims, - float base, - bool down); + int mode, + int n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow), + "use ggml_rope_ext_inplace instead"); + + // compute correction dims for YaRN RoPE scaling + GGML_CALL void ggml_rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); // rotary position embedding backward, i.e compute dx from dy // a - dy @@ -1533,6 +1559,7 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, + struct ggml_tensor * c, int n_dims, int mode, int n_ctx, diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 692120f4d..42df2e4d0 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -57,12 +57,13 @@ class Keys: CAUSAL = "{arch}.attention.causal" class Rope: - DIMENSION_COUNT = "{arch}.rope.dimension_count" - FREQ_BASE = "{arch}.rope.freq_base" - SCALING_TYPE = "{arch}.rope.scaling.type" - SCALING_FACTOR = "{arch}.rope.scaling.factor" - SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" - SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" + DIMENSION_COUNT = "{arch}.rope.dimension_count" + FREQ_BASE = "{arch}.rope.freq_base" + SCALING_TYPE = "{arch}.rope.scaling.type" + SCALING_FACTOR = "{arch}.rope.scaling.factor" + SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" + SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" + SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" class SSM: CONV_KERNEL = "{arch}.ssm.conv_kernel" @@ -148,6 +149,8 @@ class MODEL_TENSOR(IntEnum): OUTPUT = auto() OUTPUT_NORM = auto() ROPE_FREQS = auto() + ROPE_FACTORS_LONG = auto() + ROPE_FACTORS_SHORT = auto() ATTN_Q = auto() ATTN_K = auto() ATTN_V = auto() @@ -225,6 +228,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.OUTPUT_NORM: "output_norm", MODEL_TENSOR.OUTPUT: "output", MODEL_TENSOR.ROPE_FREQS: "rope_freqs", + MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long", + MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short", MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index d5e323a52..8b41b54ea 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -433,6 +433,9 @@ class GGUFWriter: def add_rope_scaling_factor(self, value: float) -> None: self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value) + def add_rope_scaling_attn_factors(self, value: Sequence[float]) -> None: + self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value) + def add_rope_scaling_orig_ctx_len(self, value: int) -> None: self.add_uint32(Keys.Rope.SCALING_ORIG_CTX_LEN.format(arch=self.arch), value) diff --git a/llama.cpp b/llama.cpp index d26fe559a..abff8c1c0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -304,6 +304,7 @@ enum llm_kv { LLM_KV_ROPE_SCALE_LINEAR, LLM_KV_ROPE_SCALING_TYPE, LLM_KV_ROPE_SCALING_FACTOR, + LLM_KV_ROPE_SCALING_ATTN_FACTOR, LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, LLM_KV_ROPE_SCALING_FINETUNED, @@ -381,6 +382,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, @@ -436,6 +438,8 @@ enum llm_tensor { LLM_TENSOR_OUTPUT, LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ROPE_FACTORS_LONG, + LLM_TENSOR_ROPE_FACTORS_SHORT, LLM_TENSOR_ATTN_Q, LLM_TENSOR_ATTN_K, LLM_TENSOR_ATTN_V, @@ -803,18 +807,20 @@ static const std::map> LLM_TENSOR_NA { LLM_ARCH_PHI3, { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" }, + { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, { @@ -1750,6 +1756,7 @@ struct llama_hparams { float f_norm_eps; float f_norm_rms_eps; + float rope_attn_factor = 1.0f; float rope_freq_base_train; float rope_freq_scale_train; uint32_t n_yarn_orig_ctx; @@ -1798,6 +1805,7 @@ struct llama_hparams { if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; + if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true; if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; @@ -2103,6 +2111,10 @@ struct llama_model { struct ggml_tensor * output; struct ggml_tensor * output_b; + // long rope factors + struct ggml_tensor * rope_long = nullptr; + struct ggml_tensor * rope_short = nullptr; + std::vector layers; llama_split_mode split_mode; @@ -3306,6 +3318,39 @@ struct llama_model_loader { return get_arr_n(llm_kv(kid), result, required); } + template + bool get_arr(const std::string & key, std::vector & result, const bool required = true) { + const int kid = gguf_find_key(meta, key.c_str()); + + if (kid < 0) { + if (required) { + throw std::runtime_error(format("key not found in model: %s", key.c_str())); + } + return false; + } + + struct GGUFMeta::ArrayInfo arr_info = + GGUFMeta::GKV::get_kv(meta, kid); + + if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) { + throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str())); + } + + // GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T)); + GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same::value)); + GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same::value)); + + result.resize(arr_info.length); + result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length); + + return true; + } + + template + bool get_arr(const enum llm_kv kid, T& result, const bool required = true) { + return get_arr(llm_kv(kid), result, required); + } + template bool get_key(const std::string & key, T & result, const bool required = true) { auto it = kv_overrides.find(key); @@ -3849,6 +3894,8 @@ static void llm_load_hparams( } hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale; + ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false); + // sanity check for n_rot (optional) { hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head; @@ -4880,6 +4927,7 @@ static bool llm_load_tensors( // create tensors for the weights { const int64_t n_embd = hparams.n_embd; + const int64_t n_embd_head = n_embd / hparams.n_head; const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); const int64_t n_embd_gqa = n_embd_v_gqa; @@ -5591,6 +5639,9 @@ static bool llm_load_tensors( { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }); + model.rope_long = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, false); + model.rope_short = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, false); + // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }); @@ -5601,12 +5652,12 @@ static bool llm_load_tensors( ggml_context* ctx_layer = ctx_for_layer(i); ggml_context* ctx_split = ctx_for_layer_split(i); - auto& layer = model.layers[i]; + auto & layer = model.layers[i]; layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }); layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false); - layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }); layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }); @@ -6821,17 +6872,20 @@ struct llm_build_context { cb(lctx.inp_K_shift, "K_shift", -1); ggml_set_input(lctx.inp_K_shift); + struct ggml_tensor * rope_factors = build_rope_factors(); + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = // we rotate only the first n_rot dimensions - ggml_rope_custom_inplace(ctx0, + ggml_rope_ext_inplace(ctx0, ggml_view_3d(ctx0, kv_self.k_l[il], n_embd_head_k, n_head_kv, n_ctx, ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), 0), - lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + cb(tmp, "K_shifted", il); ggml_build_forward_expand(gf, tmp); } @@ -6934,6 +6988,17 @@ struct llm_build_context { return lctx.inp_pos; } + struct ggml_tensor * build_rope_factors() { + // choose long/short freq factors based on the context size + const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; + + if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) { + return model.rope_long; + } + + return model.rope_short; + } + struct ggml_tensor * build_inp_out_ids() { lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); cb(lctx.inp_out_ids, "inp_out_ids", -1); @@ -7041,15 +7106,15 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -7171,13 +7236,13 @@ struct llm_build_context { switch (model.type) { case MODEL_7B: - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -7283,15 +7348,15 @@ struct llm_build_context { struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -7404,14 +7469,14 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // using mode = 2 for neox mode - Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7527,15 +7592,15 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -7679,15 +7744,15 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -8032,15 +8097,15 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -8472,15 +8537,15 @@ struct llm_build_context { } - Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -8592,14 +8657,14 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // using mode = 2 for neox mode - Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -8703,15 +8768,15 @@ struct llm_build_context { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -8817,15 +8882,15 @@ struct llm_build_context { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -8969,8 +9034,8 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); @@ -8980,8 +9045,8 @@ struct llm_build_context { Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head))); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -9052,6 +9117,9 @@ struct llm_build_context { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + // rope freq factors for 128k context + struct ggml_tensor * rope_factors = build_rope_factors(); + for (int il = 0; il < n_layer; ++il) { auto residual = inpL; @@ -9088,8 +9156,8 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); @@ -9097,8 +9165,8 @@ struct llm_build_context { Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -9204,14 +9272,14 @@ struct llm_build_context { struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr, n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr, n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); @@ -9412,15 +9480,15 @@ struct llm_build_context { cb(tmpk, "tmpk", il); cb(Vcur, "Vcur", il); - struct ggml_tensor * Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, + struct ggml_tensor * Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, + struct ggml_tensor * Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -9528,15 +9596,15 @@ struct llm_build_context { // cb(Vcur, "Vcur", il); // } - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -9645,15 +9713,15 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -9775,15 +9843,15 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -9895,8 +9963,8 @@ struct llm_build_context { struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); @@ -9904,8 +9972,8 @@ struct llm_build_context { Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); cb(Qcur, "Qcur_scaled", il); - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); @@ -10015,15 +10083,15 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -10305,15 +10373,15 @@ struct llm_build_context { cb(Kcur, "Kcur", il); } - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -10436,15 +10504,15 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } - Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -15417,6 +15485,7 @@ struct llama_context * llama_new_context_with_model( cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; } + cparams.yarn_attn_factor *= hparams.rope_attn_factor; cparams.causal_attn = hparams.causal_attn; if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index c74e253db..1493a7ca7 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1763,14 +1763,14 @@ struct test_llama : public test_llm { struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur); struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur); - Qcur = ggml_rope_custom( - ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos, + Qcur = ggml_rope_ext( + ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos, nullptr, hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); - Kcur = ggml_rope_custom( - ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos, + Kcur = ggml_rope_ext( + ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos, nullptr, hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -1889,13 +1889,13 @@ struct test_falcon : public test_llm { Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens); // using mode = 2 for neox mode - Qcur = ggml_rope_custom( - ctx, Qcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx, + Qcur = ggml_rope_ext( + ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, 0, hp.n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); - Kcur = ggml_rope_custom( - ctx, Kcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx, + Kcur = ggml_rope_ext( + ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, 0, hp.n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); From 3e5faa85032ec3106a2ad831bf412be9ff139f47 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 22 May 2024 11:01:35 +0300 Subject: [PATCH 09/34] cuda : fix rope + add tests (#7452) * cuda : fix rope pos data ggml-ci * ggml : drop mode & 1 == 1 support for ggml_rope ggml-ci * ggml : support freq_factors for f16 rope (CPU) ggml-ci * tests : add rope tests using frequency factors ggml-ci --- ggml-cuda/rope.cu | 4 ++-- ggml.c | 20 +++++++++++++++++-- ggml.h | 2 +- tests/test-backend-ops.cpp | 41 ++++++++++++++++++++++++-------------- 4 files changed, 47 insertions(+), 20 deletions(-) diff --git a/ggml-cuda/rope.cu b/ggml-cuda/rope.cu index 4a558f4b3..50f2cf415 100644 --- a/ggml-cuda/rope.cu +++ b/ggml-cuda/rope.cu @@ -283,9 +283,9 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const bool is_neox = mode & 2; const bool is_glm = mode & 4; - if (is_neox) { - pos = (const int32_t *) src1_d; + pos = (const int32_t *) src1_d; + if (is_neox) { if (src2 != nullptr) { freq_factors = (const float *) src2->data; } diff --git a/ggml.c b/ggml.c index 37b16b7a9..d316e3d31 100644 --- a/ggml.c +++ b/ggml.c @@ -6245,6 +6245,8 @@ static struct ggml_tensor * ggml_rope_impl( float xpos_base, bool xpos_down, bool inplace) { + GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported"); + GGML_ASSERT(ggml_is_vector(b)); GGML_ASSERT(b->type == GGML_TYPE_I32); GGML_ASSERT(a->ne[2] == b->ne[0]); @@ -14413,7 +14415,7 @@ static void ggml_compute_forward_rope_f32( freq_factors = (const float *) src2->data; } } else { - GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for mode 1"); + GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox"); } // backward process uses inverse rotation by cos and sin. @@ -14529,6 +14531,7 @@ static void ggml_compute_forward_rope_f32( } } +// TODO: deduplicate f16/f32 code static void ggml_compute_forward_rope_f16( const struct ggml_compute_params * params, struct ggml_tensor * dst, @@ -14536,6 +14539,7 @@ static void ggml_compute_forward_rope_f16( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src2 = dst->src[2]; if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; @@ -14588,6 +14592,17 @@ static void ggml_compute_forward_rope_f16( const bool is_neox = mode & 2; const bool is_glm = mode & 4; + const float * freq_factors = NULL; + if (is_neox) { + if (src2 != NULL) { + GGML_ASSERT(src2->type == GGML_TYPE_F32); + GGML_ASSERT(src2->ne[0] >= n_dims / 2); + freq_factors = (const float *) src2->data; + } + } else { + GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox"); + } + // backward process uses inverse rotation by cos and sin. // cos and sin build a rotation matrix, where the inverse is the transpose. // this essentially just switches the sign of sin. @@ -14660,10 +14675,11 @@ static void ggml_compute_forward_rope_f16( // simplified from `(ib * n_dims + ic) * inv_ndims` float cur_rot = inv_ndims * ic - ib; + float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f; float cos_theta, sin_theta; rope_yarn( - theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta ); sin_theta *= sin_sign; diff --git a/ggml.h b/ggml.h index 35ac9110c..08835042c 100644 --- a/ggml.h +++ b/ggml.h @@ -1460,7 +1460,7 @@ extern "C" { struct ggml_tensor * b); // rotary position embedding - // if mode & 1 == 1, skip n_past elements (DEPRECATED) + // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED) // if mode & 2 == 1, GPT-NeoX style // if mode & 4 == 1, ChatGLM style // diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 1493a7ca7..de74585da 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1142,20 +1142,22 @@ struct test_rope : public test_case { int n_dims; int mode; int n_ctx; + bool ff; std::string vars() override { - return VARS_TO_STR5(type, ne, n_dims, mode, n_ctx); + return VARS_TO_STR6(type, ne, n_dims, mode, n_ctx, ff); } test_rope(ggml_type type = GGML_TYPE_F32, std::array ne = {10, 10, 10, 1}, - int n_dims = 10, int mode = 0, int n_ctx = 512) - : type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx) {} + int n_dims = 10, int mode = 0, int n_ctx = 512, bool ff = false) + : type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx), ff(ff) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]); - ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, n_ctx); + ggml_tensor * freq = ff ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2) : nullptr; + ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); return out; } @@ -1169,7 +1171,12 @@ struct test_rope : public test_case { } ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int)); } else { - init_tensor_uniform(t); + if (t->ne[0] == n_dims/2) { + // frequency factors in the range [0.9f, 1.1f] + init_tensor_uniform(t, 0.9f, 1.1f); + } else { + init_tensor_uniform(t); + } } } } @@ -2188,16 +2195,20 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f)); for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) { - test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512)); // llama 7B - test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512)); // llama 13B - test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512)); // llama 30B - test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512)); // llama 65B - test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512)); // neox (falcon 7B) - test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512)); // neox (falcon 7B) - test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512)); // neox (falcon 40B) - test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512)); // neox (falcon 40B) - test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512)); // neox (stablelm) - test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512)); // neox (phi-2) + // TODO: ff not supported yet for !neox + test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, false)); // llama 7B + test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, false)); // llama 13B + test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, false)); // llama 30B + test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, false)); // llama 65B + + for (bool ff : {false, true}) { // freq_factors + test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512, ff)); // neox (falcon 7B) + test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512, ff)); // neox (falcon 7B) + test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512, ff)); // neox (falcon 40B) + test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, ff)); // neox (falcon 40B) + test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512, ff)); // neox (stablelm) + test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512, ff)); // neox (phi-2) + } } test_cases.emplace_back(new test_concat(GGML_TYPE_F32)); From 95fb0aefab568348da159efdd370e064d1b35f97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 22 May 2024 10:24:29 +0200 Subject: [PATCH 10/34] CUDA: remove incorrect precision check (#7454) --- ggml-cuda/fattn-tile-f32.cu | 3 --- 1 file changed, 3 deletions(-) diff --git a/ggml-cuda/fattn-tile-f32.cu b/ggml-cuda/fattn-tile-f32.cu index 130e7cbdb..54db765e2 100644 --- a/ggml-cuda/fattn-tile-f32.cu +++ b/ggml-cuda/fattn-tile-f32.cu @@ -286,9 +286,6 @@ void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_ten const ggml_tensor * KQV = dst; const ggml_tensor * Q = dst->src[0]; - const int32_t precision = KQV->op_params[2]; - GGML_ASSERT(precision == GGML_PREC_DEFAULT); - if (Q->ne[1] <= 16) { constexpr int cols_per_block = 16; constexpr int parallel_blocks = 4; From 9b3d83318931aa98c487baaa977626931d059e6a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 22 May 2024 12:36:37 +0300 Subject: [PATCH 11/34] cuda : fix compile warning (#7454) --- ggml-cuda/fattn-tile-f32.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml-cuda/fattn-tile-f32.cu b/ggml-cuda/fattn-tile-f32.cu index 54db765e2..b8b2f69e1 100644 --- a/ggml-cuda/fattn-tile-f32.cu +++ b/ggml-cuda/fattn-tile-f32.cu @@ -283,8 +283,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * } void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * KQV = dst; - const ggml_tensor * Q = dst->src[0]; + const ggml_tensor * Q = dst->src[0]; if (Q->ne[1] <= 16) { constexpr int cols_per_block = 16; From 03d8900ebe062355e26a562379daee5f17ea099f Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Wed, 22 May 2024 07:08:18 -0400 Subject: [PATCH 12/34] llama : add missing model type names (#7445) --- llama.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama.cpp b/llama.cpp index abff8c1c0..d8c6f29a5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3771,14 +3771,17 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { static const char * llama_model_type_name(e_model type) { switch (type) { + case MODEL_17M: return "17M"; case MODEL_22M: return "22M"; case MODEL_33M: return "33M"; case MODEL_109M: return "109M"; case MODEL_137M: return "137M"; + case MODEL_335M: return "335M"; case MODEL_0_5B: return "0.5B"; case MODEL_1B: return "1B"; case MODEL_2B: return "2B"; case MODEL_3B: return "3B"; + case MODEL_4B: return "4B"; case MODEL_7B: return "7B"; case MODEL_8B: return "8B"; case MODEL_12B: return "12B"; From fcda1128bc5f8eb7e1811708fe9d9867b9aec815 Mon Sep 17 00:00:00 2001 From: "k.h.lai" Date: Wed, 22 May 2024 20:53:21 +0800 Subject: [PATCH 13/34] vulkan: add workaround for iterator boundary check to fix clang-cl debug build (#7426) --- CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cc60039a..c09d834fb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -505,6 +505,12 @@ if (LLAMA_VULKAN) add_compile_definitions(GGML_USE_VULKAN) + # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build + # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector + if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0) + endif() + if (LLAMA_VULKAN_CHECK_RESULTS) add_compile_definitions(GGML_VULKAN_CHECK_RESULTS) endif() From b18532a4efeca8796fea8e36195c81cbfd596a4a Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 22 May 2024 16:10:46 +0200 Subject: [PATCH 14/34] phi3 : duplicate rope factors in each layer (#7447) * phi3 : duplicate rope factors in each layer phi3 : set phi-3 model type as 14B model loader : simplify the process for duplicating model tensors llama-bench : remove default pg test * replace bool parameters in llama_model_loader with named flags --- examples/llama-bench/llama-bench.cpp | 2 +- llama.cpp | 178 ++++++++++++--------------- 2 files changed, 83 insertions(+), 97 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 8b965e199..6bb1f70c3 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -195,7 +195,7 @@ static const cmd_params cmd_params_defaults = { /* model */ {"models/7B/ggml-model-q4_0.gguf"}, /* n_prompt */ {512}, /* n_gen */ {128}, - /* n_pg */ {{512, 128}}, + /* n_pg */ {}, /* n_batch */ {2048}, /* n_ubatch */ {512}, /* type_k */ {GGML_TYPE_F16}, diff --git a/llama.cpp b/llama.cpp index d8c6f29a5..34137c7ad 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1940,6 +1940,10 @@ struct llama_layer { // mamba bias struct ggml_tensor * ssm_conv1d_b; struct ggml_tensor * ssm_dt_b; + + // long rope factors + struct ggml_tensor * rope_long = nullptr; + struct ggml_tensor * rope_short = nullptr; }; struct llama_kv_cell { @@ -2111,10 +2115,6 @@ struct llama_model { struct ggml_tensor * output; struct ggml_tensor * output_b; - // long rope factors - struct ggml_tensor * rope_long = nullptr; - struct ggml_tensor * rope_short = nullptr; - std::vector layers; llama_split_mode split_mode; @@ -3425,11 +3425,15 @@ struct llama_model_loader { return get_tensor_meta(get_tensor_name(i)); } - struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) { + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) { struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur); ggml_set_name(tensor, ggml_get_name(cur)); - n_created++; + if (duplicated) { + size_data += ggml_nbytes(cur); + } else { + n_created++; + } return tensor; } @@ -3464,14 +3468,17 @@ struct llama_model_loader { return cur; } - struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, bool required = true) { - const struct ggml_tensor * cur = check_tensor_dims(name, ne, required); + static const int TENSOR_NOT_REQUIRED = 1; + static const int TENSOR_DUPLICATED = 2; + + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, int flags = 0) { + const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED)); if (cur == NULL) { return NULL; } - return create_tensor_for(ctx, cur); + return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED); } struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector & ne, size_t offset, bool required = true) { @@ -4139,6 +4146,7 @@ static void llm_load_hparams( switch (hparams.n_layer) { case 24: model.type = e_model::MODEL_1B; break; case 32: model.type = e_model::MODEL_3B; break; + case 40: model.type = e_model::MODEL_14B; break; default: model.type = e_model::MODEL_UNKNOWN; } } break; @@ -4965,12 +4973,10 @@ static bool llm_load_tensors( { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); if (model.arch != LLM_ARCH_MINICPM){ - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed if (model.output == NULL) { - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } } @@ -4989,10 +4995,10 @@ static bool llm_load_tensors( layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); // optional bias tensors - layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false); - layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false); - layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false); - layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false); + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); @@ -5003,7 +5009,7 @@ static bool llm_load_tensors( } else { layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); - layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false); + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED); if (layer.ffn_gate_exps) { layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}); layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); @@ -5045,12 +5051,10 @@ static bool llm_load_tensors( // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed if (model.output == NULL) { - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } @@ -5073,7 +5077,7 @@ static bool llm_load_tensors( layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); - layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false); + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED); if (layer.ffn_gate_exps) { layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}); layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); @@ -5175,11 +5179,9 @@ static bool llm_load_tensors( model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); if (!model.output) { - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU - ml.n_created--; // artificial tensor - ml.size_data += ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU } } @@ -5192,8 +5194,8 @@ static bool llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false); - layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false); + layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); @@ -5211,12 +5213,10 @@ static bool llm_load_tensors( { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); if (!model.output) { // needs to be on GPU - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } @@ -5314,14 +5314,14 @@ static bool llm_load_tensors( layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}); - layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false); - layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false); + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}); - layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false); - layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); @@ -5383,18 +5383,16 @@ static bool llm_load_tensors( case LLM_ARCH_MPT: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false); + model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED); // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); if (!model.output) { - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU - ml.n_created--; // artificial tensor - ml.size_data += ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU } } @@ -5405,31 +5403,31 @@ static bool llm_load_tensors( auto & layer = model.layers[i]; layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); - layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); - layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); - layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false); - layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false); + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false); - layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); // AWQ ScaleActivation layer - layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false); + layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); } } break; case LLM_ARCH_STABLELM: @@ -5458,17 +5456,17 @@ static bool llm_load_tensors( layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); // optional bias tensors, present in Stable LM 2 1.6B - layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false); - layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false); - layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false); + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); // optional q and k layernorms, present in StableLM 2 12B - layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false); - layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false); + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED); // optional FFN norm, not present in StableLM 2 12B which uses parallel residual - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false); - layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); @@ -5511,12 +5509,10 @@ static bool llm_load_tensors( // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed if (model.output == NULL) { - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } @@ -5614,8 +5610,8 @@ static bool llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false); - layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); if (layer.wqkv == nullptr) { layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); @@ -5642,9 +5638,6 @@ static bool llm_load_tensors( { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }); - model.rope_long = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, false); - model.rope_short = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, false); - // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }); @@ -5659,13 +5652,16 @@ static bool llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }); - layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }); layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }); + + layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); } } break; case LLM_ARCH_PLAMO: @@ -5834,9 +5830,7 @@ static bool llm_load_tensors( // output model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading - ml.n_created--; // artificial tensor - ml.size_data += ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading const int64_t n_ff = hparams.n_ff; const int64_t n_embd_head_k = hparams.n_embd_head_k; @@ -5871,12 +5865,10 @@ static bool llm_load_tensors( model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed if (model.output == NULL) { - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } @@ -5927,12 +5919,10 @@ static bool llm_load_tensors( { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed, duplicated to allow offloading if (model.output == NULL) { - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } @@ -5993,9 +5983,7 @@ static bool llm_load_tensors( { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); // init output from the input tok embed - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } for (int i = 0; i < n_layer; ++i) { @@ -6027,12 +6015,10 @@ static bool llm_load_tensors( // output { - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed if (model.output == NULL) { - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } @@ -6875,9 +6861,9 @@ struct llm_build_context { cb(lctx.inp_K_shift, "K_shift", -1); ggml_set_input(lctx.inp_K_shift); - struct ggml_tensor * rope_factors = build_rope_factors(); for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * rope_factors = build_rope_factors(il); struct ggml_tensor * tmp = // we rotate only the first n_rot dimensions ggml_rope_ext_inplace(ctx0, @@ -6991,15 +6977,15 @@ struct llm_build_context { return lctx.inp_pos; } - struct ggml_tensor * build_rope_factors() { + struct ggml_tensor * build_rope_factors(int il) { // choose long/short freq factors based on the context size const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) { - return model.rope_long; + return model.layers[il].rope_long; } - return model.rope_short; + return model.layers[il].rope_short; } struct ggml_tensor * build_inp_out_ids() { @@ -9120,14 +9106,14 @@ struct llm_build_context { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); - // rope freq factors for 128k context - struct ggml_tensor * rope_factors = build_rope_factors(); - for (int il = 0; il < n_layer; ++il) { auto residual = inpL; // self-attention { + // rope freq factors for 128k context + struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, From 38c03478a37e460ecd3a21155b338a83bfed7f90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 22 May 2024 17:58:25 +0200 Subject: [PATCH 15/34] CUDA: fix FA out-of-bounds writes (#7465) --- ggml-cuda/fattn-tile-f16.cu | 4 ++++ ggml-cuda/fattn-tile-f32.cu | 4 ++++ ggml-cuda/fattn-vec-f16.cu | 6 +++++- ggml-cuda/fattn-vec-f32.cu | 6 +++++- 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/ggml-cuda/fattn-tile-f16.cu b/ggml-cuda/fattn-tile-f16.cu index 4a07ac6ad..586d469c0 100644 --- a/ggml-cuda/fattn-tile-f16.cu +++ b/ggml-cuda/fattn-tile-f16.cu @@ -238,6 +238,10 @@ static __global__ void flash_attn_tile_ext_f16( for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) { const int j_VKQ = j_VKQ_0 + threadIdx.y; + if (ic0 + j_VKQ >= ne01) { + return; + } + half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]); kqsum_j = warp_reduce_sum(kqsum_j); diff --git a/ggml-cuda/fattn-tile-f32.cu b/ggml-cuda/fattn-tile-f32.cu index b8b2f69e1..b6ef8eb48 100644 --- a/ggml-cuda/fattn-tile-f32.cu +++ b/ggml-cuda/fattn-tile-f32.cu @@ -237,6 +237,10 @@ static __global__ void flash_attn_tile_ext_f32( for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) { const int j_VKQ = j_VKQ_0 + threadIdx.y; + if (ic0 + j_VKQ >= ne01) { + return; + } + float kqsum_j = kqsum[j_VKQ_0/nwarps]; kqsum_j = warp_reduce_sum(kqsum_j); diff --git a/ggml-cuda/fattn-vec-f16.cu b/ggml-cuda/fattn-vec-f16.cu index 54e1ac5d1..7352dcabf 100644 --- a/ggml-cuda/fattn-vec-f16.cu +++ b/ggml-cuda/fattn-vec-f16.cu @@ -212,6 +212,10 @@ static __global__ void flash_attn_vec_ext_f16( #pragma unroll for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) { + if (ic0 + j_VKQ >= ne01) { + break; + } + kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x]; kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]); @@ -223,7 +227,7 @@ static __global__ void flash_attn_vec_ext_f16( dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val; } - if (parallel_blocks != 1 && tid < ncols) { + if (parallel_blocks != 1 && tid < ncols && ic0 + tid < ne01) { dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]); } #else diff --git a/ggml-cuda/fattn-vec-f32.cu b/ggml-cuda/fattn-vec-f32.cu index 5bcabd092..11476a6c0 100644 --- a/ggml-cuda/fattn-vec-f32.cu +++ b/ggml-cuda/fattn-vec-f32.cu @@ -200,6 +200,10 @@ static __global__ void flash_attn_vec_ext_f32( #pragma unroll for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) { + if (ic0 + j_VKQ >= ne01) { + break; + } + kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x]; kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]); @@ -211,7 +215,7 @@ static __global__ void flash_attn_vec_ext_f32( dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val; } - if (parallel_blocks != 1 && tid < ncols) { + if (parallel_blocks != 1 && tid < ncols && ic0 + tid < ne01) { dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]); } } From 6ff13987ad1a9519bee13dd98b6a21cd98979aab Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 22 May 2024 20:04:20 +0300 Subject: [PATCH 16/34] common : normalize naming style (#7462) * common : normalize naming style ggml-ci * common : match declaration / definition order * zig : try to fix build --- build.zig | 12 +- common/common.cpp | 1338 +++++++++++----------- common/common.h | 90 +- common/sampling.cpp | 83 +- common/sampling.h | 5 + common/train.cpp | 2 +- examples/batched/batched.cpp | 2 +- examples/embedding/embedding.cpp | 4 +- examples/eval-callback/eval-callback.cpp | 4 +- examples/imatrix/imatrix.cpp | 4 +- examples/infill/infill.cpp | 16 +- examples/llama-bench/llama-bench.cpp | 2 +- examples/llava/llava-cli.cpp | 2 +- examples/lookahead/lookahead.cpp | 2 +- examples/lookup/lookup.cpp | 2 +- examples/main/main.cpp | 16 +- examples/parallel/parallel.cpp | 2 +- examples/perplexity/perplexity.cpp | 14 +- examples/quantize/quantize.cpp | 2 +- examples/retrieval/retrieval.cpp | 4 +- examples/server/server.cpp | 10 +- 21 files changed, 823 insertions(+), 793 deletions(-) diff --git a/build.zig b/build.zig index 96783574f..267c976b1 100644 --- a/build.zig +++ b/build.zig @@ -129,14 +129,14 @@ pub fn build(b: *std.build.Builder) !void { const clip = make.obj("clip", "examples/llava/clip.cpp"); const llava = make.obj("llava", "examples/llava/llava.cpp"); - _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser }); - _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); - _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); - _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); - _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train }); + _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, sampling, json_schema_to_grammar, buildinfo, console, grammar_parser }); + _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, sampling, json_schema_to_grammar, buildinfo }); + _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, sampling, json_schema_to_grammar, buildinfo }); + _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, sampling, json_schema_to_grammar, buildinfo }); + _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, sampling, json_schema_to_grammar, buildinfo, train }); _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train }); - const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava }); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, sampling, json_schema_to_grammar, buildinfo, grammar_parser, clip, llava }); if (server.target.isWindows()) { server.linkSystemLibrary("ws2_32"); } diff --git a/common/common.cpp b/common/common.cpp index ae11650b4..7500e08ff 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -73,7 +73,11 @@ using json = nlohmann::ordered_json; -int32_t get_num_physical_cores() { +// +// CPU utils +// + +int32_t cpu_get_num_physical_cores() { #ifdef __linux__ // enumerate the set of thread siblings, num entries is num cores std::unordered_set siblings; @@ -142,9 +146,9 @@ static bool is_running_on_efficiency_core(void) { return core_type == intel_atom; } -static int count_math_cpus(int cpu_count) { +static int cpu_count_math_cpus(int n_cpu) { int result = 0; - for (int cpu = 0; cpu < cpu_count; ++cpu) { + for (int cpu = 0; cpu < n_cpu; ++cpu) { if (pin_cpu(cpu)) { return -1; } @@ -162,16 +166,16 @@ static int count_math_cpus(int cpu_count) { /** * Returns number of CPUs on system that are useful for math. */ -int get_math_cpu_count() { +int32_t cpu_get_num_math() { #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__) - int cpu_count = sysconf(_SC_NPROCESSORS_ONLN); - if (cpu_count < 1) { - return get_num_physical_cores(); + int n_cpu = sysconf(_SC_NPROCESSORS_ONLN); + if (n_cpu < 1) { + return cpu_get_num_physical_cores(); } if (is_hybrid_cpu()) { cpu_set_t affinity; if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) { - int result = count_math_cpus(cpu_count); + int result = cpu_count_math_cpus(n_cpu); pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity); if (result > 0) { return result; @@ -179,108 +183,103 @@ int get_math_cpu_count() { } } #endif - return get_num_physical_cores(); + return cpu_get_num_physical_cores(); } -void process_escapes(std::string & input) { - std::size_t input_len = input.length(); - std::size_t output_idx = 0; +// +// CLI argument parsing +// - for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) { - if (input[input_idx] == '\\' && input_idx + 1 < input_len) { - switch (input[++input_idx]) { - case 'n': input[output_idx++] = '\n'; break; - case 'r': input[output_idx++] = '\r'; break; - case 't': input[output_idx++] = '\t'; break; - case '\'': input[output_idx++] = '\''; break; - case '\"': input[output_idx++] = '\"'; break; - case '\\': input[output_idx++] = '\\'; break; - case 'x': - // Handle \x12, etc - if (input_idx + 2 < input_len) { - const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 }; - char *err_p = nullptr; - const long val = std::strtol(x, &err_p, 16); - if (err_p == x + 2) { - input_idx += 2; - input[output_idx++] = char(val); - break; - } - } - // fall through - default: input[output_idx++] = '\\'; - input[output_idx++] = input[input_idx]; break; +void gpt_params_handle_model_default(gpt_params & params) { + if (!params.hf_repo.empty()) { + // short-hand to avoid specifying --hf-file -> default it to --model + if (params.hf_file.empty()) { + if (params.model.empty()) { + throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); } - } else { - input[output_idx++] = input[input_idx]; + params.hf_file = params.model; + } else if (params.model.empty()) { + std::string cache_directory = fs_get_cache_directory(); + const bool success = fs_create_directory_with_parents(cache_directory); + if (!success) { + throw std::runtime_error("failed to create cache directory: " + cache_directory); + } + params.model = cache_directory + string_split(params.hf_file, '/').back(); + } + } else if (!params.model_url.empty()) { + if (params.model.empty()) { + auto f = string_split(params.model_url, '#').front(); + f = string_split(f, '?').front(); + f = string_split(f, '/').back(); + params.model = "models/" + f; + } + } else if (params.model.empty()) { + params.model = DEFAULT_MODEL_PATH; + } +} + +bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { + bool invalid_param = false; + std::string arg; + const std::string arg_prefix = "--"; + llama_sampling_params & sparams = params.sparams; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) { + throw std::invalid_argument("error: unknown argument: " + arg); + } + if (invalid_param) { + throw std::invalid_argument("error: invalid parameter for argument: " + arg); } } - input.resize(output_idx); + if (params.prompt_cache_all && + (params.interactive || params.interactive_first || + params.instruct)) { + + throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); + } + + gpt_params_handle_model_default(params); + + if (params.escape) { + string_process_escapes(params.prompt); + string_process_escapes(params.input_prefix); + string_process_escapes(params.input_suffix); + string_process_escapes(sparams.cfg_negative_prompt); + for (auto & antiprompt : params.antiprompt) { + string_process_escapes(antiprompt); + } + } + + if (!params.kv_overrides.empty()) { + params.kv_overrides.emplace_back(); + params.kv_overrides.back().key[0] = 0; + } + + return true; } bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { bool result = true; try { if (!gpt_params_parse_ex(argc, argv, params)) { - gpt_print_usage(argc, argv, gpt_params()); + gpt_params_print_usage(argc, argv, gpt_params()); exit(0); } } catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); - gpt_print_usage(argc, argv, gpt_params()); + gpt_params_print_usage(argc, argv, gpt_params()); exit(1); } return result; } -bool parse_kv_override(const char * data, std::vector & overrides) { - const char * sep = strchr(data, '='); - if (sep == nullptr || sep - data >= 128) { - fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); - return false; - } - llama_model_kv_override kvo; - std::strncpy(kvo.key, data, sep - data); - kvo.key[sep - data] = 0; - sep++; - if (strncmp(sep, "int:", 4) == 0) { - sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.val_i64 = std::atol(sep); - } else if (strncmp(sep, "float:", 6) == 0) { - sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; - kvo.val_f64 = std::atof(sep); - } else if (strncmp(sep, "bool:", 5) == 0) { - sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; - if (std::strcmp(sep, "true") == 0) { - kvo.val_bool = true; - } else if (std::strcmp(sep, "false") == 0) { - kvo.val_bool = false; - } else { - fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data); - return false; - } - } else if (strncmp(sep, "str:", 4) == 0) { - sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; - if (strlen(sep) > 127) { - fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); - return false; - } - strncpy(kvo.val_str, sep, 127); - kvo.val_str[127] = '\0'; - } else { - fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); - return false; - } - overrides.emplace_back(std::move(kvo)); - return true; -} - bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { llama_sampling_params & sparams = params.sparams; @@ -546,7 +545,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } const auto sampler_names = string_split(argv[i], ';'); - sparams.samplers_sequence = sampler_types_from_names(sampler_names, true); + sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); return true; } if (arg == "--sampling-seq") { @@ -554,7 +553,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } - sparams.samplers_sequence = sampler_types_from_chars(argv[i]); + sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]); return true; } if (arg == "--top-p") { @@ -1240,7 +1239,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-h" || arg == "--help") { - gpt_print_usage(argc, argv, gpt_params()); + gpt_params_print_usage(argc, argv, gpt_params()); exit(0); } if (arg == "--version") { @@ -1311,7 +1310,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } - if (!parse_kv_override(argv[i], params.kv_overrides)) { + if (!string_parse_kv_override(argv[i], params.kv_overrides)) { fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); invalid_param = true; return true; @@ -1345,88 +1344,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return false; } -void gpt_params_handle_model_default(gpt_params & params) { - if (!params.hf_repo.empty()) { - // short-hand to avoid specifying --hf-file -> default it to --model - if (params.hf_file.empty()) { - if (params.model.empty()) { - throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); - } - params.hf_file = params.model; - } else if (params.model.empty()) { - std::string cache_directory = get_cache_directory(); - const bool success = create_directory_with_parents(cache_directory); - if (!success) { - throw std::runtime_error("failed to create cache directory: " + cache_directory); - } - params.model = cache_directory + string_split(params.hf_file, '/').back(); - } - } else if (!params.model_url.empty()) { - if (params.model.empty()) { - auto f = string_split(params.model_url, '#').front(); - f = string_split(f, '?').front(); - f = string_split(f, '/').back(); - params.model = "models/" + f; - } - } else if (params.model.empty()) { - params.model = DEFAULT_MODEL_PATH; - } -} - -bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { - bool invalid_param = false; - std::string arg; - const std::string arg_prefix = "--"; - llama_sampling_params & sparams = params.sparams; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) { - throw std::invalid_argument("error: unknown argument: " + arg); - } - if (invalid_param) { - throw std::invalid_argument("error: invalid parameter for argument: " + arg); - } - } - - if (params.prompt_cache_all && - (params.interactive || params.interactive_first || - params.instruct)) { - - throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); - } - - gpt_params_handle_model_default(params); - - if (params.escape) { - process_escapes(params.prompt); - process_escapes(params.input_prefix); - process_escapes(params.input_suffix); - process_escapes(sparams.cfg_negative_prompt); - for (auto & antiprompt : params.antiprompt) { - process_escapes(antiprompt); - } - } - - if (!params.kv_overrides.empty()) { - params.kv_overrides.emplace_back(); - params.kv_overrides.back().key[0] = 0; - } - - return true; -} - -void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { +void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { const llama_sampling_params & sparams = params.sparams; std::string sampler_type_chars; std::string sampler_type_names; for (const auto sampler_type : sparams.samplers_sequence) { sampler_type_chars += static_cast(sampler_type); - sampler_type_names += sampler_type_to_name_string(sampler_type) + ";"; + sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";"; } sampler_type_names.pop_back(); @@ -1623,7 +1548,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #endif // LOG_DISABLE_LOGS } -std::string get_system_info(const gpt_params & params) { +std::string gpt_params_get_system_info(const gpt_params & params) { std::ostringstream os; os << "system_info: n_threads = " << params.n_threads; @@ -1635,7 +1560,52 @@ std::string get_system_info(const gpt_params & params) { return os.str(); } -std::string gpt_random_prompt(std::mt19937 & rng) { +// +// String utils +// + +std::vector string_split(std::string input, char separator) { + std::vector parts; + size_t separator_pos = input.find(separator); + while (separator_pos != std::string::npos) { + std::string part = input.substr(0, separator_pos); + parts.emplace_back(part); + input = input.substr(separator_pos + 1); + separator_pos = input.find(separator); + } + parts.emplace_back(input); + return parts; +} + +std::string string_strip(const std::string & str) { + size_t start = 0; + size_t end = str.size(); + while (start < end && std::isspace(str[start])) { + start++; + } + while (end > start && std::isspace(str[end - 1])) { + end--; + } + return str.substr(start, end - start); +} + +std::string string_get_sortable_timestamp() { + using clock = std::chrono::system_clock; + + const clock::time_point current_time = clock::now(); + const time_t as_time_t = clock::to_time_t(current_time); + char timestamp_no_ns[100]; + std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t)); + + const int64_t ns = std::chrono::duration_cast( + current_time.time_since_epoch() % 1000000000).count(); + char timestamp_ns[11]; + snprintf(timestamp_ns, 11, "%09" PRId64, ns); + + return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns); +} + +std::string string_random_prompt(std::mt19937 & rng) { const int r = rng() % 10; switch (r) { case 0: return "So"; @@ -1653,9 +1623,96 @@ std::string gpt_random_prompt(std::mt19937 & rng) { GGML_UNREACHABLE(); } +void string_process_escapes(std::string & input) { + std::size_t input_len = input.length(); + std::size_t output_idx = 0; + + for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) { + if (input[input_idx] == '\\' && input_idx + 1 < input_len) { + switch (input[++input_idx]) { + case 'n': input[output_idx++] = '\n'; break; + case 'r': input[output_idx++] = '\r'; break; + case 't': input[output_idx++] = '\t'; break; + case '\'': input[output_idx++] = '\''; break; + case '\"': input[output_idx++] = '\"'; break; + case '\\': input[output_idx++] = '\\'; break; + case 'x': + // Handle \x12, etc + if (input_idx + 2 < input_len) { + const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 }; + char *err_p = nullptr; + const long val = std::strtol(x, &err_p, 16); + if (err_p == x + 2) { + input_idx += 2; + input[output_idx++] = char(val); + break; + } + } + // fall through + default: input[output_idx++] = '\\'; + input[output_idx++] = input[input_idx]; break; + } + } else { + input[output_idx++] = input[input_idx]; + } + } + + input.resize(output_idx); +} + +bool string_parse_kv_override(const char * data, std::vector & overrides) { + const char * sep = strchr(data, '='); + if (sep == nullptr || sep - data >= 128) { + fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); + return false; + } + llama_model_kv_override kvo; + std::strncpy(kvo.key, data, sep - data); + kvo.key[sep - data] = 0; + sep++; + if (strncmp(sep, "int:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.val_i64 = std::atol(sep); + } else if (strncmp(sep, "float:", 6) == 0) { + sep += 6; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; + kvo.val_f64 = std::atof(sep); + } else if (strncmp(sep, "bool:", 5) == 0) { + sep += 5; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; + if (std::strcmp(sep, "true") == 0) { + kvo.val_bool = true; + } else if (std::strcmp(sep, "false") == 0) { + kvo.val_bool = false; + } else { + fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data); + return false; + } + } else if (strncmp(sep, "str:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; + if (strlen(sep) > 127) { + fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); + return false; + } + strncpy(kvo.val_str, sep, 127); + kvo.val_str[127] = '\0'; + } else { + fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); + return false; + } + overrides.emplace_back(std::move(kvo)); + return true; +} + +// +// Filesystem utils +// + // Validate if a filename is safe to use // To validate a full path, split the path by the OS-specific path separator, and validate each part with this function -bool validate_file_name(const std::string & filename) { +bool fs_validate_filename(const std::string & filename) { if (!filename.length()) { // Empty filename invalid return false; @@ -1724,120 +1781,194 @@ bool validate_file_name(const std::string & filename) { return true; } -// -// String utils -// +// returns true if successful, false otherwise +bool fs_create_directory_with_parents(const std::string & path) { +#ifdef _WIN32 + std::wstring_convert> converter; + std::wstring wpath = converter.from_bytes(path); -std::vector string_split(std::string input, char separator) { - std::vector parts; - size_t separator_pos = input.find(separator); - while (separator_pos != std::string::npos) { - std::string part = input.substr(0, separator_pos); - parts.emplace_back(part); - input = input.substr(separator_pos + 1); - separator_pos = input.find(separator); + // if the path already exists, check whether it's a directory + const DWORD attributes = GetFileAttributesW(wpath.c_str()); + if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) { + return true; } - parts.emplace_back(input); - return parts; -} -std::string string_strip(const std::string & str) { - size_t start = 0; - size_t end = str.size(); - while (start < end && std::isspace(str[start])) { - start++; - } - while (end > start && std::isspace(str[end - 1])) { - end--; - } - return str.substr(start, end - start); -} + size_t pos_slash = 0; -std::vector sampler_types_from_names(const std::vector & names, bool allow_alt_names) { - std::unordered_map sampler_canonical_name_map { - {"top_k", llama_sampler_type::TOP_K}, - {"top_p", llama_sampler_type::TOP_P}, - {"typical_p", llama_sampler_type::TYPICAL_P}, - {"min_p", llama_sampler_type::MIN_P}, - {"tfs_z", llama_sampler_type::TFS_Z}, - {"temperature", llama_sampler_type::TEMPERATURE} - }; + // process path from front to back, procedurally creating directories + while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) { + const std::wstring subpath = wpath.substr(0, pos_slash); + const wchar_t * test = subpath.c_str(); - // since samplers names are written multiple ways - // make it ready for both system names and input names - std::unordered_map sampler_alt_name_map { - {"top-k", llama_sampler_type::TOP_K}, - {"top-p", llama_sampler_type::TOP_P}, - {"nucleus", llama_sampler_type::TOP_P}, - {"typical-p", llama_sampler_type::TYPICAL_P}, - {"typical", llama_sampler_type::TYPICAL_P}, - {"min-p", llama_sampler_type::MIN_P}, - {"tfs-z", llama_sampler_type::TFS_Z}, - {"tfs", llama_sampler_type::TFS_Z}, - {"temp", llama_sampler_type::TEMPERATURE} - }; + const bool success = CreateDirectoryW(test, NULL); + if (!success) { + const DWORD error = GetLastError(); - std::vector sampler_types; - sampler_types.reserve(names.size()); - for (const auto & name : names) - { - auto sampler_item = sampler_canonical_name_map.find(name); - if (sampler_item != sampler_canonical_name_map.end()) - { - sampler_types.push_back(sampler_item->second); - } - else - { - if (allow_alt_names) - { - sampler_item = sampler_alt_name_map.find(name); - if (sampler_item != sampler_alt_name_map.end()) - { - sampler_types.push_back(sampler_item->second); + // if the path already exists, ensure that it's a directory + if (error == ERROR_ALREADY_EXISTS) { + const DWORD attributes = GetFileAttributesW(subpath.c_str()); + if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) { + return false; } + } else { + return false; } } + + pos_slash += 1; } - return sampler_types; -} -std::vector sampler_types_from_chars(const std::string & names_string) { - std::unordered_map sampler_name_map { - {'k', llama_sampler_type::TOP_K}, - {'p', llama_sampler_type::TOP_P}, - {'y', llama_sampler_type::TYPICAL_P}, - {'m', llama_sampler_type::MIN_P}, - {'f', llama_sampler_type::TFS_Z}, - {'t', llama_sampler_type::TEMPERATURE} - }; + return true; +#else + // if the path already exists, check whether it's a directory + struct stat info; + if (stat(path.c_str(), &info) == 0) { + return S_ISDIR(info.st_mode); + } - std::vector sampler_types; - sampler_types.reserve(names_string.size()); - for (const auto & c : names_string) { - const auto sampler_item = sampler_name_map.find(c); - if (sampler_item != sampler_name_map.end()) { - sampler_types.push_back(sampler_item->second); + size_t pos_slash = 1; // skip leading slashes for directory creation + + // process path from front to back, procedurally creating directories + while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) { + const std::string subpath = path.substr(0, pos_slash); + struct stat info; + + // if the path already exists, ensure that it's a directory + if (stat(subpath.c_str(), &info) == 0) { + if (!S_ISDIR(info.st_mode)) { + return false; + } + } else { + // create parent directories + const int ret = mkdir(subpath.c_str(), 0755); + if (ret != 0) { + return false; + } } + + pos_slash += 1; } - return sampler_types; + + return true; +#endif // _WIN32 } -std::string sampler_type_to_name_string(llama_sampler_type sampler_type) { - switch (sampler_type) { - case llama_sampler_type::TOP_K: return "top_k"; - case llama_sampler_type::TFS_Z: return "tfs_z"; - case llama_sampler_type::TYPICAL_P: return "typical_p"; - case llama_sampler_type::TOP_P: return "top_p"; - case llama_sampler_type::MIN_P: return "min_p"; - case llama_sampler_type::TEMPERATURE: return "temperature"; - default : return ""; +std::string fs_get_cache_directory() { + std::string cache_directory = ""; + if (getenv("LLAMA_CACHE")) { + cache_directory = std::getenv("LLAMA_CACHE"); + if (cache_directory.back() != DIRECTORY_SEPARATOR) { + cache_directory += DIRECTORY_SEPARATOR; + } + } else { +#ifdef __linux__ + if (std::getenv("XDG_CACHE_HOME")) { + cache_directory = std::getenv("XDG_CACHE_HOME"); + } else { + cache_directory = std::getenv("HOME") + std::string("/.cache/"); + } +#elif defined(__APPLE__) + cache_directory = std::getenv("HOME") + std::string("/Library/Caches/"); +#elif defined(_WIN32) + cache_directory = std::getenv("APPDATA"); +#endif // __linux__ + cache_directory += "llama.cpp"; + cache_directory += DIRECTORY_SEPARATOR; } + return cache_directory; } + // // Model utils // +std::tuple llama_init_from_gpt_params(gpt_params & params) { + auto mparams = llama_model_params_from_gpt_params(params); + + llama_model * model = nullptr; + + if (!params.hf_repo.empty() && !params.hf_file.empty()) { + model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams); + } else if (!params.model_url.empty()) { + model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams); + } else { + model = llama_load_model_from_file(params.model.c_str(), mparams); + } + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return std::make_tuple(nullptr, nullptr); + } + + auto cparams = llama_context_params_from_gpt_params(params); + + llama_context * lctx = llama_new_context_with_model(model, cparams); + if (lctx == NULL) { + fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + + if (!params.control_vectors.empty()) { + if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; + if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model); + + const auto cvec = llama_control_vector_load(params.control_vectors); + if (cvec.n_embd == -1) { + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + + int err = llama_control_vector_apply(lctx, + cvec.data.data(), + cvec.data.size(), + cvec.n_embd, + params.control_vector_layer_start, + params.control_vector_layer_end); + if (err) { + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + } + + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { + const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); + float lora_scale = std::get<1>(params.lora_adapter[i]); + int err = llama_model_apply_lora_from_file(model, + lora_adapter.c_str(), + lora_scale, + ((i > 0) || params.lora_base.empty()) + ? NULL + : params.lora_base.c_str(), + params.n_threads); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + } + + if (params.ignore_eos) { + params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; + } + + if (params.warmup) { + LOG("warming up the model with an empty run\n"); + + std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; + llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); + llama_kv_cache_clear(lctx); + llama_synchronize(lctx); + llama_reset_timings(lctx); + } + + return std::make_tuple(model, lctx); +} + struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) { auto mparams = llama_model_default_params(); @@ -1923,27 +2054,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param return cparams; } -void llama_batch_clear(struct llama_batch & batch) { - batch.n_tokens = 0; -} - -void llama_batch_add( - struct llama_batch & batch, - llama_token id, - llama_pos pos, - const std::vector & seq_ids, - bool logits) { - batch.token [batch.n_tokens] = id; - batch.pos [batch.n_tokens] = pos; - batch.n_seq_id[batch.n_tokens] = seq_ids.size(); - for (size_t i = 0; i < seq_ids.size(); ++i) { - batch.seq_id[batch.n_tokens][i] = seq_ids[i]; - } - batch.logits [batch.n_tokens] = logits; - - batch.n_tokens++; -} - #ifdef LLAMA_USE_CURL static bool starts_with(const std::string & str, const std::string & prefix) { @@ -2274,90 +2384,29 @@ struct llama_model * llama_load_model_from_hf( #endif // LLAMA_USE_CURL -std::tuple llama_init_from_gpt_params(gpt_params & params) { - auto mparams = llama_model_params_from_gpt_params(params); +// +// Batch utils +// - llama_model * model = nullptr; +void llama_batch_clear(struct llama_batch & batch) { + batch.n_tokens = 0; +} - if (!params.hf_repo.empty() && !params.hf_file.empty()) { - model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams); - } else if (!params.model_url.empty()) { - model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams); - } else { - model = llama_load_model_from_file(params.model.c_str(), mparams); +void llama_batch_add( + struct llama_batch & batch, + llama_token id, + llama_pos pos, + const std::vector & seq_ids, + bool logits) { + batch.token [batch.n_tokens] = id; + batch.pos [batch.n_tokens] = pos; + batch.n_seq_id[batch.n_tokens] = seq_ids.size(); + for (size_t i = 0; i < seq_ids.size(); ++i) { + batch.seq_id[batch.n_tokens][i] = seq_ids[i]; } + batch.logits [batch.n_tokens] = logits; - if (model == NULL) { - fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); - return std::make_tuple(nullptr, nullptr); - } - - auto cparams = llama_context_params_from_gpt_params(params); - - llama_context * lctx = llama_new_context_with_model(model, cparams); - if (lctx == NULL) { - fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); - llama_free_model(model); - return std::make_tuple(nullptr, nullptr); - } - - if (!params.control_vectors.empty()) { - if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; - if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model); - - const auto cvec = llama_control_vector_load(params.control_vectors); - if (cvec.n_embd == -1) { - llama_free(lctx); - llama_free_model(model); - return std::make_tuple(nullptr, nullptr); - } - - int err = llama_control_vector_apply(lctx, - cvec.data.data(), - cvec.data.size(), - cvec.n_embd, - params.control_vector_layer_start, - params.control_vector_layer_end); - if (err) { - llama_free(lctx); - llama_free_model(model); - return std::make_tuple(nullptr, nullptr); - } - } - - for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { - const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); - float lora_scale = std::get<1>(params.lora_adapter[i]); - int err = llama_model_apply_lora_from_file(model, - lora_adapter.c_str(), - lora_scale, - ((i > 0) || params.lora_base.empty()) - ? NULL - : params.lora_base.c_str(), - params.n_threads); - if (err != 0) { - fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); - llama_free(lctx); - llama_free_model(model); - return std::make_tuple(nullptr, nullptr); - } - } - - if (params.ignore_eos) { - params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; - } - - if (params.warmup) { - LOG("warming up the model with an empty run\n"); - - std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; - llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); - llama_kv_cache_clear(lctx); - llama_synchronize(lctx); - llama_reset_timings(lctx); - } - - return std::make_tuple(model, lctx); + batch.n_tokens++; } // @@ -2445,346 +2494,11 @@ bool llama_should_add_bos_token(const llama_model * model) { return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM); } -// -// YAML utils -// - -// returns true if successful, false otherwise -bool create_directory_with_parents(const std::string & path) { -#ifdef _WIN32 - std::wstring_convert> converter; - std::wstring wpath = converter.from_bytes(path); - - // if the path already exists, check whether it's a directory - const DWORD attributes = GetFileAttributesW(wpath.c_str()); - if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) { - return true; - } - - size_t pos_slash = 0; - - // process path from front to back, procedurally creating directories - while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) { - const std::wstring subpath = wpath.substr(0, pos_slash); - const wchar_t * test = subpath.c_str(); - - const bool success = CreateDirectoryW(test, NULL); - if (!success) { - const DWORD error = GetLastError(); - - // if the path already exists, ensure that it's a directory - if (error == ERROR_ALREADY_EXISTS) { - const DWORD attributes = GetFileAttributesW(subpath.c_str()); - if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) { - return false; - } - } else { - return false; - } - } - - pos_slash += 1; - } - - return true; -#else - // if the path already exists, check whether it's a directory - struct stat info; - if (stat(path.c_str(), &info) == 0) { - return S_ISDIR(info.st_mode); - } - - size_t pos_slash = 1; // skip leading slashes for directory creation - - // process path from front to back, procedurally creating directories - while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) { - const std::string subpath = path.substr(0, pos_slash); - struct stat info; - - // if the path already exists, ensure that it's a directory - if (stat(subpath.c_str(), &info) == 0) { - if (!S_ISDIR(info.st_mode)) { - return false; - } - } else { - // create parent directories - const int ret = mkdir(subpath.c_str(), 0755); - if (ret != 0) { - return false; - } - } - - pos_slash += 1; - } - - return true; -#endif // _WIN32 -} - -std::string get_cache_directory() { - std::string cache_directory = ""; - if (getenv("LLAMA_CACHE")) { - cache_directory = std::getenv("LLAMA_CACHE"); - if (cache_directory.back() != DIRECTORY_SEPARATOR) { - cache_directory += DIRECTORY_SEPARATOR; - } - } else { -#ifdef __linux__ - if (std::getenv("XDG_CACHE_HOME")) { - cache_directory = std::getenv("XDG_CACHE_HOME"); - } else { - cache_directory = std::getenv("HOME") + std::string("/.cache/"); - } -#elif defined(__APPLE__) - cache_directory = std::getenv("HOME") + std::string("/Library/Caches/"); -#elif defined(_WIN32) - cache_directory = std::getenv("APPDATA"); -#endif // __linux__ - cache_directory += "llama.cpp"; - cache_directory += DIRECTORY_SEPARATOR; - } - return cache_directory; -} - -void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector & data) { - if (data.empty()) { - fprintf(stream, "%s:\n", prop_name); - return; - } - - fprintf(stream, "%s: [", prop_name); - for (size_t i = 0; i < data.size() - 1; ++i) { - fprintf(stream, "%e, ", data[i]); - } - fprintf(stream, "%e]\n", data.back()); -} - -void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector & data) { - if (data.empty()) { - fprintf(stream, "%s:\n", prop_name); - return; - } - - fprintf(stream, "%s: [", prop_name); - for (size_t i = 0; i < data.size() - 1; ++i) { - fprintf(stream, "%d, ", data[i]); - } - fprintf(stream, "%d]\n", data.back()); -} - -void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) { - std::string data_str(data == NULL ? "" : data); - - if (data_str.empty()) { - fprintf(stream, "%s:\n", prop_name); - return; - } - - size_t pos_start = 0; - size_t pos_found = 0; - - if (std::isspace(data_str[0]) || std::isspace(data_str.back())) { - data_str = std::regex_replace(data_str, std::regex("\n"), "\\n"); - data_str = std::regex_replace(data_str, std::regex("\""), "\\\""); - data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)"); - data_str = "\"" + data_str + "\""; - fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); - return; - } - - if (data_str.find('\n') == std::string::npos) { - fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); - return; - } - - fprintf(stream, "%s: |\n", prop_name); - while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) { - fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str()); - pos_start = pos_found + 1; - } -} - -std::string get_sortable_timestamp() { - using clock = std::chrono::system_clock; - - const clock::time_point current_time = clock::now(); - const time_t as_time_t = clock::to_time_t(current_time); - char timestamp_no_ns[100]; - std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t)); - - const int64_t ns = std::chrono::duration_cast( - current_time.time_since_epoch() % 1000000000).count(); - char timestamp_ns[11]; - snprintf(timestamp_ns, 11, "%09" PRId64, ns); - - return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns); -} - -void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx, - const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { - const llama_sampling_params & sparams = params.sparams; - - fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); - fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); - fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); - fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); - fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false"); - fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); - fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); - fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); - fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); - fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false"); - fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false"); - fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); - fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false"); - fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); - fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); - fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); - fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); - fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); - fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); - fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); - fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); - fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); - fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false"); - -#ifdef NDEBUG - fprintf(stream, "debug: false\n"); -#else - fprintf(stream, "debug: true\n"); -#endif // NDEBUG - - fprintf(stream, "model_desc: %s\n", model_desc); - fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx))); - -#ifdef __OPTIMIZE__ - fprintf(stream, "optimize: true\n"); -#else - fprintf(stream, "optimize: false\n"); -#endif // __OPTIMIZE__ - - fprintf(stream, "time: %s\n", timestamp.c_str()); - - fprintf(stream, "\n"); - fprintf(stream, "###############\n"); - fprintf(stream, "# User Inputs #\n"); - fprintf(stream, "###############\n"); - fprintf(stream, "\n"); - - fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str()); - fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch); - dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str()); - fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale); - fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks); - fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); - fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); - fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); - fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); - fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq); - dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str()); - fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); - fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); - fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); - - const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx))); - const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY; - fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false"); - - dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str()); - fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false"); - dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str()); - fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false"); - fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false"); - fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false"); - fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false"); - fprintf(stream, "keep: %d # default: 0\n", params.n_keep); - fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); - - fprintf(stream, "logit_bias:\n"); - for (std::pair lb : sparams.logit_bias) { - if (ignore_eos && lb.first == logit_bias_eos->first) { - continue; - } - fprintf(stream, " %d: %f", lb.first, lb.second); - } - - fprintf(stream, "lora:\n"); - for (std::tuple la : params.lora_adapter) { - if (std::get<1>(la) != 1.0f) { - continue; - } - fprintf(stream, " - %s\n", std::get<0>(la).c_str()); - } - fprintf(stream, "lora_scaled:\n"); - for (std::tuple la : params.lora_adapter) { - if (std::get<1>(la) == 1.0f) { - continue; - } - fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); - } - fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); - fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); - fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); - fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); - fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); - fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); - fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false"); - fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH); - fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); - fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); - fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers); - fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); - fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs); - fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); - fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false"); - fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); - fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); - fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present); - dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str()); - fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str()); - fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); - fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); - dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens); - fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false"); - fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat); - - fprintf(stream, "reverse_prompt:\n"); - for (std::string ap : params.antiprompt) { - size_t pos = 0; - while ((pos = ap.find('\n', pos)) != std::string::npos) { - ap.replace(pos, 1, "\\n"); - pos += 1; - } - - fprintf(stream, " - %s\n", ap.c_str()); - } - - fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base); - fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); - fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed); - fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); - fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); - fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false"); - fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); - - const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices()); - dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector); - - fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); - fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency()); - fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); - fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); - fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); - fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); - fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); - fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); -} - // // KV cache utils // -void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { +void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+"; printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", @@ -2807,7 +2521,7 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { printf("\n=== Done dumping\n"); } -void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { +void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", @@ -2855,6 +2569,10 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { printf("\n=== Done dumping\n"); } +// +// Embedding utils +// + void llama_embd_normalize(const float * inp, float * out, int n) { double sum = 0.0; for (int i = 0; i < n; i++) { @@ -3039,3 +2757,225 @@ llama_control_vector_data llama_control_vector_load(const std::vector & data) { + if (data.empty()) { + fprintf(stream, "%s:\n", prop_name); + return; + } + + fprintf(stream, "%s: [", prop_name); + for (size_t i = 0; i < data.size() - 1; ++i) { + fprintf(stream, "%e, ", data[i]); + } + fprintf(stream, "%e]\n", data.back()); +} + +void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector & data) { + if (data.empty()) { + fprintf(stream, "%s:\n", prop_name); + return; + } + + fprintf(stream, "%s: [", prop_name); + for (size_t i = 0; i < data.size() - 1; ++i) { + fprintf(stream, "%d, ", data[i]); + } + fprintf(stream, "%d]\n", data.back()); +} + +void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) { + std::string data_str(data == NULL ? "" : data); + + if (data_str.empty()) { + fprintf(stream, "%s:\n", prop_name); + return; + } + + size_t pos_start = 0; + size_t pos_found = 0; + + if (std::isspace(data_str[0]) || std::isspace(data_str.back())) { + data_str = std::regex_replace(data_str, std::regex("\n"), "\\n"); + data_str = std::regex_replace(data_str, std::regex("\""), "\\\""); + data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)"); + data_str = "\"" + data_str + "\""; + fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); + return; + } + + if (data_str.find('\n') == std::string::npos) { + fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); + return; + } + + fprintf(stream, "%s: |\n", prop_name); + while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) { + fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str()); + pos_start = pos_found + 1; + } +} + +void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx, + const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { + const llama_sampling_params & sparams = params.sparams; + + fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); + fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); + fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); + fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false"); + fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); + fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false"); + fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false"); + fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); + fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false"); + fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); + fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); + fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); + fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); + fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); + fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); + fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); + fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); + fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false"); + +#ifdef NDEBUG + fprintf(stream, "debug: false\n"); +#else + fprintf(stream, "debug: true\n"); +#endif // NDEBUG + + fprintf(stream, "model_desc: %s\n", model_desc); + fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx))); + +#ifdef __OPTIMIZE__ + fprintf(stream, "optimize: true\n"); +#else + fprintf(stream, "optimize: false\n"); +#endif // __OPTIMIZE__ + + fprintf(stream, "time: %s\n", timestamp.c_str()); + + fprintf(stream, "\n"); + fprintf(stream, "###############\n"); + fprintf(stream, "# User Inputs #\n"); + fprintf(stream, "###############\n"); + fprintf(stream, "\n"); + + fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str()); + fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch); + yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str()); + fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale); + fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks); + fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); + fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); + fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); + fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); + fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq); + yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str()); + fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); + fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); + fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); + + const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx))); + const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY; + fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false"); + + yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str()); + fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false"); + yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str()); + fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false"); + fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false"); + fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false"); + fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false"); + fprintf(stream, "keep: %d # default: 0\n", params.n_keep); + fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); + + fprintf(stream, "logit_bias:\n"); + for (std::pair lb : sparams.logit_bias) { + if (ignore_eos && lb.first == logit_bias_eos->first) { + continue; + } + fprintf(stream, " %d: %f", lb.first, lb.second); + } + + fprintf(stream, "lora:\n"); + for (std::tuple la : params.lora_adapter) { + if (std::get<1>(la) != 1.0f) { + continue; + } + fprintf(stream, " - %s\n", std::get<0>(la).c_str()); + } + fprintf(stream, "lora_scaled:\n"); + for (std::tuple la : params.lora_adapter) { + if (std::get<1>(la) == 1.0f) { + continue; + } + fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); + } + fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); + fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); + fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); + fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); + fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); + fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); + fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false"); + fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH); + fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); + fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); + fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers); + fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); + fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs); + fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); + fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false"); + fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); + fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); + fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present); + yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str()); + fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str()); + fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); + fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); + yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens); + fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false"); + fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat); + + fprintf(stream, "reverse_prompt:\n"); + for (std::string ap : params.antiprompt) { + size_t pos = 0; + while ((pos = ap.find('\n', pos)) != std::string::npos) { + ap.replace(pos, 1, "\\n"); + pos += 1; + } + + fprintf(stream, " - %s\n", ap.c_str()); + } + + fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base); + fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); + fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed); + fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); + fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); + fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false"); + fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); + + const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices()); + yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector); + + fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); + fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency()); + fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); + fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); + fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); + fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); + fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); + fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); +} diff --git a/common/common.h b/common/common.h index a8e5e50e6..f68f3c297 100644 --- a/common/common.h +++ b/common/common.h @@ -27,7 +27,7 @@ #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) #define print_build_info() do { \ - fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \ + fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ } while(0) @@ -35,14 +35,18 @@ // build info extern int LLAMA_BUILD_NUMBER; -extern char const *LLAMA_COMMIT; -extern char const *LLAMA_COMPILER; -extern char const *LLAMA_BUILD_TARGET; +extern char const * LLAMA_COMMIT; +extern char const * LLAMA_COMPILER; +extern char const * LLAMA_BUILD_TARGET; struct llama_control_vector_load_info; -int get_math_cpu_count(); -int32_t get_num_physical_cores(); +// +// CPU utils +// + +int32_t cpu_get_num_physical_cores(); +int32_t cpu_get_num_math(); // // CLI argument parsing @@ -51,7 +55,7 @@ int32_t get_num_physical_cores(); struct gpt_params { uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed - int32_t n_threads = get_math_cpu_count(); + int32_t n_threads = cpu_get_num_math(); int32_t n_threads_draft = -1; int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) int32_t n_threads_batch_draft = -1; @@ -179,33 +183,34 @@ struct gpt_params { void gpt_params_handle_model_default(gpt_params & params); -bool parse_kv_override(const char * data, std::vector & overrides); +bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params); +bool gpt_params_parse (int argc, char ** argv, gpt_params & params); +bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param); +void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params); -bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params); - -void gpt_print_usage(int argc, char ** argv, const gpt_params & params); - -bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param); - -std::string get_system_info(const gpt_params & params); - -std::string gpt_random_prompt(std::mt19937 & rng); - -void process_escapes(std::string& input); - -bool validate_file_name(const std::string & filename); +std::string gpt_params_get_system_info(const gpt_params & params); // // String utils // -std::vector sampler_types_from_names(const std::vector & names, bool allow_alt_names); -std::vector sampler_types_from_chars(const std::string & names_string); std::vector string_split(std::string input, char separator); + std::string string_strip(const std::string & str); -std::string sampler_type_to_name_string(llama_sampler_type sampler_type); +std::string string_get_sortable_timestamp(); +std::string string_random_prompt(std::mt19937 & rng); + +bool string_parse_kv_override(const char * data, std::vector & overrides); +void string_process_escapes(std::string & input); + +// +// Filesystem utils +// + +bool fs_validate_filename(const std::string & filename); +bool fs_create_directory_with_parents(const std::string & path); + +std::string fs_get_cache_directory(); // // Model utils @@ -276,30 +281,15 @@ std::string llama_detokenize_bpe( // defaults to true when model type is SPM, otherwise false. bool llama_should_add_bos_token(const llama_model * model); -// -// YAML utils -// - -bool create_directory_with_parents(const std::string & path); -std::string get_cache_directory(); -void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector & data); -void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector & data); -void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data); -std::string get_sortable_timestamp(); - -void dump_non_result_info_yaml( - FILE * stream, const gpt_params & params, const llama_context * lctx, - const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); - // // KV cache utils // // Dump the KV cache view with the number of sequences per cell. -void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80); +void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80); // Dump the KV cache view showing individual sequences in each cell (long output). -void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40); +void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40); // // Embedding utils @@ -333,6 +323,20 @@ llama_control_vector_data llama_control_vector_load(const std::vector & data); +void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector & data); +void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data); + +void yaml_dump_non_result_info( + FILE * stream, const gpt_params & params, const llama_context * lctx, + const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); + diff --git a/common/sampling.cpp b/common/sampling.cpp index 7fc2e2158..f1f803516 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -125,7 +125,7 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) { std::string result = "CFG -> Penalties "; if (params.mirostat == 0) { for (auto sampler_type : params.samplers_sequence) { - const auto sampler_type_name = sampler_type_to_name_string(sampler_type); + const auto sampler_type_name = llama_sampling_type_to_str(sampler_type); if (!sampler_type_name.empty()) { result += "-> " + sampler_type_name + " "; } @@ -137,6 +137,87 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) { return result; } +std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) { + switch (sampler_type) { + case llama_sampler_type::TOP_K: return "top_k"; + case llama_sampler_type::TFS_Z: return "tfs_z"; + case llama_sampler_type::TYPICAL_P: return "typical_p"; + case llama_sampler_type::TOP_P: return "top_p"; + case llama_sampler_type::MIN_P: return "min_p"; + case llama_sampler_type::TEMPERATURE: return "temperature"; + default : return ""; + } +} + +std::vector llama_sampling_types_from_names(const std::vector & names, bool allow_alt_names) { + std::unordered_map sampler_canonical_name_map { + {"top_k", llama_sampler_type::TOP_K}, + {"top_p", llama_sampler_type::TOP_P}, + {"typical_p", llama_sampler_type::TYPICAL_P}, + {"min_p", llama_sampler_type::MIN_P}, + {"tfs_z", llama_sampler_type::TFS_Z}, + {"temperature", llama_sampler_type::TEMPERATURE} + }; + + // since samplers names are written multiple ways + // make it ready for both system names and input names + std::unordered_map sampler_alt_name_map { + {"top-k", llama_sampler_type::TOP_K}, + {"top-p", llama_sampler_type::TOP_P}, + {"nucleus", llama_sampler_type::TOP_P}, + {"typical-p", llama_sampler_type::TYPICAL_P}, + {"typical", llama_sampler_type::TYPICAL_P}, + {"min-p", llama_sampler_type::MIN_P}, + {"tfs-z", llama_sampler_type::TFS_Z}, + {"tfs", llama_sampler_type::TFS_Z}, + {"temp", llama_sampler_type::TEMPERATURE} + }; + + std::vector sampler_types; + sampler_types.reserve(names.size()); + for (const auto & name : names) + { + auto sampler_item = sampler_canonical_name_map.find(name); + if (sampler_item != sampler_canonical_name_map.end()) + { + sampler_types.push_back(sampler_item->second); + } + else + { + if (allow_alt_names) + { + sampler_item = sampler_alt_name_map.find(name); + if (sampler_item != sampler_alt_name_map.end()) + { + sampler_types.push_back(sampler_item->second); + } + } + } + } + return sampler_types; +} + +std::vector llama_sampling_types_from_chars(const std::string & names_string) { + std::unordered_map sampler_name_map { + {'k', llama_sampler_type::TOP_K}, + {'p', llama_sampler_type::TOP_P}, + {'y', llama_sampler_type::TYPICAL_P}, + {'m', llama_sampler_type::MIN_P}, + {'f', llama_sampler_type::TFS_Z}, + {'t', llama_sampler_type::TEMPERATURE} + }; + + std::vector sampler_types; + sampler_types.reserve(names_string.size()); + for (const auto & c : names_string) { + const auto sampler_item = sampler_name_map.find(c); + if (sampler_item != sampler_name_map.end()) { + sampler_types.push_back(sampler_item->second); + } + } + return sampler_types; +} + // no reasons to expose this function in header static void sampler_queue( struct llama_context * ctx_main, diff --git a/common/sampling.h b/common/sampling.h index 655732ad1..eeaa53b8b 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -116,6 +116,11 @@ std::string llama_sampling_print(const llama_sampling_params & params); // Print sampling order into a string std::string llama_sampling_order_print(const llama_sampling_params & params); +std::string llama_sampling_type_to_str(llama_sampler_type sampler_type); + +std::vector llama_sampling_types_from_names(const std::vector & names, bool allow_alt_names); +std::vector llama_sampling_types_from_chars(const std::string & names_string); + // this is a common sampling function used across the examples for convenience // it can serve as a starting point for implementing your own sampling function // Note: When using multiple sequences, it is the caller's responsibility to call diff --git a/common/train.cpp b/common/train.cpp index 0dbfd24df..2d41a1d29 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -1380,7 +1380,7 @@ bool consume_common_train_arg( void finish_processing_train_args(struct train_params_common * params) { if (params->escape) { - process_escapes(params->sample_start); + string_process_escapes(params->sample_start); } } diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index be30d20bf..591bc6e57 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -48,7 +48,7 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; } - process_escapes(params.prompt); + string_process_escapes(params.prompt); // init LLM diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 0c921ed69..004399b5f 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -80,7 +80,7 @@ int main(int argc, char ** argv) { std::mt19937 rng(params.seed); if (params.random_prompt) { - params.prompt = gpt_random_prompt(rng); + params.prompt = string_random_prompt(rng); } llama_backend_init(); @@ -107,7 +107,7 @@ int main(int argc, char ** argv) { // print system information { fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", get_system_info(params).c_str()); + fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); } // split the prompt into lines diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index e670d3769..51d67d6d9 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -152,7 +152,7 @@ int main(int argc, char ** argv) { std::mt19937 rng(params.seed); if (params.random_prompt) { - params.prompt = gpt_random_prompt(rng); + params.prompt = string_random_prompt(rng); } llama_backend_init(); @@ -176,7 +176,7 @@ int main(int argc, char ** argv) { // print system information { fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", get_system_info(params).c_str()); + fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); } bool OK = run(ctx, params); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 82b19fc4f..25a2351cc 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -598,7 +598,7 @@ int main(int argc, char ** argv) { std::mt19937 rng(params.seed); if (params.random_prompt) { - params.prompt = gpt_random_prompt(rng); + params.prompt = string_random_prompt(rng); } sparams.dataset = params.prompt_file; @@ -667,7 +667,7 @@ int main(int argc, char ** argv) { // print system information { fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", get_system_info(params).c_str()); + fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); } bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index afac145f6..539f78184 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -50,9 +50,9 @@ static void write_logfile( return; } - const std::string timestamp = get_sortable_timestamp(); + const std::string timestamp = string_get_sortable_timestamp(); - const bool success = create_directory_with_parents(params.logdir); + const bool success = fs_create_directory_with_parents(params.logdir); if (!success) { fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str()); @@ -70,7 +70,7 @@ static void write_logfile( fprintf(logfile, "binary: infill\n"); char model_desc[128]; llama_model_desc(model, model_desc, sizeof(model_desc)); - dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc); + yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc); fprintf(logfile, "\n"); fprintf(logfile, "######################\n"); @@ -78,8 +78,8 @@ static void write_logfile( fprintf(logfile, "######################\n"); fprintf(logfile, "\n"); - dump_string_yaml_multiline(logfile, "output", output.c_str()); - dump_vector_int_yaml(logfile, "output_tokens", output_tokens); + yaml_dump_string_multiline(logfile, "output", output.c_str()); + yaml_dump_vector_int(logfile, "output_tokens", output_tokens); llama_dump_timing_info_yaml(logfile, ctx); fclose(logfile); @@ -236,7 +236,7 @@ int main(int argc, char ** argv) { // print system information { LOG_TEE("\n"); - LOG_TEE("%s\n", get_system_info(params).c_str()); + LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); } const bool add_bos = llama_should_add_bos_token(model); GGML_ASSERT(llama_add_eos_token(model) != 1); @@ -621,8 +621,8 @@ int main(int argc, char ** argv) { if (params.escape) { //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here - process_escapes(params.input_prefix); - process_escapes(params.input_suffix); + string_process_escapes(params.input_prefix); + string_process_escapes(params.input_suffix); } suff_rm_leading_spc = params.escape; if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 6bb1f70c3..2afdb3abd 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -200,7 +200,7 @@ static const cmd_params cmd_params_defaults = { /* n_ubatch */ {512}, /* type_k */ {GGML_TYPE_F16}, /* type_v */ {GGML_TYPE_F16}, - /* n_threads */ {get_math_cpu_count()}, + /* n_threads */ {cpu_get_num_math()}, /* n_gpu_layers */ {99}, /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* main_gpu */ {0}, diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index a6d67e5d7..c974900f2 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -290,7 +290,7 @@ int main(int argc, char ** argv) { #endif // LOG_DISABLE_LOGS if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { - gpt_print_usage(argc, argv, params); + gpt_params_print_usage(argc, argv, params); show_additional_info(argc, argv); return 1; } diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 9c3540b20..54f060a85 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -174,7 +174,7 @@ int main(int argc, char ** argv) { // debug if (dump_kv_cache) { llama_kv_cache_view_update(ctx, &kvc_view); - dump_kv_cache_view_seqs(kvc_view, 40); + llama_kv_cache_dump_view_seqs(kvc_view, 40); } // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/ diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index eebbd00a5..83dbee91a 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -121,7 +121,7 @@ int main(int argc, char ** argv){ // debug if (dump_kv_cache) { llama_kv_cache_view_update(ctx, &kvc_view); - dump_kv_cache_view_seqs(kvc_view, 40); + llama_kv_cache_dump_view_seqs(kvc_view, 40); } // print current draft sequence diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 832b51ee0..791dc61a7 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -60,9 +60,9 @@ static void write_logfile( return; } - const std::string timestamp = get_sortable_timestamp(); + const std::string timestamp = string_get_sortable_timestamp(); - const bool success = create_directory_with_parents(params.logdir); + const bool success = fs_create_directory_with_parents(params.logdir); if (!success) { fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str()); @@ -80,7 +80,7 @@ static void write_logfile( fprintf(logfile, "binary: main\n"); char model_desc[128]; llama_model_desc(model, model_desc, sizeof(model_desc)); - dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc); + yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc); fprintf(logfile, "\n"); fprintf(logfile, "######################\n"); @@ -88,8 +88,8 @@ static void write_logfile( fprintf(logfile, "######################\n"); fprintf(logfile, "\n"); - dump_string_yaml_multiline(logfile, "output", output.c_str()); - dump_vector_int_yaml(logfile, "output_tokens", output_tokens); + yaml_dump_string_multiline(logfile, "output", output.c_str()); + yaml_dump_vector_int(logfile, "output_tokens", output_tokens); llama_dump_timing_info_yaml(logfile, ctx); fclose(logfile); @@ -181,7 +181,7 @@ int main(int argc, char ** argv) { std::mt19937 rng(params.seed); if (params.random_prompt) { - params.prompt = gpt_random_prompt(rng); + params.prompt = string_random_prompt(rng); } LOG("%s: llama backend init\n", __func__); @@ -219,7 +219,7 @@ int main(int argc, char ** argv) { // print system information { LOG_TEE("\n"); - LOG_TEE("%s\n", get_system_info(params).c_str()); + LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); } std::string path_session = params.path_prompt_cache; @@ -879,7 +879,7 @@ int main(int argc, char ** argv) { embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end()); } if (params.escape) { - process_escapes(buffer); + string_process_escapes(buffer); } const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 7c5595d6e..c731abb72 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -210,7 +210,7 @@ int main(int argc, char ** argv) { while (true) { if (dump_kv_cache) { llama_kv_cache_view_update(ctx, &kvc_view); - dump_kv_cache_view_seqs(kvc_view, 40); + llama_kv_cache_dump_view_seqs(kvc_view, 40); } llama_batch_clear(batch); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index bae014e6f..30e5e282e 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -44,9 +44,9 @@ static void write_logfile( return; } - const std::string timestamp = get_sortable_timestamp(); + const std::string timestamp = string_get_sortable_timestamp(); - const bool success = create_directory_with_parents(params.logdir); + const bool success = fs_create_directory_with_parents(params.logdir); if (!success) { fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str()); @@ -64,7 +64,7 @@ static void write_logfile( fprintf(logfile, "binary: main\n"); char model_desc[128]; llama_model_desc(model, model_desc, sizeof(model_desc)); - dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc); + yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc); fprintf(logfile, "\n"); fprintf(logfile, "######################\n"); @@ -72,9 +72,9 @@ static void write_logfile( fprintf(logfile, "######################\n"); fprintf(logfile, "\n"); - dump_vector_float_yaml(logfile, "logits", results.logits); + yaml_dump_vector_float(logfile, "logits", results.logits); fprintf(logfile, "ppl_value: %f\n", results.ppl_value); - dump_vector_float_yaml(logfile, "probs", results.probs); + yaml_dump_vector_float(logfile, "probs", results.probs); llama_dump_timing_info_yaml(logfile, ctx); fclose(logfile); @@ -2007,7 +2007,7 @@ int main(int argc, char ** argv) { std::mt19937 rng(params.seed); if (params.random_prompt) { - params.prompt = gpt_random_prompt(rng); + params.prompt = string_random_prompt(rng); } llama_backend_init(); @@ -2035,7 +2035,7 @@ int main(int argc, char ** argv) { // print system information { fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", get_system_info(params).c_str()); + fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); } struct results_perplexity results; diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index cbb452334..28584e14b 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -259,7 +259,7 @@ int main(int argc, char ** argv) { usage(argv[0]); } } else if (strcmp(argv[arg_idx], "--override-kv") == 0) { - if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) { + if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) { usage(argv[0]); } } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) { diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 5ba71e76a..4e7530706 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -11,7 +11,7 @@ struct retrieval_params { }; static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) { - gpt_print_usage(argc, argv, gpt_params); + gpt_params_print_usage(argc, argv, gpt_params); printf("retrieval options:\n"); printf(" --context-file FNAME file containing context to embed.\n"); printf(" specify multiple files by providing --context-file option multiple times.\n"); @@ -226,7 +226,7 @@ int main(int argc, char ** argv) { // print system information { fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", get_system_info(params).c_str()); + fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); } // max batch size diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6af5cb96e..e9904263d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1019,7 +1019,7 @@ struct server_context { sampler_names.emplace_back(sampler_name); } } - slot.sparams.samplers_sequence = sampler_types_from_names(sampler_names, false); + slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false); } else { slot.sparams.samplers_sequence = default_sparams.samplers_sequence; } @@ -1256,7 +1256,7 @@ struct server_context { std::vector samplers_sequence; samplers_sequence.reserve(slot.sparams.samplers_sequence.size()); for (const auto & sampler_type : slot.sparams.samplers_sequence) { - samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type)); + samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type)); } return json { @@ -2852,7 +2852,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, invalid_param = true; break; } - if (!parse_kv_override(argv[i], params.kv_overrides)) { + if (!string_parse_kv_override(argv[i], params.kv_overrides)) { fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); invalid_param = true; break; @@ -3310,7 +3310,7 @@ int main(int argc, char ** argv) { const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) { json request_data = json::parse(req.body); std::string filename = request_data.at("filename"); - if (!validate_file_name(filename)) { + if (!fs_validate_filename(filename)) { res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); return; } @@ -3340,7 +3340,7 @@ int main(int argc, char ** argv) { const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) { json request_data = json::parse(req.body); std::string filename = request_data.at("filename"); - if (!validate_file_name(filename)) { + if (!fs_validate_filename(filename)) { res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); return; } From 197ff91462dd05bb9a3be03578114abf0c355536 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 22 May 2024 20:05:38 +0300 Subject: [PATCH 17/34] build : remove zig (#7471) --- .github/workflows/zig-build.yml | 29 ------ build.zig | 172 -------------------------------- 2 files changed, 201 deletions(-) delete mode 100644 .github/workflows/zig-build.yml delete mode 100644 build.zig diff --git a/.github/workflows/zig-build.yml b/.github/workflows/zig-build.yml deleted file mode 100644 index 747c35cc0..000000000 --- a/.github/workflows/zig-build.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: Zig CI - -on: - pull_request: - push: - branches: - - master - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - build: - strategy: - fail-fast: false - matrix: - runs-on: [ubuntu-latest, macos-latest, windows-latest] - runs-on: ${{ matrix.runs-on }} - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - fetch-depth: 0 - - uses: goto-bus-stop/setup-zig@v2 - with: - version: 0.11.0 - - name: Build Summary - run: zig build --summary all -freference-trace diff --git a/build.zig b/build.zig deleted file mode 100644 index 267c976b1..000000000 --- a/build.zig +++ /dev/null @@ -1,172 +0,0 @@ -// Compatible with Zig Version 0.11.0 -const std = @import("std"); -const ArrayList = std.ArrayList; -const Compile = std.Build.Step.Compile; -const ConfigHeader = std.Build.Step.ConfigHeader; -const Mode = std.builtin.Mode; -const CrossTarget = std.zig.CrossTarget; - -const Maker = struct { - builder: *std.build.Builder, - target: CrossTarget, - optimize: Mode, - enable_lto: bool, - - include_dirs: ArrayList([]const u8), - cflags: ArrayList([]const u8), - cxxflags: ArrayList([]const u8), - objs: ArrayList(*Compile), - - fn addInclude(m: *Maker, dir: []const u8) !void { - try m.include_dirs.append(dir); - } - fn addProjectInclude(m: *Maker, path: []const []const u8) !void { - try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path)); - } - fn addCFlag(m: *Maker, flag: []const u8) !void { - try m.cflags.append(flag); - } - fn addCxxFlag(m: *Maker, flag: []const u8) !void { - try m.cxxflags.append(flag); - } - fn addFlag(m: *Maker, flag: []const u8) !void { - try m.addCFlag(flag); - try m.addCxxFlag(flag); - } - - fn init(builder: *std.build.Builder) !Maker { - const target = builder.standardTargetOptions(.{}); - const zig_version = @import("builtin").zig_version_string; - const commit_hash = try std.ChildProcess.exec( - .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } }, - ); - try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt( - \\int LLAMA_BUILD_NUMBER = {}; - \\char const *LLAMA_COMMIT = "{s}"; - \\char const *LLAMA_COMPILER = "Zig {s}"; - \\char const *LLAMA_BUILD_TARGET = "{s}"; - \\ - , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) })); - var m = Maker{ - .builder = builder, - .target = target, - .optimize = builder.standardOptimizeOption(.{}), - .enable_lto = false, - .include_dirs = ArrayList([]const u8).init(builder.allocator), - .cflags = ArrayList([]const u8).init(builder.allocator), - .cxxflags = ArrayList([]const u8).init(builder.allocator), - .objs = ArrayList(*Compile).init(builder.allocator), - }; - - try m.addCFlag("-std=c11"); - try m.addCxxFlag("-std=c++11"); - try m.addProjectInclude(&.{}); - try m.addProjectInclude(&.{"common"}); - return m; - } - - fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile { - const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize }); - if (o.target.getAbi() != .msvc) - o.defineCMacro("_GNU_SOURCE", null); - - if (std.mem.endsWith(u8, src, ".c")) { - o.addCSourceFiles(&.{src}, m.cflags.items); - o.linkLibC(); - } else { - o.addCSourceFiles(&.{src}, m.cxxflags.items); - if (o.target.getAbi() == .msvc) { - o.linkLibC(); // need winsdk + crt - } else { - // linkLibCpp already add (libc++ + libunwind + libc) - o.linkLibCpp(); - } - } - for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i }); - o.want_lto = m.enable_lto; - return o; - } - - fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile { - const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize }); - e.addCSourceFiles(&.{src}, m.cxxflags.items); - for (deps) |d| e.addObject(d); - for (m.objs.items) |o| e.addObject(o); - for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i }); - - // https://github.com/ziglang/zig/issues/15448 - if (e.target.getAbi() == .msvc) { - e.linkLibC(); // need winsdk + crt - } else { - // linkLibCpp already add (libc++ + libunwind + libc) - e.linkLibCpp(); - } - m.builder.installArtifact(e); - e.want_lto = m.enable_lto; - return e; - } -}; - -pub fn build(b: *std.build.Builder) !void { - var make = try Maker.init(b); - make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false; - - const ggml = make.obj("ggml", "ggml.c"); - const sgemm = make.obj("sgemm", "sgemm.cpp"); - const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); - const ggml_backend = make.obj("ggml-backend", "ggml-backend.c"); - const ggml_quants = make.obj("ggml-quants", "ggml-quants.c"); - const unicode = make.obj("unicode", "unicode.cpp"); - const unicode_data = make.obj("unicode-data", "unicode-data.cpp"); - const llama = make.obj("llama", "llama.cpp"); - const buildinfo = make.obj("common", "common/build-info.cpp"); - const common = make.obj("common", "common/common.cpp"); - const console = make.obj("console", "common/console.cpp"); - const sampling = make.obj("sampling", "common/sampling.cpp"); - const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp"); - const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp"); - const train = make.obj("train", "common/train.cpp"); - const clip = make.obj("clip", "examples/llava/clip.cpp"); - const llava = make.obj("llava", "examples/llava/llava.cpp"); - - _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, sampling, json_schema_to_grammar, buildinfo, console, grammar_parser }); - _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, sampling, json_schema_to_grammar, buildinfo }); - _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, sampling, json_schema_to_grammar, buildinfo }); - _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, sampling, json_schema_to_grammar, buildinfo }); - _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, sampling, json_schema_to_grammar, buildinfo, train }); - _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train }); - - const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, sampling, json_schema_to_grammar, buildinfo, grammar_parser, clip, llava }); - if (server.target.isWindows()) { - server.linkSystemLibrary("ws2_32"); - } - - const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" }; - for (server_assets) |asset| { - const input_path = b.fmt("examples/server/public/{s}", .{asset}); - const output_path = b.fmt("examples/server/{s}.hpp", .{asset}); - - // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`: - - const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize)); - defer b.allocator.free(input); - - var buf = std.ArrayList(u8).init(b.allocator); - defer buf.deinit(); - - for (input) |byte| { - try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte}); - } - - var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_"); - defer b.allocator.free(name); - std.mem.replaceScalar(u8, name, '.', '_'); - - try std.fs.cwd().writeFile(output_path, b.fmt( - "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n", - .{ name, buf.items, name, input.len }, - )); - - std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path }); - } -} From 1e374365d170b7f692fd7753c145e21bc14486c8 Mon Sep 17 00:00:00 2001 From: HanishKVC Date: Wed, 22 May 2024 23:23:21 +0530 Subject: [PATCH 18/34] SimpleChat: a simple and dumb web front end for testing /chat/completions and /completions end points and try chat (#7350) * SimpleChat: Add a skeletal html page Contains a div placeholder for showing chat messages till now a text-input for allowing user to enter next chat message/query to the model. a submit button to allow sending of the user entered message and chat till now to the model. * SimpleChat: A js skeleton with SimpleChat class Allows maintaining an array of chat message. Allows adding chat message (from any of the roles be it system, user, assistant, ...) Allows showing chat messages till now, in a given div element. * SimpleChat: request_json, globals, startme * SimpleChatJS: Roles Class, submitClick Define Role class with static members corresponding to the roles. Update startme to * Get hold of the ui elements. * Attach a click handler to submit button, which adds the user input to xchats array and shows the chat messages till now in chat div element. Trap DOMContentLoaded to trigger startme * SimpleChat:HTML: Bring in the js file * SimpleChat: Rather value wrt input text element * SimpleChat: Also add completions related prompt * SimpleChat: Use common helper logic wrt json data * SimpleChat: Move handling of submit request into its own func * SimpleChat: Try handshake with llm over its web service endpoint * SimpleChat:JS: Extract model response and show to user * SimpleChat:JS: Messages/Prompt, indicate working to end user * SimpleChat: Try keep input element in view * SimpleChat: Diff user/assistant msgs, Make input wider Also show a default message to user Also add some metas * SimpleChat: Move into its own sub directory to avoid confusion * SimpleChat:sh: Add simple shell script to run python3 http.server So one needs to run the llm server locally then run this script and access it using a local browser * SimpleChat:JS: Try trap enter key press wrt input text field So user can either press submit button or press enter key * SimpleChat: Allow user to select chat or completion mode * SimpleChat: Dont submit if already submitted and waiting Also make chat the default selection wrt mode * SimpleChat:JS: Handle difference in response Try read the assistance response from appropriate field in the response got. Also examples/server seems to return the response in a slightly different field, so try account for that also. * SimpleChat:JS: Force completion mode be single message by default * SimpleChat: Add a simple readme file * SimpleChat:HTML: Cleanup/structure UI a bit, Add input for system * SimpleChat:Allow system prompt to be set, if provided before user * SimpleChat: Ignore empty user input, without trimming * SimpleChat:Alert user if they provide sysprompt late or change it * SimpleChat: Move handling systemprompt into its own func * SimpleChat:HTML: Add a style for system role message * SimpleChat: Update the readme file * SimpleChat:CSS: Move style info into its own css file To keep it simple, clean and seperate so that things are not unnecessarily cluttered. * SimpleChat:CSS: Allow for chat div to be scrollable * SimpleChat:JS: Try ensure the last entry in chat is visible Needed because now only the chat div is scrollable and not the full page. In last commit the chat div size was fixed to 75% vertical height, so the full page no longer scrolls, so the old bring user-input element to view wont work, instead now the last element in the chat div should be brought into view. * SimpleChat:JS: bottom of element visible, Set focus to user input As the generated text could be multiple lines and occupy more space that the full scrollable div's vertical space, make the bottom of the last element (which can be such a generated text) in the div visible by scrolling. Ensure that the user input box has focus * SimpleChat: Update notes a bit. Try keep browser happy Avoid browser quirk mode with DOCTYPE. Help with accessibility a bit by specifying the language explicitly. Specify the char encoding explicitly, inturn utf-8 is a safe bet, even with intermixing of languages if reqd in future. Add a cache-control http-equiv meta tag, which in all probability will be ignored. Defer js loading and execution, just for fun and future, not that critical here as it stands now. * SimpleChat:HTML:Group user input+btn together; Note about multichat * SimpleChat:JS: Allow for changing system prompt anytime for future * SimpleChat:Readme: Note about handle_systemprompt begin/anytime * SimpleChat:HTML: Add viewport meta for better mobile friendliness Without this the page content may look too small. * SimpleChat:HtmlCss: Cleanup UI flow set margin wrt vmin rather than vw or vh so portrait/landscape ok. Use flex and flex-grow to put things on the same line as well as distribute available space as needed. Given two main elements/line so it remains simple. In each line have one element with grows and one sits with a basic comfortably fixed size. * SimpleChat: textarea for multiline user chat, inturn shift+enter 4 enter * SimpleChat: Make vertical layout better responsive (flex based) Also needed to make things cleaner and properly usable whether landscape or portrait, after changing to multiline textarea rather than single line user input. Avoid hardcoding the chat-till-now display area height, instead make it a flex-growable within a flex column of ui elements within a fixed vertical area. * SimpleChat: Rename simplechat.html to index.html, update readme Instead of providing a seperate shell script, update the readme wrt how to run/use this web front end. * SimpleChat: Screen fixed view and scrolling, Printing full * SimpleChat:JS:CI: Avoid space at end of jsdoc param line * SimpleChat:JS: MultiChat initial skeleton Will help maintain multiple independent chats in future * SimpleChat:JS: Move system prompt begin/anytime into SimpleChat * SimpleChat:JS:Keep MultiChatUI simple for now Worry about different chats with different servers for later. * SimpleChat:JS: Move handle submit into MultiChat, build on same Create an instance of MultiChatUI and inturn a instance of chat session, which is what the UI will inturn work on. * SimpleChat:JS: Move to dictionary of SimpleChat, instead of array * SimpleChat: Move ui elements into MultiChatUI, Update el IDs Move ui elements into MultiChatUI, so that current handleUserSubmit doesnt need to take the element arguments. Also in future, when user is allowed to switch between different chat sessions, the UI can be updated as needed by using the elements in UI already known to MultiChatUI instance. Rename the element ids' so that they follow a common convention, as well as one can identify what the element represents in a more consistant manner. * SimpleChat:MCUI:Show available chat sessions, try switch btw them Previous commits brought in / consolidated existing logic into MultiChatUI class. Now start adding logic towards multichat support * show buttons indicating available chat sessions * on sessin button click, try switch to that session * SimpleChat:MCUI: Store and use current chat session id Also allow to switch chat session optionally, wrt some of the related helpers. setup for two chat sessions by default. * SimpleChat:MCUI: Delay enabling user-input to avoid race Re-enable user-input, only after response to a user query has been updated to the chat-div. This ensures that if user tries to switch chat session, it wont be allowed till chat-request-response flow is done. * SimpleChat: Take care of system prompt Helper to get the latest system prompt and inturn use same to set the system prompt ui, when switching. Ensure that system prompt is set if and when enter key is pressed. * SimpleChat:GetSystemLatest, fix a oversight. * SimpleChat:MCUI: Allow selected chat-session btn to be highlighted Also have a general helper for setting class of children. * SimpleChat:Cleanup corners Show system prompt in chat space, when it is set by pressing enter, as a feedback to user. Alert user, if they try to switch chat session in the middle of waiting for a response from the ai model. * SimpleChat:MCUI: Ensure req-resp failure doesnt lock up things * SimpleChat:MCUI: Support for new chat sessions Also a general create button helper. * SimpleChat:MCUI: CreateSessionBtn helper, use wrt NewChat Also fix a oversight wrt using stale data wrt the list of chat sessions. * SimpleChat:MCUI: NewChat btn first before existing chat sessions * SimpleChat:MCUI:CornerCases:Skip new chat, show only if current Skip NewChat if user cancels or if one waiting for response from the ai model. Dont show a chat with newly got ai model response, if current chat session has changed, some how. Chat session shouldnt be allowed to change, if there is a pending response, but still as a additional sanity check. * SimpleChat: Update readme, title, show usage if no chat to show * SimpleChat: Cleanup the log/dialog messages a bit --- examples/server/public_simplechat/index.html | 52 ++ examples/server/public_simplechat/readme.md | 81 +++ .../server/public_simplechat/simplechat.css | 61 +++ .../server/public_simplechat/simplechat.js | 478 ++++++++++++++++++ 4 files changed, 672 insertions(+) create mode 100644 examples/server/public_simplechat/index.html create mode 100644 examples/server/public_simplechat/readme.md create mode 100644 examples/server/public_simplechat/simplechat.css create mode 100644 examples/server/public_simplechat/simplechat.js diff --git a/examples/server/public_simplechat/index.html b/examples/server/public_simplechat/index.html new file mode 100644 index 000000000..1eb390b85 --- /dev/null +++ b/examples/server/public_simplechat/index.html @@ -0,0 +1,52 @@ + + + + SimpleChat (LlamaCPP, ...) + + + + + + + + + + +
+ +
+

SimpleChat

+
+ + +
+
+ +
+ +
+
+ + +
+ +
+
+

Enter the system prompt above, before entering/submitting any user query.

+

Enter your text to the ai assistant below.

+

Use shift+enter for inserting enter.

+

Refresh the page to start over fresh.

+
+ +
+
+ + +
+ +
+ + diff --git a/examples/server/public_simplechat/readme.md b/examples/server/public_simplechat/readme.md new file mode 100644 index 000000000..5ac8258f2 --- /dev/null +++ b/examples/server/public_simplechat/readme.md @@ -0,0 +1,81 @@ + +# SimpleChat + +by Humans for All. + + +## overview + +This simple web frontend, allows triggering/testing the server's /completions or /chat/completions endpoints +in a simple way with minimal code from a common code base. Inturn additionally it tries to allow single or +multiple independent back and forth chatting to an extent, with the ai llm model at a basic level, with their +own system prompts. + +The UI follows a responsive web design so that the layout can adapt to available display space in a usable +enough manner, in general. + +NOTE: Given that the idea is for basic minimal testing, it doesnt bother with any model context length and +culling of old messages from the chat. + +NOTE: It doesnt set any parameters other than temperature for now. However if someone wants they can update +the js file as needed. + + +## usage + +One could run this web frontend directly using server itself or if anyone is thinking of adding a built in web +frontend to configure the server over http(s) or so, then run this web frontend using something like python's +http module. + +### running using examples/server + +bin/server -m path/model.gguf --path ../examples/server/public_simplechat [--port PORT] + +### running using python3's server module + +first run examples/server +* bin/server -m path/model.gguf + +next run this web front end in examples/server/public_simplechat +* cd ../examples/server/public_simplechat +* python3 -m http.server PORT + +### using the front end + +Open this simple web front end from your local browser +* http://127.0.0.1:PORT/index.html + +Once inside +* Select between chat and completion mode. By default it is set to chat mode. +* If you want to provide a system prompt, then ideally enter it first, before entering any user query. + * if chat.add_system_begin is used + * you cant change the system prompt, after it is has been submitted once along with user query. + * you cant set a system prompt, after you have submitted any user query + * if chat.add_system_anytime is used + * one can change the system prompt any time during chat, by changing the contents of system prompt. + * inturn the updated/changed system prompt will be inserted into the chat session. + * this allows for the subsequent user chatting to be driven by the new system prompt set above. +* Enter your query and either press enter or click on the submit button. + If you want to insert enter (\n) as part of your chat/query to ai model, use shift+enter. +* Wait for the logic to communicate with the server and get the response. + * the user is not allowed to enter any fresh query during this time. + * the user input box will be disabled and a working message will be shown in it. +* just refresh the page, to reset wrt the chat history and or system prompt and start afresh. +* Using NewChat one can start independent chat sessions. + * two independent chat sessions are setup by default. + + +## Devel note + +Sometimes the browser may be stuborn with caching of the file, so your updates to html/css/js +may not be visible. Also remember that just refreshing/reloading page in browser or for that +matter clearing site data, dont directly override site caching in all cases. Worst case you may +have to change port. Or in dev tools of browser, you may be able to disable caching fully. + +Concept of multiple chat sessions with different servers, as well as saving and restoring of +those across browser usage sessions, can be woven around the SimpleChat/MultiChatUI class and +its instances relatively easily, however given the current goal of keeping this simple, it has +not been added, for now. + +By switching between chat.add_system_begin/anytime, one can control whether one can change +the system prompt, anytime during the conversation or only at the beginning. diff --git a/examples/server/public_simplechat/simplechat.css b/examples/server/public_simplechat/simplechat.css new file mode 100644 index 000000000..d45f50a95 --- /dev/null +++ b/examples/server/public_simplechat/simplechat.css @@ -0,0 +1,61 @@ +/** + * the styling of the simplechat web frontend + * by Humans for All + */ + +#fullbody { + height: 98vh; +} + +.heading { + background-color: lightgray; +} + +.session-selected { + background-color: lightblue; +} + +.role-system { + background-color: lightblue; +} +.role-user { + background-color: lightgray; +} + +.flex-grow { + flex-grow: 1; +} +.float-right { + float: right; +} + +#chat-div { + overflow: scroll; + flex-grow: 1; + flex-shrink: 1; + min-height: 40vh; +} +button { + min-width: 8vw; +} + +.sameline { + display: flex; + flex-direction: row; +} +.samecolumn { + display: flex; + flex-direction: column; +} + +* { + margin: 0.6vmin; +} + +@media print { + + #fullbody { + height: auto; + } + +} diff --git a/examples/server/public_simplechat/simplechat.js b/examples/server/public_simplechat/simplechat.js new file mode 100644 index 000000000..3fc4dbc20 --- /dev/null +++ b/examples/server/public_simplechat/simplechat.js @@ -0,0 +1,478 @@ +// @ts-check +// A simple completions and chat/completions test related web front end logic +// by Humans for All + +class Roles { + static System = "system"; + static User = "user"; + static Assistant = "assistant"; +} + +class ApiEP { + static Chat = "chat"; + static Completion = "completion"; +} + +let gUsageMsg = ` +

Enter the system prompt above, before entering/submitting any user query.

+

Enter your text to the ai assistant below.

+

Use shift+enter for inserting enter.

+

Refresh the page to start over fresh.

+`; + +class SimpleChat { + + constructor() { + /** + * Maintain in a form suitable for common LLM web service chat/completions' messages entry + * @type {{role: string, content: string}[]} + */ + this.xchat = []; + this.iLastSys = -1; + } + + /** + * Add an entry into xchat + * @param {string} role + * @param {string|undefined|null} content + */ + add(role, content) { + if ((content == undefined) || (content == null) || (content == "")) { + return false; + } + this.xchat.push( {role: role, content: content} ); + if (role == Roles.System) { + this.iLastSys = this.xchat.length - 1; + } + return true; + } + + /** + * Show the contents in the specified div + * @param {HTMLDivElement} div + * @param {boolean} bClear + */ + show(div, bClear=true) { + if (bClear) { + div.replaceChildren(); + } + let last = undefined; + for(const x of this.xchat) { + let entry = document.createElement("p"); + entry.className = `role-${x.role}`; + entry.innerText = `${x.role}: ${x.content}`; + div.appendChild(entry); + last = entry; + } + if (last !== undefined) { + last.scrollIntoView(false); + } else { + if (bClear) { + div.innerHTML = gUsageMsg; + } + } + } + + /** + * Add needed fields wrt json object to be sent wrt LLM web services completions endpoint + * Convert the json into string. + * @param {Object} obj + */ + request_jsonstr(obj) { + obj["temperature"] = 0.7; + return JSON.stringify(obj); + } + + /** + * Return a string form of json object suitable for chat/completions + */ + request_messages_jsonstr() { + let req = { + messages: this.xchat, + } + return this.request_jsonstr(req); + } + + /** + * Return a string form of json object suitable for /completions + */ + request_prompt_jsonstr() { + let prompt = ""; + for(const chat of this.xchat) { + prompt += `${chat.role}: ${chat.content}\n`; + } + let req = { + prompt: prompt, + } + return this.request_jsonstr(req); + } + + /** + * Allow setting of system prompt, but only at begining. + * @param {string} sysPrompt + * @param {string} msgTag + */ + add_system_begin(sysPrompt, msgTag) { + if (this.xchat.length == 0) { + if (sysPrompt.length > 0) { + return this.add(Roles.System, sysPrompt); + } + } else { + if (sysPrompt.length > 0) { + if (this.xchat[0].role !== Roles.System) { + console.error(`ERRR:SimpleChat:SC:${msgTag}:You need to specify system prompt before any user query, ignoring...`); + } else { + if (this.xchat[0].content !== sysPrompt) { + console.error(`ERRR:SimpleChat:SC:${msgTag}:You cant change system prompt, mid way through, ignoring...`); + } + } + } + } + return false; + } + + /** + * Allow setting of system prompt, at any time. + * @param {string} sysPrompt + * @param {string} msgTag + */ + add_system_anytime(sysPrompt, msgTag) { + if (sysPrompt.length <= 0) { + return false; + } + + if (this.iLastSys < 0) { + return this.add(Roles.System, sysPrompt); + } + + let lastSys = this.xchat[this.iLastSys].content; + if (lastSys !== sysPrompt) { + return this.add(Roles.System, sysPrompt); + } + return false; + } + + /** + * Retrieve the latest system prompt. + */ + get_system_latest() { + if (this.iLastSys == -1) { + return ""; + } + let sysPrompt = this.xchat[this.iLastSys].content; + return sysPrompt; + } + +} + + +let gBaseURL = "http://127.0.0.1:8080"; +let gChatURL = { + 'chat': `${gBaseURL}/chat/completions`, + 'completion': `${gBaseURL}/completions`, +} +const gbCompletionFreshChatAlways = true; + + +/** + * Set the class of the children, based on whether it is the idSelected or not. + * @param {HTMLDivElement} elBase + * @param {string} idSelected + * @param {string} classSelected + * @param {string} classUnSelected + */ +function el_children_config_class(elBase, idSelected, classSelected, classUnSelected="") { + for(let child of elBase.children) { + if (child.id == idSelected) { + child.className = classSelected; + } else { + child.className = classUnSelected; + } + } +} + +/** + * Create button and set it up. + * @param {string} id + * @param {(this: HTMLButtonElement, ev: MouseEvent) => any} callback + * @param {string | undefined} name + * @param {string | undefined} innerText + */ +function el_create_button(id, callback, name=undefined, innerText=undefined) { + if (!name) { + name = id; + } + if (!innerText) { + innerText = id; + } + let btn = document.createElement("button"); + btn.id = id; + btn.name = name; + btn.innerText = innerText; + btn.addEventListener("click", callback); + return btn; +} + + +class MultiChatUI { + + constructor() { + /** @type {Object} */ + this.simpleChats = {}; + /** @type {string} */ + this.curChatId = ""; + + // the ui elements + this.elInSystem = /** @type{HTMLInputElement} */(document.getElementById("system-in")); + this.elDivChat = /** @type{HTMLDivElement} */(document.getElementById("chat-div")); + this.elBtnUser = /** @type{HTMLButtonElement} */(document.getElementById("user-btn")); + this.elInUser = /** @type{HTMLInputElement} */(document.getElementById("user-in")); + this.elSelectApiEP = /** @type{HTMLSelectElement} */(document.getElementById("api-ep")); + this.elDivSessions = /** @type{HTMLDivElement} */(document.getElementById("sessions-div")); + + this.validate_element(this.elInSystem, "system-in"); + this.validate_element(this.elDivChat, "chat-div"); + this.validate_element(this.elInUser, "user-in"); + this.validate_element(this.elSelectApiEP, "api-ep"); + this.validate_element(this.elDivChat, "sessions-div"); + } + + /** + * Check if the element got + * @param {HTMLElement | null} el + * @param {string} msgTag + */ + validate_element(el, msgTag) { + if (el == null) { + throw Error(`ERRR:SimpleChat:MCUI:${msgTag} element missing in html...`); + } else { + console.debug(`INFO:SimpleChat:MCUI:${msgTag} Id[${el.id}] Name[${el["name"]}]`); + } + } + + /** + * Reset user input ui. + * * clear user input + * * enable user input + * * set focus to user input + */ + ui_reset_userinput() { + this.elInUser.value = ""; + this.elInUser.disabled = false; + this.elInUser.focus(); + } + + /** + * Setup the needed callbacks wrt UI, curChatId to defaultChatId and + * optionally switch to specified defaultChatId. + * @param {string} defaultChatId + * @param {boolean} bSwitchSession + */ + setup_ui(defaultChatId, bSwitchSession=false) { + + this.curChatId = defaultChatId; + if (bSwitchSession) { + this.handle_session_switch(this.curChatId); + } + + this.elBtnUser.addEventListener("click", (ev)=>{ + if (this.elInUser.disabled) { + return; + } + this.handle_user_submit(this.curChatId, this.elSelectApiEP.value).catch((/** @type{Error} */reason)=>{ + let msg = `ERRR:SimpleChat\nMCUI:HandleUserSubmit:${this.curChatId}\n${reason.name}:${reason.message}`; + console.debug(msg.replace("\n", ":")); + alert(msg); + this.ui_reset_userinput(); + }); + }); + + this.elInUser.addEventListener("keyup", (ev)=> { + // allow user to insert enter into their message using shift+enter. + // while just pressing enter key will lead to submitting. + if ((ev.key === "Enter") && (!ev.shiftKey)) { + this.elBtnUser.click(); + ev.preventDefault(); + } + }); + + this.elInSystem.addEventListener("keyup", (ev)=> { + // allow user to insert enter into the system prompt using shift+enter. + // while just pressing enter key will lead to setting the system prompt. + if ((ev.key === "Enter") && (!ev.shiftKey)) { + let chat = this.simpleChats[this.curChatId]; + chat.add_system_anytime(this.elInSystem.value, this.curChatId); + chat.show(this.elDivChat); + ev.preventDefault(); + } + }); + + } + + /** + * Setup a new chat session and optionally switch to it. + * @param {string} chatId + * @param {boolean} bSwitchSession + */ + new_chat_session(chatId, bSwitchSession=false) { + this.simpleChats[chatId] = new SimpleChat(); + if (bSwitchSession) { + this.handle_session_switch(chatId); + } + } + + /** + * Handle user query submit request, wrt specified chat session. + * @param {string} chatId + * @param {string} apiEP + */ + async handle_user_submit(chatId, apiEP) { + + let chat = this.simpleChats[chatId]; + + chat.add_system_anytime(this.elInSystem.value, chatId); + + let content = this.elInUser.value; + if (!chat.add(Roles.User, content)) { + console.debug(`WARN:SimpleChat:MCUI:${chatId}:HandleUserSubmit:Ignoring empty user input...`); + return; + } + chat.show(this.elDivChat); + + let theBody; + let theUrl = gChatURL[apiEP] + if (apiEP == ApiEP.Chat) { + theBody = chat.request_messages_jsonstr(); + } else { + theBody = chat.request_prompt_jsonstr(); + } + + this.elInUser.value = "working..."; + this.elInUser.disabled = true; + console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:${theUrl}:ReqBody:${theBody}`); + let resp = await fetch(theUrl, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: theBody, + }); + + let respBody = await resp.json(); + console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`); + let assistantMsg; + if (apiEP == ApiEP.Chat) { + assistantMsg = respBody["choices"][0]["message"]["content"]; + } else { + try { + assistantMsg = respBody["choices"][0]["text"]; + } catch { + assistantMsg = respBody["content"]; + } + } + chat.add(Roles.Assistant, assistantMsg); + if (chatId == this.curChatId) { + chat.show(this.elDivChat); + } else { + console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`); + } + // Purposefully clear at end rather than begin of this function + // so that one can switch from chat to completion mode and sequece + // in a completion mode with multiple user-assistant chat data + // from before to be sent/occur once. + if ((apiEP == ApiEP.Completion) && (gbCompletionFreshChatAlways)) { + chat.xchat.length = 0; + } + this.ui_reset_userinput(); + } + + /** + * Show buttons for NewChat and available chat sessions, in the passed elDiv. + * If elDiv is undefined/null, then use this.elDivSessions. + * Take care of highlighting the selected chat-session's btn. + * @param {HTMLDivElement | undefined} elDiv + */ + show_sessions(elDiv=undefined) { + if (!elDiv) { + elDiv = this.elDivSessions; + } + elDiv.replaceChildren(); + // Btn for creating new chat session + let btnNew = el_create_button("New CHAT", (ev)=> { + if (this.elInUser.disabled) { + console.error(`ERRR:SimpleChat:MCUI:NewChat:Current session [${this.curChatId}] awaiting response, ignoring request...`); + alert("ERRR:SimpleChat\nMCUI:NewChat\nWait for response to pending query, before starting new chat session"); + return; + } + let chatId = `Chat${Object.keys(this.simpleChats).length}`; + let chatIdGot = prompt("INFO:SimpleChat\nMCUI:NewChat\nEnter id for new chat session", chatId); + if (!chatIdGot) { + console.error("ERRR:SimpleChat:MCUI:NewChat:Skipping based on user request..."); + return; + } + this.new_chat_session(chatIdGot, true); + this.create_session_btn(elDiv, chatIdGot); + el_children_config_class(elDiv, chatIdGot, "session-selected", ""); + }); + elDiv.appendChild(btnNew); + // Btns for existing chat sessions + let chatIds = Object.keys(this.simpleChats); + for(let cid of chatIds) { + let btn = this.create_session_btn(elDiv, cid); + if (cid == this.curChatId) { + btn.className = "session-selected"; + } + } + } + + create_session_btn(elDiv, cid) { + let btn = el_create_button(cid, (ev)=>{ + let target = /** @type{HTMLButtonElement} */(ev.target); + console.debug(`DBUG:SimpleChat:MCUI:SessionClick:${target.id}`); + if (this.elInUser.disabled) { + console.error(`ERRR:SimpleChat:MCUI:SessionClick:${target.id}:Current session [${this.curChatId}] awaiting response, ignoring switch...`); + alert("ERRR:SimpleChat\nMCUI:SessionClick\nWait for response to pending query, before switching"); + return; + } + this.handle_session_switch(target.id); + el_children_config_class(elDiv, target.id, "session-selected", ""); + }); + elDiv.appendChild(btn); + return btn; + } + + /** + * Switch ui to the specified chatId and set curChatId to same. + * @param {string} chatId + */ + async handle_session_switch(chatId) { + let chat = this.simpleChats[chatId]; + if (chat == undefined) { + console.error(`ERRR:SimpleChat:MCUI:HandleSessionSwitch:${chatId} missing...`); + return; + } + this.elInSystem.value = chat.get_system_latest(); + this.elInUser.value = ""; + chat.show(this.elDivChat); + this.elInUser.focus(); + this.curChatId = chatId; + console.log(`INFO:SimpleChat:MCUI:HandleSessionSwitch:${chatId} entered...`); + } + +} + + +let gMuitChat; +const gChatIds = [ "Default", "Other" ]; + +function startme() { + console.log("INFO:SimpleChat:StartMe:Starting..."); + gMuitChat = new MultiChatUI(); + for (let cid of gChatIds) { + gMuitChat.new_chat_session(cid); + } + gMuitChat.setup_ui(gChatIds[0]); + gMuitChat.show_sessions(); +} + +document.addEventListener("DOMContentLoaded", startme); From cd93a28cb1446319af5e2f4b416174c3a8e43546 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 23 May 2024 00:31:20 +0200 Subject: [PATCH 19/34] CUDA: fix FA out-of-bounds reads (#7479) --- ggml-cuda/fattn-tile-f16.cu | 2 +- ggml-cuda/fattn-tile-f32.cu | 2 +- ggml-cuda/fattn-vec-f16.cu | 6 +++--- ggml-cuda/fattn-vec-f32.cu | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml-cuda/fattn-tile-f16.cu b/ggml-cuda/fattn-tile-f16.cu index 586d469c0..cdb5eaff7 100644 --- a/ggml-cuda/fattn-tile-f16.cu +++ b/ggml-cuda/fattn-tile-f16.cu @@ -83,7 +83,7 @@ static __global__ void flash_attn_tile_ext_f16( for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { const int i = i0 + threadIdx.x; - const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i]; + const float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f); Q_h2[j][i] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y); } } diff --git a/ggml-cuda/fattn-tile-f32.cu b/ggml-cuda/fattn-tile-f32.cu index b6ef8eb48..5a3de2918 100644 --- a/ggml-cuda/fattn-tile-f32.cu +++ b/ggml-cuda/fattn-tile-f32.cu @@ -79,7 +79,7 @@ static __global__ void flash_attn_tile_ext_f32( #pragma unroll for (int i0 = 0; i0 < D; i0 += 2*WARP_SIZE) { - float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i0/2 + threadIdx.x]; + float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i0/2 + threadIdx.x] : make_float2(0.0f, 0.0f); Q_f[j][i0 + 0*WARP_SIZE + threadIdx.x] = tmp.x * scale; Q_f[j][i0 + 1*WARP_SIZE + threadIdx.x] = tmp.y * scale; } diff --git a/ggml-cuda/fattn-vec-f16.cu b/ggml-cuda/fattn-vec-f16.cu index 7352dcabf..808e8f362 100644 --- a/ggml-cuda/fattn-vec-f16.cu +++ b/ggml-cuda/fattn-vec-f16.cu @@ -94,7 +94,7 @@ static __global__ void flash_attn_vec_ext_f16( for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { const int i = i0 + threadIdx.x; - const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i]; + const float2 tmp = ncols <= 2 || ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f); Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y); } } @@ -212,7 +212,7 @@ static __global__ void flash_attn_vec_ext_f16( #pragma unroll for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) { - if (ic0 + j_VKQ >= ne01) { + if (ncols > 2 && ic0 + j_VKQ >= ne01) { break; } @@ -227,7 +227,7 @@ static __global__ void flash_attn_vec_ext_f16( dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val; } - if (parallel_blocks != 1 && tid < ncols && ic0 + tid < ne01) { + if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) { dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]); } #else diff --git a/ggml-cuda/fattn-vec-f32.cu b/ggml-cuda/fattn-vec-f32.cu index 11476a6c0..b4652301b 100644 --- a/ggml-cuda/fattn-vec-f32.cu +++ b/ggml-cuda/fattn-vec-f32.cu @@ -91,7 +91,7 @@ static __global__ void flash_attn_vec_ext_f32( for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { const int i = i0 + threadIdx.x; - Q_h2[j][i0/WARP_SIZE] = Q_f2[j*(nb01/sizeof(float2)) + i]; + Q_h2[j][i0/WARP_SIZE] = ncols <= 2 || ic0 + j ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f); Q_h2[j][i0/WARP_SIZE].x *= scale; Q_h2[j][i0/WARP_SIZE].y *= scale; } @@ -200,7 +200,7 @@ static __global__ void flash_attn_vec_ext_f32( #pragma unroll for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) { - if (ic0 + j_VKQ >= ne01) { + if (ncols > 2 && ic0 + j_VKQ >= ne01) { break; } @@ -215,7 +215,7 @@ static __global__ void flash_attn_vec_ext_f32( dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val; } - if (parallel_blocks != 1 && tid < ncols && ic0 + tid < ne01) { + if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) { dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]); } } From fbf777d2b9c30e7569e3d1c149501c1e31d9b5b9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 May 2024 09:43:24 +0300 Subject: [PATCH 20/34] main : minor (#7462) --- examples/main/main.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 791dc61a7..09fa85fce 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -474,12 +474,12 @@ int main(int argc, char ** argv) { LOG_TEE("\n\n"); if (params.interactive) { - const char *control_message; + const char * control_message; if (params.multiline_input) { - control_message = " - To return control to LLaMa, end your input with '\\'.\n" + control_message = " - To return control to the AI, end your input with '\\'.\n" " - To return control without starting a new line, end your input with '/'.\n"; } else { - control_message = " - Press Return to return control to LLaMa.\n" + control_message = " - Press Return to return control to the AI.\n" " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } From 1b1e27cb49158123ef4902aa41eb368c9e76e6a1 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 23 May 2024 08:59:59 +0200 Subject: [PATCH 21/34] Update vulkan rope implementation to support frequency factors (#7475) --- ggml-vulkan-shaders.hpp | 1235 ++++++++++++++++++----------------- ggml-vulkan.cpp | 259 +++++--- ggml_vk_generate_shaders.py | 7 +- 3 files changed, 825 insertions(+), 676 deletions(-) diff --git a/ggml-vulkan-shaders.hpp b/ggml-vulkan-shaders.hpp index 70c4043d3..e8cb5f52c 100644 --- a/ggml-vulkan-shaders.hpp +++ b/ggml-vulkan-shaders.hpp @@ -78882,342 +78882,15 @@ const uint64_t rope_f32_len = 3072; unsigned char rope_neox_f16_data[] = { 0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00, -0x5f,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00, +0x75,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00, 0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00, 0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,0x47,0x4c,0x53,0x4c, 0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,0x00,0x00,0x00,0x00, 0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00, -0x0f,0x00,0x0a,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00, +0x0f,0x00,0x0b,0x00,0x05,0x00,0x00,0x00,0x04,0x00,0x00,0x00, 0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,0x2c,0x00,0x00,0x00, 0x68,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x9e,0x00,0x00,0x00, -0xcf,0x00,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00, -0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x01,0x00,0x00, -0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x29,0x00,0x00,0x00, -0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00, -0x2a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00, -0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00, -0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x02,0x00,0x00,0x00, -0x23,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00, -0x2a,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00, -0x0c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00, -0x04,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00, -0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x05,0x00,0x00,0x00, -0x23,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00, -0x2a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00, -0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00, -0x07,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00, -0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x08,0x00,0x00,0x00, -0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00, -0x2a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00, -0x30,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x2a,0x00,0x00,0x00, -0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x68,0x00,0x00,0x00, -0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x47,0x00,0x04,0x00, -0x95,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00, -0x48,0x00,0x04,0x00,0x96,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x96,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x47,0x00,0x03,0x00,0x96,0x00,0x00,0x00,0x02,0x00,0x00,0x00, -0x47,0x00,0x04,0x00,0x98,0x00,0x00,0x00,0x22,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x98,0x00,0x00,0x00, -0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00, -0x9b,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00, -0x48,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x9c,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x47,0x00,0x03,0x00,0x9c,0x00,0x00,0x00,0x02,0x00,0x00,0x00, -0x47,0x00,0x04,0x00,0x9e,0x00,0x00,0x00,0x22,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9e,0x00,0x00,0x00, -0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00, -0xcc,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00, -0x48,0x00,0x04,0x00,0xcd,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xcd,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x47,0x00,0x03,0x00,0xcd,0x00,0x00,0x00,0x02,0x00,0x00,0x00, -0x47,0x00,0x04,0x00,0xcf,0x00,0x00,0x00,0x22,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xcf,0x00,0x00,0x00, -0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00, -0x16,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00, -0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00, -0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x16,0x00,0x03,0x00, -0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x15,0x00,0x04,0x00, -0x07,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x17,0x00,0x00,0x00, -0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00, -0x1b,0x00,0x00,0x00,0x6f,0x12,0x83,0x3a,0x2b,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x00,0x00,0x80,0x3f, -0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00, -0x28,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x1c,0x00,0x04,0x00, -0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x28,0x00,0x00,0x00, -0x1e,0x00,0x0c,0x00,0x2a,0x00,0x00,0x00,0x07,0x00,0x00,0x00, -0x07,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x07,0x00,0x00,0x00, -0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00, -0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00, -0x20,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,0x09,0x00,0x00,0x00, -0x2a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x2b,0x00,0x00,0x00, -0x2c,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00, -0x2d,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00, -0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00, -0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x2f,0x00,0x00,0x00, -0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, -0x2d,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x02,0x00,0x00,0x00, -0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x39,0x00,0x00,0x00, -0x05,0x00,0x00,0x00,0x14,0x00,0x02,0x00,0x3c,0x00,0x00,0x00, -0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x41,0x00,0x00,0x00, -0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00, -0x42,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, -0x2d,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x01,0x00,0x00,0x00, -0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00, -0xcd,0xcc,0xcc,0x3d,0x17,0x00,0x04,0x00,0x66,0x00,0x00,0x00, -0x07,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00, -0x67,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x66,0x00,0x00,0x00, -0x3b,0x00,0x04,0x00,0x67,0x00,0x00,0x00,0x68,0x00,0x00,0x00, -0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00, -0x69,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00, -0x6a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00, -0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x6f,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x73,0x00,0x00,0x00, -0x09,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x16,0x00,0x03,0x00, -0x94,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1d,0x00,0x03,0x00, -0x95,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x1e,0x00,0x03,0x00, -0x96,0x00,0x00,0x00,0x95,0x00,0x00,0x00,0x20,0x00,0x04,0x00, -0x97,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x96,0x00,0x00,0x00, -0x3b,0x00,0x04,0x00,0x97,0x00,0x00,0x00,0x98,0x00,0x00,0x00, -0x0c,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x9b,0x00,0x00,0x00, -0x94,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x9c,0x00,0x00,0x00, -0x9b,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x9d,0x00,0x00,0x00, -0x0c,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0x3b,0x00,0x04,0x00, -0x9d,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, -0x20,0x00,0x04,0x00,0xa1,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, -0x94,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00, -0xbc,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, -0x2d,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x09,0x00,0x00,0x00, -0x1d,0x00,0x03,0x00,0xcc,0x00,0x00,0x00,0x2d,0x00,0x00,0x00, -0x1e,0x00,0x03,0x00,0xcd,0x00,0x00,0x00,0xcc,0x00,0x00,0x00, -0x20,0x00,0x04,0x00,0xce,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, -0xcd,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xce,0x00,0x00,0x00, -0xcf,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00, -0xd1,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00, -0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0xda,0x00,0x00,0x00, -0x08,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00, -0x15,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x2c,0x00,0x06,0x00, -0x66,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0x69,0x00,0x00,0x00, -0x15,0x01,0x00,0x00,0x69,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,0x00,0x00,0x00,0x3f, -0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00, -0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x17,0x01,0x00,0x00, -0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,0x6f,0x00,0x00,0x00, -0x18,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x18,0x01,0x00,0x00, -0x41,0x00,0x05,0x00,0x6a,0x00,0x00,0x00,0x6b,0x00,0x00,0x00, -0x68,0x00,0x00,0x00,0x69,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x07,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x6b,0x00,0x00,0x00, -0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x6d,0x00,0x00,0x00, -0x6c,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00, -0x6a,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x68,0x00,0x00,0x00, -0x6f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00, -0x71,0x00,0x00,0x00,0x70,0x00,0x00,0x00,0x41,0x00,0x05,0x00, -0x73,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x2c,0x00,0x00,0x00, -0x42,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00, -0x75,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0xae,0x00,0x05,0x00, -0x3c,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x6d,0x00,0x00,0x00, -0x75,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x78,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x76,0x00,0x00,0x00, -0x77,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0xf8,0x00,0x02,0x00, -0x77,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x17,0x01,0x00,0x00, -0xf8,0x00,0x02,0x00,0x78,0x00,0x00,0x00,0x41,0x00,0x05,0x00, -0x73,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x2c,0x00,0x00,0x00, -0x45,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00, -0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x86,0x00,0x05,0x00, -0x07,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00, -0x7d,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x07,0x00,0x00,0x00, -0x83,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x7d,0x00,0x00,0x00, -0xac,0x00,0x05,0x00,0x3c,0x00,0x00,0x00,0x85,0x00,0x00,0x00, -0x7e,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0xf7,0x00,0x03,0x00, -0x87,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00, -0x85,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x87,0x00,0x00,0x00, -0xf8,0x00,0x02,0x00,0x86,0x00,0x00,0x00,0x84,0x00,0x05,0x00, -0x07,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x71,0x00,0x00,0x00, -0x75,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00, -0x90,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00, -0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x91,0x00,0x00,0x00, -0x8c,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x80,0x00,0x05,0x00, -0x07,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x91,0x00,0x00,0x00, -0x83,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xa1,0x00,0x00,0x00, -0xa2,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x42,0x00,0x00,0x00, -0x93,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x94,0x00,0x00,0x00, -0xa3,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0x41,0x00,0x06,0x00, -0xa1,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x98,0x00,0x00,0x00, -0x42,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x3e,0x00,0x03,0x00, -0xa4,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x80,0x00,0x05,0x00, -0x07,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0x93,0x00,0x00,0x00, -0x69,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xa1,0x00,0x00,0x00, -0xa9,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0x42,0x00,0x00,0x00, -0xa6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x94,0x00,0x00,0x00, -0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0x41,0x00,0x06,0x00, -0xa1,0x00,0x00,0x00,0xab,0x00,0x00,0x00,0x98,0x00,0x00,0x00, -0x42,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0x3e,0x00,0x03,0x00, -0xab,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xf9,0x00,0x02,0x00, -0x17,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x87,0x00,0x00,0x00, -0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xb1,0x00,0x00,0x00, -0x71,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x84,0x00,0x05,0x00, -0x07,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x7e,0x00,0x00,0x00, -0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00, -0xb6,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0xb5,0x00,0x00,0x00, -0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xb8,0x00,0x00,0x00, -0x83,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x80,0x00,0x05,0x00, -0x07,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0xb6,0x00,0x00,0x00, -0xb8,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x73,0x00,0x00,0x00, -0xbd,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0xbc,0x00,0x00,0x00, -0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0xbe,0x00,0x00,0x00, -0xbd,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00, -0xbf,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0xbe,0x00,0x00,0x00, -0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,0xc2,0x00,0x00,0x00, -0x2c,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0xc2,0x00,0x00,0x00, -0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc5,0x00,0x00,0x00, -0x83,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00, -0xc8,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x7f,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0xc8,0x00,0x00,0x00, -0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,0xc9,0x00,0x00,0x00, -0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xc3,0x00,0x00,0x00, -0xc5,0x00,0x00,0x00,0x5b,0x01,0x00,0x00,0x41,0x00,0x06,0x00, -0xd1,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0xcf,0x00,0x00,0x00, -0x42,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x2d,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0xd2,0x00,0x00,0x00, -0x6f,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xd6,0x00,0x00,0x00, -0xd3,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00, -0xd7,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x33,0x00,0x00,0x00, -0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xd8,0x00,0x00,0x00, -0xd7,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0xd9,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,0xd8,0x00,0x00,0x00, -0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,0xdb,0x00,0x00,0x00, -0x2c,0x00,0x00,0x00,0xda,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0xdb,0x00,0x00,0x00, -0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xde,0x00,0x00,0x00, -0x6d,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0xe0,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x5c,0x01,0x00,0x00, -0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00, -0x01,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,0xdc,0x00,0x00,0x00, -0xe0,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0xe2,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0xe1,0x00,0x00,0x00, -0x6d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0xe5,0x00,0x00,0x00, -0xc9,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00, -0x22,0x01,0x00,0x00,0x2c,0x00,0x00,0x00,0x2e,0x00,0x00,0x00, -0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x23,0x01,0x00,0x00, -0x22,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0x26,0x01,0x00,0x00,0xd8,0x00,0x00,0x00,0xe2,0x00,0x00,0x00, -0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,0x28,0x01,0x00,0x00, -0x2c,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x28,0x01,0x00,0x00, -0xb7,0x00,0x05,0x00,0x3c,0x00,0x00,0x00,0x2a,0x01,0x00,0x00, -0x29,0x01,0x00,0x00,0x20,0x00,0x00,0x00,0xf7,0x00,0x03,0x00, -0x43,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00, -0x2a,0x01,0x00,0x00,0x2b,0x01,0x00,0x00,0x43,0x01,0x00,0x00, -0xf8,0x00,0x02,0x00,0x2b,0x01,0x00,0x00,0x41,0x00,0x06,0x00, -0x2f,0x00,0x00,0x00,0x2c,0x01,0x00,0x00,0x2c,0x00,0x00,0x00, -0x41,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0x2d,0x01,0x00,0x00,0x2c,0x01,0x00,0x00, -0x41,0x00,0x06,0x00,0x2f,0x00,0x00,0x00,0x2e,0x01,0x00,0x00, -0x2c,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x45,0x00,0x00,0x00, -0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x2f,0x01,0x00,0x00, -0x2e,0x01,0x00,0x00,0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00, -0x4f,0x01,0x00,0x00,0xe5,0x00,0x00,0x00,0x17,0x00,0x00,0x00, -0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x50,0x01,0x00,0x00, -0x4f,0x01,0x00,0x00,0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0x51,0x01,0x00,0x00,0x50,0x01,0x00,0x00,0x2d,0x01,0x00,0x00, -0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x52,0x01,0x00,0x00, -0x2f,0x01,0x00,0x00,0x2d,0x01,0x00,0x00,0x0c,0x00,0x07,0x00, -0x06,0x00,0x00,0x00,0x53,0x01,0x00,0x00,0x01,0x00,0x00,0x00, -0x28,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x52,0x01,0x00,0x00, -0x88,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x54,0x01,0x00,0x00, -0x51,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x0c,0x00,0x07,0x00, -0x06,0x00,0x00,0x00,0x56,0x01,0x00,0x00,0x01,0x00,0x00,0x00, -0x28,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x54,0x01,0x00,0x00, -0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x57,0x01,0x00,0x00, -0x01,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x1f,0x00,0x00,0x00, -0x56,0x01,0x00,0x00,0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0x58,0x01,0x00,0x00,0x1f,0x00,0x00,0x00,0x57,0x01,0x00,0x00, -0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x33,0x01,0x00,0x00, -0x58,0x01,0x00,0x00,0x29,0x01,0x00,0x00,0x83,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0x5d,0x01,0x00,0x00,0x57,0x01,0x00,0x00, -0x1f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00, -0x36,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00, -0x5d,0x01,0x00,0x00,0x29,0x01,0x00,0x00,0x1f,0x00,0x00,0x00, -0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x39,0x01,0x00,0x00, -0xe2,0x00,0x00,0x00,0x33,0x01,0x00,0x00,0x0c,0x00,0x08,0x00, -0x06,0x00,0x00,0x00,0x3a,0x01,0x00,0x00,0x01,0x00,0x00,0x00, -0x32,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0x36,0x01,0x00,0x00, -0x39,0x01,0x00,0x00,0x88,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0x3d,0x01,0x00,0x00,0x1f,0x00,0x00,0x00,0xd8,0x00,0x00,0x00, -0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x3e,0x01,0x00,0x00, -0x01,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x3d,0x01,0x00,0x00, -0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00, -0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00, -0x3e,0x01,0x00,0x00,0x1f,0x00,0x00,0x00,0x85,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0x42,0x01,0x00,0x00,0x23,0x01,0x00,0x00, -0x40,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x43,0x01,0x00,0x00, -0xf8,0x00,0x02,0x00,0x43,0x01,0x00,0x00,0xf5,0x00,0x07,0x00, -0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,0x23,0x01,0x00,0x00, -0x87,0x00,0x00,0x00,0x42,0x01,0x00,0x00,0x2b,0x01,0x00,0x00, -0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x59,0x01,0x00,0x00, -0x26,0x01,0x00,0x00,0x87,0x00,0x00,0x00,0x3a,0x01,0x00,0x00, -0x2b,0x01,0x00,0x00,0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00, -0x45,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x0e,0x00,0x00,0x00, -0x59,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0x47,0x01,0x00,0x00,0x45,0x01,0x00,0x00,0x5a,0x01,0x00,0x00, -0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00,0x49,0x01,0x00,0x00, -0x01,0x00,0x00,0x00,0x0d,0x00,0x00,0x00,0x59,0x01,0x00,0x00, -0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4b,0x01,0x00,0x00, -0x49,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,0x41,0x00,0x06,0x00, -0xa1,0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0x9e,0x00,0x00,0x00, -0x42,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x94,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0xf0,0x00,0x00,0x00, -0x73,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xf2,0x00,0x00,0x00, -0xf1,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00, -0xf7,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x17,0x00,0x00,0x00, -0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xf8,0x00,0x00,0x00, -0xb9,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,0x41,0x00,0x06,0x00, -0xa1,0x00,0x00,0x00,0xf9,0x00,0x00,0x00,0x9e,0x00,0x00,0x00, -0x42,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x94,0x00,0x00,0x00,0xfa,0x00,0x00,0x00,0xf9,0x00,0x00,0x00, -0x73,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xfb,0x00,0x00,0x00, -0xfa,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0x03,0x01,0x00,0x00,0xfb,0x00,0x00,0x00,0x4b,0x01,0x00,0x00, -0x7f,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x5e,0x01,0x00,0x00, -0x03,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00, -0x04,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00, -0xf2,0x00,0x00,0x00,0x47,0x01,0x00,0x00,0x5e,0x01,0x00,0x00, -0x73,0x00,0x04,0x00,0x94,0x00,0x00,0x00,0x05,0x01,0x00,0x00, -0x04,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0xa1,0x00,0x00,0x00, -0x06,0x01,0x00,0x00,0x98,0x00,0x00,0x00,0x42,0x00,0x00,0x00, -0xb9,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x06,0x01,0x00,0x00, -0x05,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0x11,0x01,0x00,0x00,0xfb,0x00,0x00,0x00,0x47,0x01,0x00,0x00, -0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,0x12,0x01,0x00,0x00, -0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xf2,0x00,0x00,0x00, -0x4b,0x01,0x00,0x00,0x11,0x01,0x00,0x00,0x73,0x00,0x04,0x00, -0x94,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0x12,0x01,0x00,0x00, -0x41,0x00,0x06,0x00,0xa1,0x00,0x00,0x00,0x14,0x01,0x00,0x00, -0x98,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0xf8,0x00,0x00,0x00, -0x3e,0x00,0x03,0x00,0x14,0x01,0x00,0x00,0x13,0x01,0x00,0x00, -0xf9,0x00,0x02,0x00,0x17,0x01,0x00,0x00,0xf8,0x00,0x02,0x00, -0x17,0x01,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00, - -}; -const uint64_t rope_neox_f16_len = 3876; - -unsigned char rope_neox_f32_data[] = { -0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00, -0x5a,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00, -0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00, -0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30, -0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00, -0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,0x05,0x00,0x00,0x00, -0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00, -0x2c,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x97,0x00,0x00,0x00, -0x9d,0x00,0x00,0x00,0xce,0x00,0x00,0x00,0x10,0x00,0x06,0x00, +0xcf,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,0x10,0x00,0x06,0x00, 0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00, 0x00,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00, 0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00, @@ -79237,51 +78910,61 @@ unsigned char rope_neox_f32_data[] = { 0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00, 0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x2c,0x00,0x00,0x00, 0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x09,0x00,0x00,0x00, -0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x47,0x00,0x03,0x00, -0x2a,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00, -0x68,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00, -0x47,0x00,0x04,0x00,0x94,0x00,0x00,0x00,0x06,0x00,0x00,0x00, -0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x95,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00, -0x95,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x95,0x00,0x00,0x00, -0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x97,0x00,0x00,0x00, -0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00, -0x97,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00, -0x47,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,0x06,0x00,0x00,0x00, -0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x9b,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00, -0x9b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x9b,0x00,0x00,0x00, -0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00, -0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00, -0x9d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x47,0x00,0x04,0x00,0xcb,0x00,0x00,0x00,0x06,0x00,0x00,0x00, -0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xcc,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00, -0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xcc,0x00,0x00,0x00, -0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xce,0x00,0x00,0x00, -0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00, -0xce,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00, -0x47,0x00,0x04,0x00,0x11,0x01,0x00,0x00,0x0b,0x00,0x00,0x00, -0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00, -0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00, -0x16,0x00,0x03,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00, -0x15,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x20,0x00,0x00,0x00, +0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x48,0x00,0x05,0x00, +0x2a,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x23,0x00,0x00,0x00, +0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x2a,0x00,0x00,0x00, +0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x68,0x00,0x00,0x00, +0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x47,0x00,0x04,0x00, +0x95,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00, +0x48,0x00,0x04,0x00,0x96,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x96,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x47,0x00,0x03,0x00,0x96,0x00,0x00,0x00,0x02,0x00,0x00,0x00, +0x47,0x00,0x04,0x00,0x98,0x00,0x00,0x00,0x22,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x98,0x00,0x00,0x00, +0x21,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x47,0x00,0x04,0x00, +0x9b,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00, +0x48,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x9c,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x47,0x00,0x03,0x00,0x9c,0x00,0x00,0x00,0x02,0x00,0x00,0x00, +0x47,0x00,0x04,0x00,0x9e,0x00,0x00,0x00,0x22,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9e,0x00,0x00,0x00, +0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00, +0xcc,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00, +0x48,0x00,0x04,0x00,0xcd,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xcd,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x47,0x00,0x03,0x00,0xcd,0x00,0x00,0x00,0x02,0x00,0x00,0x00, +0x47,0x00,0x04,0x00,0xcf,0x00,0x00,0x00,0x22,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xcf,0x00,0x00,0x00, +0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00, +0xdc,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00, +0x48,0x00,0x04,0x00,0xdd,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0xdd,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x47,0x00,0x03,0x00,0xdd,0x00,0x00,0x00,0x02,0x00,0x00,0x00, +0x47,0x00,0x04,0x00,0xdf,0x00,0x00,0x00,0x22,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xdf,0x00,0x00,0x00, +0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00, +0x2b,0x01,0x00,0x00,0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00, +0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00, +0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x16,0x00,0x03,0x00, +0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x15,0x00,0x04,0x00, +0x07,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x17,0x00,0x00,0x00, +0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x1b,0x00,0x00,0x00,0x6f,0x12,0x83,0x3a,0x2b,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,0x00,0x00,0x80,0x3f, +0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00, -0x17,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x6f,0x12,0x83,0x3a, -0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1f,0x00,0x00,0x00, -0x00,0x00,0x80,0x3f,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00, -0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, -0x07,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x04,0x00,0x00,0x00, -0x1c,0x00,0x04,0x00,0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00, -0x28,0x00,0x00,0x00,0x1e,0x00,0x0c,0x00,0x2a,0x00,0x00,0x00, -0x07,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x06,0x00,0x00,0x00, -0x07,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00, -0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00, -0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x2b,0x00,0x00,0x00, +0x28,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x1c,0x00,0x04,0x00, +0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x28,0x00,0x00,0x00, +0x1e,0x00,0x0d,0x00,0x2a,0x00,0x00,0x00,0x07,0x00,0x00,0x00, +0x07,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x07,0x00,0x00,0x00, +0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00, +0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00, +0x07,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x2b,0x00,0x00,0x00, 0x09,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x3b,0x00,0x04,0x00, 0x2b,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x09,0x00,0x00,0x00, 0x15,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x20,0x00,0x00,0x00, @@ -79306,228 +78989,614 @@ unsigned char rope_neox_f32_data[] = { 0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00, 0x6f,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00, 0x73,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x07,0x00,0x00,0x00, -0x1d,0x00,0x03,0x00,0x94,0x00,0x00,0x00,0x06,0x00,0x00,0x00, -0x1e,0x00,0x03,0x00,0x95,0x00,0x00,0x00,0x94,0x00,0x00,0x00, -0x20,0x00,0x04,0x00,0x96,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, -0x95,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x96,0x00,0x00,0x00, -0x97,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x1d,0x00,0x03,0x00, -0x9a,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x03,0x00, -0x9b,0x00,0x00,0x00,0x9a,0x00,0x00,0x00,0x20,0x00,0x04,0x00, -0x9c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x9b,0x00,0x00,0x00, -0x3b,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,0x9d,0x00,0x00,0x00, -0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xa0,0x00,0x00,0x00, -0x0c,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, -0x2d,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x03,0x00,0x00,0x00, -0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0xc0,0x00,0x00,0x00, -0x09,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0xcb,0x00,0x00,0x00, -0x2d,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0xcc,0x00,0x00,0x00, -0xcb,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xcd,0x00,0x00,0x00, -0x0c,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0x3b,0x00,0x04,0x00, -0xcd,0x00,0x00,0x00,0xce,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, -0x20,0x00,0x04,0x00,0xd0,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, +0x16,0x00,0x03,0x00,0x94,0x00,0x00,0x00,0x10,0x00,0x00,0x00, +0x1d,0x00,0x03,0x00,0x95,0x00,0x00,0x00,0x94,0x00,0x00,0x00, +0x1e,0x00,0x03,0x00,0x96,0x00,0x00,0x00,0x95,0x00,0x00,0x00, +0x20,0x00,0x04,0x00,0x97,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, +0x96,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x97,0x00,0x00,0x00, +0x98,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x1d,0x00,0x03,0x00, +0x9b,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x1e,0x00,0x03,0x00, +0x9c,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0x20,0x00,0x04,0x00, +0x9d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x9c,0x00,0x00,0x00, +0x3b,0x00,0x04,0x00,0x9d,0x00,0x00,0x00,0x9e,0x00,0x00,0x00, +0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xa1,0x00,0x00,0x00, +0x0c,0x00,0x00,0x00,0x94,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x2d,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x03,0x00,0x00,0x00, +0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0xc1,0x00,0x00,0x00, +0x09,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0xcc,0x00,0x00,0x00, +0x2d,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0xcd,0x00,0x00,0x00, +0xcc,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xce,0x00,0x00,0x00, +0x0c,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0x3b,0x00,0x04,0x00, +0xce,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, +0x20,0x00,0x04,0x00,0xd1,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, 0x2d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00, -0xd9,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, -0x07,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0x00,0x01,0x00,0x00, -0x2c,0x00,0x06,0x00,0x66,0x00,0x00,0x00,0x11,0x01,0x00,0x00, -0x69,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0x69,0x00,0x00,0x00, -0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x57,0x01,0x00,0x00, -0x00,0x00,0x00,0x3f,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00, -0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00, -0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00, -0x12,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00, -0x6f,0x00,0x00,0x00,0x13,0x01,0x00,0x00,0xf8,0x00,0x02,0x00, -0x13,0x01,0x00,0x00,0x41,0x00,0x05,0x00,0x6a,0x00,0x00,0x00, -0x6b,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x69,0x00,0x00,0x00, -0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x6c,0x00,0x00,0x00, -0x6b,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00, -0x6d,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0x17,0x00,0x00,0x00, -0x41,0x00,0x05,0x00,0x6a,0x00,0x00,0x00,0x70,0x00,0x00,0x00, -0x68,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x07,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0x70,0x00,0x00,0x00, -0x41,0x00,0x05,0x00,0x73,0x00,0x00,0x00,0x74,0x00,0x00,0x00, -0x2c,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x07,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x74,0x00,0x00,0x00, -0xae,0x00,0x05,0x00,0x3c,0x00,0x00,0x00,0x76,0x00,0x00,0x00, -0x6d,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0xf7,0x00,0x03,0x00, -0x78,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00, -0x76,0x00,0x00,0x00,0x77,0x00,0x00,0x00,0x78,0x00,0x00,0x00, -0xf8,0x00,0x02,0x00,0x77,0x00,0x00,0x00,0xf9,0x00,0x02,0x00, -0x12,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x78,0x00,0x00,0x00, -0x41,0x00,0x05,0x00,0x73,0x00,0x00,0x00,0x7c,0x00,0x00,0x00, -0x2c,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x07,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00, -0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x7e,0x00,0x00,0x00, -0x6d,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x89,0x00,0x05,0x00, -0x07,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x6d,0x00,0x00,0x00, -0x7d,0x00,0x00,0x00,0xac,0x00,0x05,0x00,0x3c,0x00,0x00,0x00, -0x85,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x6f,0x00,0x00,0x00, -0xf7,0x00,0x03,0x00,0x87,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0xfa,0x00,0x04,0x00,0x85,0x00,0x00,0x00,0x86,0x00,0x00,0x00, -0x87,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x86,0x00,0x00,0x00, -0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x8c,0x00,0x00,0x00, -0x71,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x84,0x00,0x05,0x00, -0x07,0x00,0x00,0x00,0x90,0x00,0x00,0x00,0x7e,0x00,0x00,0x00, -0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00, -0x91,0x00,0x00,0x00,0x8c,0x00,0x00,0x00,0x90,0x00,0x00,0x00, -0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x93,0x00,0x00,0x00, -0x91,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x41,0x00,0x06,0x00, -0xa0,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,0x9d,0x00,0x00,0x00, -0x42,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,0xa1,0x00,0x00,0x00, -0x41,0x00,0x06,0x00,0xa0,0x00,0x00,0x00,0xa3,0x00,0x00,0x00, -0x97,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x93,0x00,0x00,0x00, -0x3e,0x00,0x03,0x00,0xa3,0x00,0x00,0x00,0xa2,0x00,0x00,0x00, -0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xa5,0x00,0x00,0x00, -0x93,0x00,0x00,0x00,0x69,0x00,0x00,0x00,0x41,0x00,0x06,0x00, -0xa0,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0x9d,0x00,0x00,0x00, -0x42,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,0xa8,0x00,0x00,0x00, -0x41,0x00,0x06,0x00,0xa0,0x00,0x00,0x00,0xaa,0x00,0x00,0x00, -0x97,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0xa5,0x00,0x00,0x00, -0x3e,0x00,0x03,0x00,0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00, -0xf9,0x00,0x02,0x00,0x12,0x01,0x00,0x00,0xf8,0x00,0x02,0x00, -0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00, -0xb0,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0x75,0x00,0x00,0x00, -0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xb4,0x00,0x00,0x00, +0xd5,0x00,0x00,0x00,0x0a,0x00,0x00,0x00,0x1d,0x00,0x03,0x00, +0xdc,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x1e,0x00,0x03,0x00, +0xdd,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0x20,0x00,0x04,0x00, +0xde,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xdd,0x00,0x00,0x00, +0x3b,0x00,0x04,0x00,0xde,0x00,0x00,0x00,0xdf,0x00,0x00,0x00, +0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xe2,0x00,0x00,0x00, +0x0c,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x2d,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0x08,0x00,0x00,0x00, +0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x2a,0x01,0x00,0x00, +0x00,0x01,0x00,0x00,0x2c,0x00,0x06,0x00,0x66,0x00,0x00,0x00, +0x2b,0x01,0x00,0x00,0x69,0x00,0x00,0x00,0x2a,0x01,0x00,0x00, +0x69,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x72,0x01,0x00,0x00,0x00,0x00,0x00,0x3f,0x36,0x00,0x05,0x00, +0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00, +0xf7,0x00,0x03,0x00,0x2c,0x01,0x00,0x00,0x00,0x00,0x00,0x00, +0xfb,0x00,0x03,0x00,0x6f,0x00,0x00,0x00,0x2d,0x01,0x00,0x00, +0xf8,0x00,0x02,0x00,0x2d,0x01,0x00,0x00,0x41,0x00,0x05,0x00, +0x6a,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x68,0x00,0x00,0x00, +0x69,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00, +0x6c,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x84,0x00,0x05,0x00, +0x07,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x6c,0x00,0x00,0x00, +0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x6a,0x00,0x00,0x00, +0x70,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x6f,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x71,0x00,0x00,0x00, +0x70,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x73,0x00,0x00,0x00, +0x74,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x42,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x75,0x00,0x00,0x00, +0x74,0x00,0x00,0x00,0xae,0x00,0x05,0x00,0x3c,0x00,0x00,0x00, +0x76,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x75,0x00,0x00,0x00, +0xf7,0x00,0x03,0x00,0x78,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0xfa,0x00,0x04,0x00,0x76,0x00,0x00,0x00,0x77,0x00,0x00,0x00, +0x78,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x77,0x00,0x00,0x00, +0xf9,0x00,0x02,0x00,0x2c,0x01,0x00,0x00,0xf8,0x00,0x02,0x00, +0x78,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x73,0x00,0x00,0x00, +0x7c,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x45,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x7d,0x00,0x00,0x00, +0x7c,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00, +0x7e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x7d,0x00,0x00,0x00, +0x89,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x83,0x00,0x00,0x00, +0x6d,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0xac,0x00,0x05,0x00, +0x3c,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x7e,0x00,0x00,0x00, +0x6f,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x87,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x85,0x00,0x00,0x00, +0x86,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0xf8,0x00,0x02,0x00, +0x86,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00, +0x8c,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0x75,0x00,0x00,0x00, +0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x90,0x00,0x00,0x00, 0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00, -0x07,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xb0,0x00,0x00,0x00, -0xb4,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00, -0xb7,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x17,0x00,0x00,0x00, -0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xb8,0x00,0x00,0x00, -0xb5,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0x41,0x00,0x05,0x00, -0x73,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x2c,0x00,0x00,0x00, -0xbb,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00, -0xbd,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x86,0x00,0x05,0x00, -0x07,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,0x71,0x00,0x00,0x00, -0xbd,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00, -0xc1,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0xc0,0x00,0x00,0x00, -0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc2,0x00,0x00,0x00, -0xc1,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00, -0xc4,0x00,0x00,0x00,0x83,0x00,0x00,0x00,0x70,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0x7e,0x00,0x00,0x00, -0x7f,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x56,0x01,0x00,0x00, -0xc7,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00, -0xc8,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00, -0xc2,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0x56,0x01,0x00,0x00, -0x41,0x00,0x06,0x00,0xd0,0x00,0x00,0x00,0xd1,0x00,0x00,0x00, -0xce,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0xbe,0x00,0x00,0x00, -0x3d,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0xd2,0x00,0x00,0x00, -0xd1,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x06,0x00,0x00,0x00, -0xd5,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0x41,0x00,0x05,0x00, -0x2f,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,0x2c,0x00,0x00,0x00, -0x33,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00, -0xd7,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,0x85,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0xd5,0x00,0x00,0x00, -0xd7,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00, -0xda,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0xd9,0x00,0x00,0x00, -0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xdb,0x00,0x00,0x00, -0xda,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00, -0xdd,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x85,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,0xdd,0x00,0x00,0x00, -0x57,0x01,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00, -0xe0,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x1a,0x00,0x00,0x00, -0xdb,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,0x85,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xd8,0x00,0x00,0x00, -0xe0,0x00,0x00,0x00,0x6d,0x00,0x04,0x00,0x07,0x00,0x00,0x00, -0xe4,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,0x41,0x00,0x05,0x00, -0x2f,0x00,0x00,0x00,0x1d,0x01,0x00,0x00,0x2c,0x00,0x00,0x00, +0x07,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x8c,0x00,0x00,0x00, +0x90,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00, +0x93,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x83,0x00,0x00,0x00, +0x41,0x00,0x06,0x00,0xa1,0x00,0x00,0x00,0xa2,0x00,0x00,0x00, +0x9e,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x93,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x94,0x00,0x00,0x00,0xa3,0x00,0x00,0x00, +0xa2,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xa1,0x00,0x00,0x00, +0xa4,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x42,0x00,0x00,0x00, +0x93,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xa4,0x00,0x00,0x00, +0xa3,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00, +0xa6,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x69,0x00,0x00,0x00, +0x41,0x00,0x06,0x00,0xa1,0x00,0x00,0x00,0xa9,0x00,0x00,0x00, +0x9e,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0xa6,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x94,0x00,0x00,0x00,0xaa,0x00,0x00,0x00, +0xa9,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xa1,0x00,0x00,0x00, +0xab,0x00,0x00,0x00,0x98,0x00,0x00,0x00,0x42,0x00,0x00,0x00, +0xa6,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xab,0x00,0x00,0x00, +0xaa,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x2c,0x01,0x00,0x00, +0xf8,0x00,0x02,0x00,0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00, +0x07,0x00,0x00,0x00,0xb1,0x00,0x00,0x00,0x71,0x00,0x00,0x00, +0x75,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00, +0xb5,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00, +0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xb6,0x00,0x00,0x00, +0xb1,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0x86,0x00,0x05,0x00, +0x07,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0x83,0x00,0x00,0x00, +0x17,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00, +0xb9,0x00,0x00,0x00,0xb6,0x00,0x00,0x00,0xb8,0x00,0x00,0x00, +0x41,0x00,0x05,0x00,0x73,0x00,0x00,0x00,0xbd,0x00,0x00,0x00, +0x2c,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, +0x07,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,0xbd,0x00,0x00,0x00, +0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xbf,0x00,0x00,0x00, +0x71,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,0x41,0x00,0x05,0x00, +0x2f,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x2c,0x00,0x00,0x00, +0xc1,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0xc3,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x70,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x83,0x00,0x00,0x00, +0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc8,0x00,0x00,0x00, +0x7e,0x00,0x00,0x00,0x7f,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x71,0x01,0x00,0x00,0xc8,0x00,0x00,0x00,0x0c,0x00,0x08,0x00, +0x06,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,0x01,0x00,0x00,0x00, +0x32,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0xc5,0x00,0x00,0x00, +0x71,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0xd1,0x00,0x00,0x00, +0xd2,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,0x42,0x00,0x00,0x00, +0xbf,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x2d,0x00,0x00,0x00, +0xd3,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0x41,0x00,0x05,0x00, +0x73,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,0x2c,0x00,0x00,0x00, +0xd5,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00, +0xd7,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,0xab,0x00,0x05,0x00, +0x3c,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0xd7,0x00,0x00,0x00, +0x6f,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0xdb,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd8,0x00,0x00,0x00, +0xda,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0xf8,0x00,0x02,0x00, +0xda,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xe2,0x00,0x00,0x00, +0xe3,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,0x42,0x00,0x00,0x00, +0xb8,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0xe4,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0xf9,0x00,0x02,0x00, +0xdb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xe5,0x00,0x00,0x00, +0xf9,0x00,0x02,0x00,0xdb,0x00,0x00,0x00,0xf8,0x00,0x02,0x00, +0xdb,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00, +0x6e,0x01,0x00,0x00,0xe4,0x00,0x00,0x00,0xda,0x00,0x00,0x00, +0x1f,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x6f,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xd3,0x00,0x00,0x00, +0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,0xea,0x00,0x00,0x00, +0x2c,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0xea,0x00,0x00,0x00, +0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xec,0x00,0x00,0x00, +0xe9,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0x41,0x00,0x05,0x00, +0x2f,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x2c,0x00,0x00,0x00, +0xed,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0xef,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x70,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0x6d,0x00,0x00,0x00, +0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00, +0xf1,0x00,0x00,0x00,0x72,0x01,0x00,0x00,0x0c,0x00,0x07,0x00, +0x06,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,0x01,0x00,0x00,0x00, +0x1a,0x00,0x00,0x00,0xef,0x00,0x00,0x00,0xf3,0x00,0x00,0x00, +0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf5,0x00,0x00,0x00, +0xec,0x00,0x00,0x00,0xf4,0x00,0x00,0x00,0x88,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,0xf5,0x00,0x00,0x00, +0x6e,0x01,0x00,0x00,0x6d,0x00,0x04,0x00,0x07,0x00,0x00,0x00, +0xfa,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,0x41,0x00,0x05,0x00, +0x2f,0x00,0x00,0x00,0x37,0x01,0x00,0x00,0x2c,0x00,0x00,0x00, 0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00, -0x1e,0x01,0x00,0x00,0x1d,0x01,0x00,0x00,0x85,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0x21,0x01,0x00,0x00,0xd7,0x00,0x00,0x00, -0xe1,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00, -0x23,0x01,0x00,0x00,0x2c,0x00,0x00,0x00,0x39,0x00,0x00,0x00, -0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x24,0x01,0x00,0x00, -0x23,0x01,0x00,0x00,0xb7,0x00,0x05,0x00,0x3c,0x00,0x00,0x00, -0x25,0x01,0x00,0x00,0x24,0x01,0x00,0x00,0x20,0x00,0x00,0x00, -0xf7,0x00,0x03,0x00,0x3e,0x01,0x00,0x00,0x00,0x00,0x00,0x00, -0xfa,0x00,0x04,0x00,0x25,0x01,0x00,0x00,0x26,0x01,0x00,0x00, -0x3e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x26,0x01,0x00,0x00, -0x41,0x00,0x06,0x00,0x2f,0x00,0x00,0x00,0x27,0x01,0x00,0x00, +0x38,0x01,0x00,0x00,0x37,0x01,0x00,0x00,0x85,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x3b,0x01,0x00,0x00,0xeb,0x00,0x00,0x00, +0xf7,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00, +0x3d,0x01,0x00,0x00,0x2c,0x00,0x00,0x00,0x39,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3e,0x01,0x00,0x00, +0x3d,0x01,0x00,0x00,0xb7,0x00,0x05,0x00,0x3c,0x00,0x00,0x00, +0x3f,0x01,0x00,0x00,0x3e,0x01,0x00,0x00,0x20,0x00,0x00,0x00, +0xf7,0x00,0x03,0x00,0x58,0x01,0x00,0x00,0x00,0x00,0x00,0x00, +0xfa,0x00,0x04,0x00,0x3f,0x01,0x00,0x00,0x40,0x01,0x00,0x00, +0x58,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x40,0x01,0x00,0x00, +0x41,0x00,0x06,0x00,0x2f,0x00,0x00,0x00,0x41,0x01,0x00,0x00, 0x2c,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x42,0x00,0x00,0x00, -0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x28,0x01,0x00,0x00, -0x27,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x2f,0x00,0x00,0x00, -0x29,0x01,0x00,0x00,0x2c,0x00,0x00,0x00,0x41,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x42,0x01,0x00,0x00, +0x41,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x2f,0x00,0x00,0x00, +0x43,0x01,0x00,0x00,0x2c,0x00,0x00,0x00,0x41,0x00,0x00,0x00, 0x45,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00, -0x2a,0x01,0x00,0x00,0x29,0x01,0x00,0x00,0x86,0x00,0x05,0x00, -0x07,0x00,0x00,0x00,0x4a,0x01,0x00,0x00,0xe4,0x00,0x00,0x00, +0x44,0x01,0x00,0x00,0x43,0x01,0x00,0x00,0x86,0x00,0x05,0x00, +0x07,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0xfa,0x00,0x00,0x00, 0x17,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00, -0x4b,0x01,0x00,0x00,0x4a,0x01,0x00,0x00,0x83,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0x4c,0x01,0x00,0x00,0x4b,0x01,0x00,0x00, -0x28,0x01,0x00,0x00,0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0x4d,0x01,0x00,0x00,0x2a,0x01,0x00,0x00,0x28,0x01,0x00,0x00, -0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x4e,0x01,0x00,0x00, +0x65,0x01,0x00,0x00,0x64,0x01,0x00,0x00,0x83,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x66,0x01,0x00,0x00,0x65,0x01,0x00,0x00, +0x42,0x01,0x00,0x00,0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00, +0x67,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x42,0x01,0x00,0x00, +0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x68,0x01,0x00,0x00, 0x01,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x1b,0x00,0x00,0x00, -0x4d,0x01,0x00,0x00,0x88,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0x4f,0x01,0x00,0x00,0x4c,0x01,0x00,0x00,0x4e,0x01,0x00,0x00, -0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x51,0x01,0x00,0x00, +0x67,0x01,0x00,0x00,0x88,0x00,0x05,0x00,0x06,0x00,0x00,0x00, +0x69,0x01,0x00,0x00,0x66,0x01,0x00,0x00,0x68,0x01,0x00,0x00, +0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x6b,0x01,0x00,0x00, 0x01,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x20,0x00,0x00,0x00, -0x4f,0x01,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00, -0x52,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x25,0x00,0x00,0x00, -0x1f,0x00,0x00,0x00,0x51,0x01,0x00,0x00,0x83,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0x53,0x01,0x00,0x00,0x1f,0x00,0x00,0x00, -0x52,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0x2e,0x01,0x00,0x00,0x53,0x01,0x00,0x00,0x24,0x01,0x00,0x00, -0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x58,0x01,0x00,0x00, -0x52,0x01,0x00,0x00,0x1f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00, -0x06,0x00,0x00,0x00,0x31,0x01,0x00,0x00,0x01,0x00,0x00,0x00, -0x32,0x00,0x00,0x00,0x58,0x01,0x00,0x00,0x24,0x01,0x00,0x00, +0x69,0x01,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00, +0x6c,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x25,0x00,0x00,0x00, +0x1f,0x00,0x00,0x00,0x6b,0x01,0x00,0x00,0x83,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x6d,0x01,0x00,0x00,0x1f,0x00,0x00,0x00, +0x6c,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00, +0x48,0x01,0x00,0x00,0x6d,0x01,0x00,0x00,0x3e,0x01,0x00,0x00, +0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x73,0x01,0x00,0x00, +0x6c,0x01,0x00,0x00,0x1f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00, +0x06,0x00,0x00,0x00,0x4b,0x01,0x00,0x00,0x01,0x00,0x00,0x00, +0x32,0x00,0x00,0x00,0x73,0x01,0x00,0x00,0x3e,0x01,0x00,0x00, 0x1f,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0x34,0x01,0x00,0x00,0xe1,0x00,0x00,0x00,0x2e,0x01,0x00,0x00, -0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,0x35,0x01,0x00,0x00, -0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x21,0x01,0x00,0x00, -0x31,0x01,0x00,0x00,0x34,0x01,0x00,0x00,0x88,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0x38,0x01,0x00,0x00,0x1f,0x00,0x00,0x00, -0xd7,0x00,0x00,0x00,0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00, -0x39,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x1c,0x00,0x00,0x00, -0x38,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00, -0x3b,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00, -0x53,0x00,0x00,0x00,0x39,0x01,0x00,0x00,0x1f,0x00,0x00,0x00, -0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3d,0x01,0x00,0x00, -0x1e,0x01,0x00,0x00,0x3b,0x01,0x00,0x00,0xf9,0x00,0x02,0x00, -0x3e,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x3e,0x01,0x00,0x00, -0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x55,0x01,0x00,0x00, -0x1e,0x01,0x00,0x00,0x87,0x00,0x00,0x00,0x3d,0x01,0x00,0x00, -0x26,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00, -0x54,0x01,0x00,0x00,0x21,0x01,0x00,0x00,0x87,0x00,0x00,0x00, -0x35,0x01,0x00,0x00,0x26,0x01,0x00,0x00,0x0c,0x00,0x06,0x00, -0x06,0x00,0x00,0x00,0x40,0x01,0x00,0x00,0x01,0x00,0x00,0x00, -0x0e,0x00,0x00,0x00,0x54,0x01,0x00,0x00,0x85,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0x42,0x01,0x00,0x00,0x40,0x01,0x00,0x00, -0x55,0x01,0x00,0x00,0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00, -0x44,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x0d,0x00,0x00,0x00, -0x54,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0x46,0x01,0x00,0x00,0x44,0x01,0x00,0x00,0x55,0x01,0x00,0x00, -0x41,0x00,0x06,0x00,0xa0,0x00,0x00,0x00,0xef,0x00,0x00,0x00, +0x4e,0x01,0x00,0x00,0xf7,0x00,0x00,0x00,0x48,0x01,0x00,0x00, +0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,0x4f,0x01,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x3b,0x01,0x00,0x00, +0x4b,0x01,0x00,0x00,0x4e,0x01,0x00,0x00,0x88,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x52,0x01,0x00,0x00,0x1f,0x00,0x00,0x00, +0xeb,0x00,0x00,0x00,0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00, +0x53,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x1c,0x00,0x00,0x00, +0x52,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00, +0x55,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00, +0x53,0x00,0x00,0x00,0x53,0x01,0x00,0x00,0x1f,0x00,0x00,0x00, +0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x57,0x01,0x00,0x00, +0x38,0x01,0x00,0x00,0x55,0x01,0x00,0x00,0xf9,0x00,0x02,0x00, +0x58,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x58,0x01,0x00,0x00, +0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x70,0x01,0x00,0x00, +0x38,0x01,0x00,0x00,0xdb,0x00,0x00,0x00,0x57,0x01,0x00,0x00, +0x40,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00, +0x6f,0x01,0x00,0x00,0x3b,0x01,0x00,0x00,0xdb,0x00,0x00,0x00, +0x4f,0x01,0x00,0x00,0x40,0x01,0x00,0x00,0x0c,0x00,0x06,0x00, +0x06,0x00,0x00,0x00,0x5a,0x01,0x00,0x00,0x01,0x00,0x00,0x00, +0x0e,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,0x85,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,0x5a,0x01,0x00,0x00, +0x70,0x01,0x00,0x00,0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00, +0x5e,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x0d,0x00,0x00,0x00, +0x6f,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00, +0x60,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,0x70,0x01,0x00,0x00, +0x41,0x00,0x06,0x00,0xa1,0x00,0x00,0x00,0x05,0x01,0x00,0x00, +0x9e,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0xb9,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x94,0x00,0x00,0x00,0x06,0x01,0x00,0x00, +0x05,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x07,0x01,0x00,0x00,0x06,0x01,0x00,0x00,0x86,0x00,0x05,0x00, +0x07,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,0x7d,0x00,0x00,0x00, +0x17,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00, +0x0d,0x01,0x00,0x00,0xb9,0x00,0x00,0x00,0x0c,0x01,0x00,0x00, +0x41,0x00,0x06,0x00,0xa1,0x00,0x00,0x00,0x0e,0x01,0x00,0x00, +0x9e,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x0d,0x01,0x00,0x00, +0x3d,0x00,0x04,0x00,0x94,0x00,0x00,0x00,0x0f,0x01,0x00,0x00, +0x0e,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x10,0x01,0x00,0x00,0x0f,0x01,0x00,0x00,0x85,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x18,0x01,0x00,0x00,0x10,0x01,0x00,0x00, +0x60,0x01,0x00,0x00,0x7f,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x74,0x01,0x00,0x00,0x18,0x01,0x00,0x00,0x0c,0x00,0x08,0x00, +0x06,0x00,0x00,0x00,0x19,0x01,0x00,0x00,0x01,0x00,0x00,0x00, +0x32,0x00,0x00,0x00,0x07,0x01,0x00,0x00,0x5c,0x01,0x00,0x00, +0x74,0x01,0x00,0x00,0x73,0x00,0x04,0x00,0x94,0x00,0x00,0x00, +0x1a,0x01,0x00,0x00,0x19,0x01,0x00,0x00,0x41,0x00,0x06,0x00, +0xa1,0x00,0x00,0x00,0x1b,0x01,0x00,0x00,0x98,0x00,0x00,0x00, +0x42,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,0x3e,0x00,0x03,0x00, +0x1b,0x01,0x00,0x00,0x1a,0x01,0x00,0x00,0x85,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0x10,0x01,0x00,0x00, +0x5c,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00, +0x27,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00, +0x07,0x01,0x00,0x00,0x60,0x01,0x00,0x00,0x26,0x01,0x00,0x00, +0x73,0x00,0x04,0x00,0x94,0x00,0x00,0x00,0x28,0x01,0x00,0x00, +0x27,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0xa1,0x00,0x00,0x00, +0x29,0x01,0x00,0x00,0x98,0x00,0x00,0x00,0x42,0x00,0x00,0x00, +0x0d,0x01,0x00,0x00,0x3e,0x00,0x03,0x00,0x29,0x01,0x00,0x00, +0x28,0x01,0x00,0x00,0xf9,0x00,0x02,0x00,0x2c,0x01,0x00,0x00, +0xf8,0x00,0x02,0x00,0x2c,0x01,0x00,0x00,0xfd,0x00,0x01,0x00, +0x38,0x00,0x01,0x00, +}; +const uint64_t rope_neox_f16_len = 4300; + +unsigned char rope_neox_f32_data[] = { +0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00, +0x6f,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00, +0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00, +0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30, +0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00, +0x01,0x00,0x00,0x00,0x0f,0x00,0x0b,0x00,0x05,0x00,0x00,0x00, +0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00, +0x2c,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x97,0x00,0x00,0x00, +0x9d,0x00,0x00,0x00,0xce,0x00,0x00,0x00,0xde,0x00,0x00,0x00, +0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00, +0x01,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x01,0x00,0x00,0x00, +0x47,0x00,0x04,0x00,0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00, +0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x01,0x00,0x00,0x00, +0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00, +0x2a,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00, +0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00, +0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, +0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x04,0x00,0x00,0x00, +0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x48,0x00,0x05,0x00, +0x2a,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x23,0x00,0x00,0x00, +0x14,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00, +0x06,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x18,0x00,0x00,0x00, +0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x07,0x00,0x00,0x00, +0x23,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00, +0x2a,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x23,0x00,0x00,0x00, +0x2c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00, +0x09,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x30,0x00,0x00,0x00, +0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x0a,0x00,0x00,0x00, +0x23,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x47,0x00,0x03,0x00, +0x2a,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00, +0x68,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00, +0x47,0x00,0x04,0x00,0x94,0x00,0x00,0x00,0x06,0x00,0x00,0x00, +0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x95,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x48,0x00,0x05,0x00, +0x95,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x95,0x00,0x00,0x00, +0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x97,0x00,0x00,0x00, +0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00, +0x97,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x03,0x00,0x00,0x00, +0x47,0x00,0x04,0x00,0x9a,0x00,0x00,0x00,0x06,0x00,0x00,0x00, +0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0x9b,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00, +0x9b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0x9b,0x00,0x00,0x00, +0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x9d,0x00,0x00,0x00, +0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00, +0x9d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x47,0x00,0x04,0x00,0xcb,0x00,0x00,0x00,0x06,0x00,0x00,0x00, +0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xcc,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00, +0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xcc,0x00,0x00,0x00, +0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xce,0x00,0x00,0x00, +0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00, +0xce,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00, +0x47,0x00,0x04,0x00,0xdb,0x00,0x00,0x00,0x06,0x00,0x00,0x00, +0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,0xdc,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x48,0x00,0x05,0x00, +0xdc,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,0xdc,0x00,0x00,0x00, +0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0xde,0x00,0x00,0x00, +0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00, +0xde,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00, +0x47,0x00,0x04,0x00,0x25,0x01,0x00,0x00,0x0b,0x00,0x00,0x00, +0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,0x02,0x00,0x00,0x00, +0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,0x02,0x00,0x00,0x00, +0x16,0x00,0x03,0x00,0x06,0x00,0x00,0x00,0x20,0x00,0x00,0x00, +0x15,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x20,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00, +0x17,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x6f,0x12,0x83,0x3a, +0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x1f,0x00,0x00,0x00, +0x00,0x00,0x80,0x3f,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x20,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x07,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x04,0x00,0x00,0x00, +0x1c,0x00,0x04,0x00,0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00, +0x28,0x00,0x00,0x00,0x1e,0x00,0x0d,0x00,0x2a,0x00,0x00,0x00, +0x07,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x06,0x00,0x00,0x00, +0x07,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00, +0x06,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00, +0x06,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x20,0x00,0x04,0x00, +0x2b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x2a,0x00,0x00,0x00, +0x3b,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,0x2c,0x00,0x00,0x00, +0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x2d,0x00,0x00,0x00, +0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x2d,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,0x06,0x00,0x00,0x00, +0x20,0x00,0x04,0x00,0x2f,0x00,0x00,0x00,0x09,0x00,0x00,0x00, +0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00, +0x33,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x2d,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x05,0x00,0x00,0x00, +0x14,0x00,0x02,0x00,0x3c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x2d,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x07,0x00,0x00,0x00, +0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0x42,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00, +0x45,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0x53,0x00,0x00,0x00,0xcd,0xcc,0xcc,0x3d, +0x17,0x00,0x04,0x00,0x66,0x00,0x00,0x00,0x07,0x00,0x00,0x00, +0x03,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x67,0x00,0x00,0x00, +0x01,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x3b,0x00,0x04,0x00, +0x67,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x01,0x00,0x00,0x00, +0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x69,0x00,0x00,0x00, +0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x6a,0x00,0x00,0x00, +0x01,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x07,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x20,0x00,0x04,0x00,0x73,0x00,0x00,0x00,0x09,0x00,0x00,0x00, +0x07,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x94,0x00,0x00,0x00, +0x06,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x95,0x00,0x00,0x00, +0x94,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x96,0x00,0x00,0x00, +0x0c,0x00,0x00,0x00,0x95,0x00,0x00,0x00,0x3b,0x00,0x04,0x00, +0x96,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, +0x1d,0x00,0x03,0x00,0x9a,0x00,0x00,0x00,0x06,0x00,0x00,0x00, +0x1e,0x00,0x03,0x00,0x9b,0x00,0x00,0x00,0x9a,0x00,0x00,0x00, +0x20,0x00,0x04,0x00,0x9c,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, +0x9b,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x9c,0x00,0x00,0x00, +0x9d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00, +0xa0,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x06,0x00,0x00,0x00, +0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00,0xbb,0x00,0x00,0x00, +0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2d,0x00,0x00,0x00, +0xc0,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x1d,0x00,0x03,0x00, +0xcb,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x1e,0x00,0x03,0x00, +0xcc,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,0x20,0x00,0x04,0x00, +0xcd,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0xcc,0x00,0x00,0x00, +0x3b,0x00,0x04,0x00,0xcd,0x00,0x00,0x00,0xce,0x00,0x00,0x00, +0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0xd0,0x00,0x00,0x00, +0x0c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x2d,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0x0a,0x00,0x00,0x00, +0x1d,0x00,0x03,0x00,0xdb,0x00,0x00,0x00,0x06,0x00,0x00,0x00, +0x1e,0x00,0x03,0x00,0xdc,0x00,0x00,0x00,0xdb,0x00,0x00,0x00, +0x20,0x00,0x04,0x00,0xdd,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, +0xdc,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0xdd,0x00,0x00,0x00, +0xde,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x2d,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0x08,0x00,0x00,0x00, +0x2b,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x24,0x01,0x00,0x00, +0x00,0x01,0x00,0x00,0x2c,0x00,0x06,0x00,0x66,0x00,0x00,0x00, +0x25,0x01,0x00,0x00,0x69,0x00,0x00,0x00,0x24,0x01,0x00,0x00, +0x69,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x6c,0x01,0x00,0x00,0x00,0x00,0x00,0x3f,0x36,0x00,0x05,0x00, +0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00, +0xf7,0x00,0x03,0x00,0x26,0x01,0x00,0x00,0x00,0x00,0x00,0x00, +0xfb,0x00,0x03,0x00,0x6f,0x00,0x00,0x00,0x27,0x01,0x00,0x00, +0xf8,0x00,0x02,0x00,0x27,0x01,0x00,0x00,0x41,0x00,0x05,0x00, +0x6a,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x68,0x00,0x00,0x00, +0x69,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00, +0x6c,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x84,0x00,0x05,0x00, +0x07,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x6c,0x00,0x00,0x00, +0x17,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x6a,0x00,0x00,0x00, +0x70,0x00,0x00,0x00,0x68,0x00,0x00,0x00,0x6f,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x71,0x00,0x00,0x00, +0x70,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x73,0x00,0x00,0x00, +0x74,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x42,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x75,0x00,0x00,0x00, +0x74,0x00,0x00,0x00,0xae,0x00,0x05,0x00,0x3c,0x00,0x00,0x00, +0x76,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x75,0x00,0x00,0x00, +0xf7,0x00,0x03,0x00,0x78,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0xfa,0x00,0x04,0x00,0x76,0x00,0x00,0x00,0x77,0x00,0x00,0x00, +0x78,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x77,0x00,0x00,0x00, +0xf9,0x00,0x02,0x00,0x26,0x01,0x00,0x00,0xf8,0x00,0x02,0x00, +0x78,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x73,0x00,0x00,0x00, +0x7c,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x45,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00,0x7d,0x00,0x00,0x00, +0x7c,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00, +0x7e,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x7d,0x00,0x00,0x00, +0x89,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x83,0x00,0x00,0x00, +0x6d,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0xac,0x00,0x05,0x00, +0x3c,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x7e,0x00,0x00,0x00, +0x6f,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x87,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x85,0x00,0x00,0x00, +0x86,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0xf8,0x00,0x02,0x00, +0x86,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00, +0x8c,0x00,0x00,0x00,0x71,0x00,0x00,0x00,0x75,0x00,0x00,0x00, +0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x90,0x00,0x00,0x00, +0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x80,0x00,0x05,0x00, +0x07,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x8c,0x00,0x00,0x00, +0x90,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00, +0x93,0x00,0x00,0x00,0x91,0x00,0x00,0x00,0x83,0x00,0x00,0x00, +0x41,0x00,0x06,0x00,0xa0,0x00,0x00,0x00,0xa1,0x00,0x00,0x00, +0x9d,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x93,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00, +0xa1,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xa0,0x00,0x00,0x00, +0xa3,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x42,0x00,0x00,0x00, +0x93,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xa3,0x00,0x00,0x00, +0xa2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00, +0xa5,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x69,0x00,0x00,0x00, +0x41,0x00,0x06,0x00,0xa0,0x00,0x00,0x00,0xa8,0x00,0x00,0x00, +0x9d,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0xa5,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xa9,0x00,0x00,0x00, +0xa8,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xa0,0x00,0x00,0x00, +0xaa,0x00,0x00,0x00,0x97,0x00,0x00,0x00,0x42,0x00,0x00,0x00, +0xa5,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0xaa,0x00,0x00,0x00, +0xa9,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x26,0x01,0x00,0x00, +0xf8,0x00,0x02,0x00,0x87,0x00,0x00,0x00,0x84,0x00,0x05,0x00, +0x07,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x71,0x00,0x00,0x00, +0x75,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x07,0x00,0x00,0x00, +0xb4,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,0x7d,0x00,0x00,0x00, +0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xb5,0x00,0x00,0x00, +0xb0,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,0x86,0x00,0x05,0x00, +0x07,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,0x83,0x00,0x00,0x00, +0x17,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00, +0xb8,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xb7,0x00,0x00,0x00, +0x41,0x00,0x05,0x00,0x73,0x00,0x00,0x00,0xbc,0x00,0x00,0x00, +0x2c,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, +0x07,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0xbc,0x00,0x00,0x00, +0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xbe,0x00,0x00,0x00, +0x71,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0x41,0x00,0x05,0x00, +0x2f,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x2c,0x00,0x00,0x00, +0xc0,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0xc2,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,0x70,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0x83,0x00,0x00,0x00, +0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xc7,0x00,0x00,0x00, +0x7e,0x00,0x00,0x00,0x7f,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x6b,0x01,0x00,0x00,0xc7,0x00,0x00,0x00,0x0c,0x00,0x08,0x00, +0x06,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,0x01,0x00,0x00,0x00, +0x32,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0xc4,0x00,0x00,0x00, +0x6b,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0xd0,0x00,0x00,0x00, +0xd1,0x00,0x00,0x00,0xce,0x00,0x00,0x00,0x42,0x00,0x00,0x00, +0xbe,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x2d,0x00,0x00,0x00, +0xd2,0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0x41,0x00,0x05,0x00, +0x73,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0x2c,0x00,0x00,0x00, +0xd4,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x07,0x00,0x00,0x00, +0xd6,0x00,0x00,0x00,0xd5,0x00,0x00,0x00,0xab,0x00,0x05,0x00, +0x3c,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,0xd6,0x00,0x00,0x00, +0x6f,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0xda,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0xd7,0x00,0x00,0x00, +0xd9,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0xf8,0x00,0x02,0x00, +0xd9,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0xa0,0x00,0x00,0x00, +0xe1,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x42,0x00,0x00,0x00, +0xb7,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0xf9,0x00,0x02,0x00, +0xda,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0xe3,0x00,0x00,0x00, +0xf9,0x00,0x02,0x00,0xda,0x00,0x00,0x00,0xf8,0x00,0x02,0x00, +0xda,0x00,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00, +0x68,0x01,0x00,0x00,0xe2,0x00,0x00,0x00,0xd9,0x00,0x00,0x00, +0x1f,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0x6f,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,0xd2,0x00,0x00,0x00, +0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00,0xe8,0x00,0x00,0x00, +0x2c,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xe8,0x00,0x00,0x00, +0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xea,0x00,0x00,0x00, +0xe7,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0x41,0x00,0x05,0x00, +0x2f,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x2c,0x00,0x00,0x00, +0xeb,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0xed,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x70,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0xef,0x00,0x00,0x00,0x6d,0x00,0x00,0x00, +0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf1,0x00,0x00,0x00, +0xef,0x00,0x00,0x00,0x6c,0x01,0x00,0x00,0x0c,0x00,0x07,0x00, +0x06,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0x01,0x00,0x00,0x00, +0x1a,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0xf1,0x00,0x00,0x00, +0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00, +0xea,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0x88,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0xf3,0x00,0x00,0x00, +0x68,0x01,0x00,0x00,0x6d,0x00,0x04,0x00,0x07,0x00,0x00,0x00, +0xf8,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,0x41,0x00,0x05,0x00, +0x2f,0x00,0x00,0x00,0x31,0x01,0x00,0x00,0x2c,0x00,0x00,0x00, +0x2e,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x32,0x01,0x00,0x00,0x31,0x01,0x00,0x00,0x85,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x35,0x01,0x00,0x00,0xe9,0x00,0x00,0x00, +0xf5,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x2f,0x00,0x00,0x00, +0x37,0x01,0x00,0x00,0x2c,0x00,0x00,0x00,0x39,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x38,0x01,0x00,0x00, +0x37,0x01,0x00,0x00,0xb7,0x00,0x05,0x00,0x3c,0x00,0x00,0x00, +0x39,0x01,0x00,0x00,0x38,0x01,0x00,0x00,0x20,0x00,0x00,0x00, +0xf7,0x00,0x03,0x00,0x52,0x01,0x00,0x00,0x00,0x00,0x00,0x00, +0xfa,0x00,0x04,0x00,0x39,0x01,0x00,0x00,0x3a,0x01,0x00,0x00, +0x52,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x3a,0x01,0x00,0x00, +0x41,0x00,0x06,0x00,0x2f,0x00,0x00,0x00,0x3b,0x01,0x00,0x00, +0x2c,0x00,0x00,0x00,0x41,0x00,0x00,0x00,0x42,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3c,0x01,0x00,0x00, +0x3b,0x01,0x00,0x00,0x41,0x00,0x06,0x00,0x2f,0x00,0x00,0x00, +0x3d,0x01,0x00,0x00,0x2c,0x00,0x00,0x00,0x41,0x00,0x00,0x00, +0x45,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x3e,0x01,0x00,0x00,0x3d,0x01,0x00,0x00,0x86,0x00,0x05,0x00, +0x07,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0xf8,0x00,0x00,0x00, +0x17,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x5f,0x01,0x00,0x00,0x5e,0x01,0x00,0x00,0x83,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x60,0x01,0x00,0x00,0x5f,0x01,0x00,0x00, +0x3c,0x01,0x00,0x00,0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00, +0x61,0x01,0x00,0x00,0x3e,0x01,0x00,0x00,0x3c,0x01,0x00,0x00, +0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x62,0x01,0x00,0x00, +0x01,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x1b,0x00,0x00,0x00, +0x61,0x01,0x00,0x00,0x88,0x00,0x05,0x00,0x06,0x00,0x00,0x00, +0x63,0x01,0x00,0x00,0x60,0x01,0x00,0x00,0x62,0x01,0x00,0x00, +0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x65,0x01,0x00,0x00, +0x01,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x20,0x00,0x00,0x00, +0x63,0x01,0x00,0x00,0x0c,0x00,0x07,0x00,0x06,0x00,0x00,0x00, +0x66,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x25,0x00,0x00,0x00, +0x1f,0x00,0x00,0x00,0x65,0x01,0x00,0x00,0x83,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0x1f,0x00,0x00,0x00, +0x66,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00, +0x42,0x01,0x00,0x00,0x67,0x01,0x00,0x00,0x38,0x01,0x00,0x00, +0x83,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6d,0x01,0x00,0x00, +0x66,0x01,0x00,0x00,0x1f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00, +0x06,0x00,0x00,0x00,0x45,0x01,0x00,0x00,0x01,0x00,0x00,0x00, +0x32,0x00,0x00,0x00,0x6d,0x01,0x00,0x00,0x38,0x01,0x00,0x00, +0x1f,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00, +0x48,0x01,0x00,0x00,0xf5,0x00,0x00,0x00,0x42,0x01,0x00,0x00, +0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,0x49,0x01,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x35,0x01,0x00,0x00, +0x45,0x01,0x00,0x00,0x48,0x01,0x00,0x00,0x88,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x4c,0x01,0x00,0x00,0x1f,0x00,0x00,0x00, +0xe9,0x00,0x00,0x00,0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00, +0x4d,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x1c,0x00,0x00,0x00, +0x4c,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00, +0x4f,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00, +0x53,0x00,0x00,0x00,0x4d,0x01,0x00,0x00,0x1f,0x00,0x00,0x00, +0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x51,0x01,0x00,0x00, +0x32,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,0xf9,0x00,0x02,0x00, +0x52,0x01,0x00,0x00,0xf8,0x00,0x02,0x00,0x52,0x01,0x00,0x00, +0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00,0x6a,0x01,0x00,0x00, +0x32,0x01,0x00,0x00,0xda,0x00,0x00,0x00,0x51,0x01,0x00,0x00, +0x3a,0x01,0x00,0x00,0xf5,0x00,0x07,0x00,0x06,0x00,0x00,0x00, +0x69,0x01,0x00,0x00,0x35,0x01,0x00,0x00,0xda,0x00,0x00,0x00, +0x49,0x01,0x00,0x00,0x3a,0x01,0x00,0x00,0x0c,0x00,0x06,0x00, +0x06,0x00,0x00,0x00,0x54,0x01,0x00,0x00,0x01,0x00,0x00,0x00, +0x0e,0x00,0x00,0x00,0x69,0x01,0x00,0x00,0x85,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x56,0x01,0x00,0x00,0x54,0x01,0x00,0x00, +0x6a,0x01,0x00,0x00,0x0c,0x00,0x06,0x00,0x06,0x00,0x00,0x00, +0x58,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x0d,0x00,0x00,0x00, +0x69,0x01,0x00,0x00,0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00, +0x5a,0x01,0x00,0x00,0x58,0x01,0x00,0x00,0x6a,0x01,0x00,0x00, +0x41,0x00,0x06,0x00,0xa0,0x00,0x00,0x00,0x03,0x01,0x00,0x00, 0x9d,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0xb8,0x00,0x00,0x00, -0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xf0,0x00,0x00,0x00, -0xef,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00, -0xf5,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,0x17,0x00,0x00,0x00, -0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0xf6,0x00,0x00,0x00, -0xb8,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x41,0x00,0x06,0x00, -0xa0,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,0x9d,0x00,0x00,0x00, -0x42,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0xf8,0x00,0x00,0x00,0xf7,0x00,0x00,0x00, -0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x00,0x01,0x00,0x00, -0xf8,0x00,0x00,0x00,0x46,0x01,0x00,0x00,0x7f,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0x59,0x01,0x00,0x00,0x00,0x01,0x00,0x00, -0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,0x01,0x01,0x00,0x00, -0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0xf0,0x00,0x00,0x00, -0x42,0x01,0x00,0x00,0x59,0x01,0x00,0x00,0x41,0x00,0x06,0x00, -0xa0,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0x97,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x04,0x01,0x00,0x00, +0x03,0x01,0x00,0x00,0x86,0x00,0x05,0x00,0x07,0x00,0x00,0x00, +0x09,0x01,0x00,0x00,0x7d,0x00,0x00,0x00,0x17,0x00,0x00,0x00, +0x80,0x00,0x05,0x00,0x07,0x00,0x00,0x00,0x0a,0x01,0x00,0x00, +0xb8,0x00,0x00,0x00,0x09,0x01,0x00,0x00,0x41,0x00,0x06,0x00, +0xa0,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,0x9d,0x00,0x00,0x00, +0x42,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,0x3d,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0x0c,0x01,0x00,0x00,0x0b,0x01,0x00,0x00, +0x85,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x14,0x01,0x00,0x00, +0x0c,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,0x7f,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0x6e,0x01,0x00,0x00,0x14,0x01,0x00,0x00, +0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00,0x15,0x01,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x04,0x01,0x00,0x00, +0x56,0x01,0x00,0x00,0x6e,0x01,0x00,0x00,0x41,0x00,0x06,0x00, +0xa0,0x00,0x00,0x00,0x16,0x01,0x00,0x00,0x97,0x00,0x00,0x00, 0x42,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0x3e,0x00,0x03,0x00, -0x02,0x01,0x00,0x00,0x01,0x01,0x00,0x00,0x85,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,0xf8,0x00,0x00,0x00, -0x42,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00, -0x0e,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00, -0xf0,0x00,0x00,0x00,0x46,0x01,0x00,0x00,0x0d,0x01,0x00,0x00, -0x41,0x00,0x06,0x00,0xa0,0x00,0x00,0x00,0x0f,0x01,0x00,0x00, -0x97,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0xf6,0x00,0x00,0x00, -0x3e,0x00,0x03,0x00,0x0f,0x01,0x00,0x00,0x0e,0x01,0x00,0x00, -0xf9,0x00,0x02,0x00,0x12,0x01,0x00,0x00,0xf8,0x00,0x02,0x00, -0x12,0x01,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00, +0x16,0x01,0x00,0x00,0x15,0x01,0x00,0x00,0x85,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x21,0x01,0x00,0x00,0x0c,0x01,0x00,0x00, +0x56,0x01,0x00,0x00,0x0c,0x00,0x08,0x00,0x06,0x00,0x00,0x00, +0x22,0x01,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00, +0x04,0x01,0x00,0x00,0x5a,0x01,0x00,0x00,0x21,0x01,0x00,0x00, +0x41,0x00,0x06,0x00,0xa0,0x00,0x00,0x00,0x23,0x01,0x00,0x00, +0x97,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x0a,0x01,0x00,0x00, +0x3e,0x00,0x03,0x00,0x23,0x01,0x00,0x00,0x22,0x01,0x00,0x00, +0xf9,0x00,0x02,0x00,0x26,0x01,0x00,0x00,0xf8,0x00,0x02,0x00, +0x26,0x01,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00, }; -const uint64_t rope_neox_f32_len = 3792; +const uint64_t rope_neox_f32_len = 4200; unsigned char scale_f32_data[] = { 0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00, diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 16287a280..79ce1479f 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -290,6 +290,7 @@ struct vk_op_rope_neox_push_constants { float corr_dims[4]; float theta_scale; float inv_ndims; + uint32_t has_freq_facs; }; struct vk_op_soft_max_push_constants { @@ -1522,8 +1523,8 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) { ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1); } @@ -3732,7 +3733,7 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx } -static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op) { +static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) { switch (op) { case GGML_OP_ADD: if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { @@ -3853,6 +3854,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const default: return nullptr; } + + GGML_UNUSED(src2); } static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) { @@ -3880,12 +3883,15 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { } template -static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, const PC&& pc) { +static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; if (src1 != nullptr) { std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; } + if (src2 != nullptr) { + std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3]; + } std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl; #endif GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT @@ -3896,6 +3902,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c const uint64_t ne02 = src0->ne[2]; const uint64_t ne03 = src0->ne[3]; const uint64_t ne0 = ne00 * ne01; + const bool use_src1 = src1 != nullptr; const uint64_t ne10 = use_src1 ? src1->ne[0] : 0; const uint64_t ne11 = use_src1 ? src1->ne[1] : 0; @@ -3904,7 +3911,14 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c const uint64_t ne1 = ne10 * ne11; // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0; - vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, dst, op); + const bool use_src2 = src2 != nullptr; + const uint64_t ne20 = use_src2 ? src2->ne[0] : 0; + const uint64_t ne21 = use_src2 ? src2->ne[1] : 0; + const uint64_t ne22 = use_src2 ? src2->ne[2] : 0; + const uint64_t ne23 = use_src2 ? src2->ne[3] : 0; + const uint64_t ne2 = ne20 * ne21; + + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op); ggml_vk_func_t op_func; if (pipeline == nullptr) { @@ -3927,15 +3941,18 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; + ggml_tensor_extra_gpu * extra_src2 = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr; vk_buffer d_X = nullptr; size_t x_buf_offset = 0; vk_buffer d_Y = nullptr; size_t y_buf_offset = 0; vk_buffer d_Z = nullptr; + size_t z_buf_offset = 0; bool src0_uma = false; bool src1_uma = false; + bool src2_uma = false; if (ctx->device->uma) { ggml_vk_host_get(ctx, src0->data, d_X, x_buf_offset); @@ -3944,10 +3961,15 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c ggml_vk_host_get(ctx, src1->data, d_Y, y_buf_offset); src1_uma = d_Y != nullptr; } + if (use_src2) { + ggml_vk_host_get(ctx, src2->data, d_Z, z_buf_offset); + src2_uma = d_Z != nullptr; + } } uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment); uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0; + uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0; uint64_t d_sz = ggml_type_size(dst->type) * ne0; vk_buffer d_D = extra->buffer_gpu.lock(); @@ -3970,10 +3992,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c y_buf_offset = extra_src1->offset; GGML_ASSERT(d_Y != nullptr); } + if (use_src2 && !src2_uma) { + d_Z = extra_src2->buffer_gpu.lock(); + z_buf_offset = extra_src2->offset; + GGML_ASSERT(d_Z != nullptr); + } if (op_supports_incontiguous) { x_sz = ggml_nbytes(src0); y_sz = use_src1 ? ggml_nbytes(src1) : 0; + z_sz = use_src2 ? ggml_nbytes(src2) : 0; d_sz = ggml_nbytes(dst); if (x_buf_offset + x_sz >= d_X->size) { @@ -3982,6 +4010,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c if (use_src1 && y_buf_offset + y_sz >= d_Y->size) { y_sz = VK_WHOLE_SIZE; } + if (use_src2 && z_buf_offset + z_sz >= d_Z->size) { + z_sz = VK_WHOLE_SIZE; + } if (d_buf_offset + d_sz >= d_D->size) { d_sz = VK_WHOLE_SIZE; } @@ -4021,13 +4052,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c if (use_src1 && y_sz != VK_WHOLE_SIZE) { y_sz *= ne12 * ne13; } + if (use_src2 && z_sz != VK_WHOLE_SIZE) { + z_sz *= ne22 * ne23; + } if (d_sz != VK_WHOLE_SIZE) { d_sz *= ne02 * ne03; } } if (op == GGML_OP_SOFT_MAX) { - // Empty src1 is possible on soft_max, but the shader needs a buffer + // Empty src1 is possible in soft_max, but the shader needs a buffer vk_subbuffer subbuf_y; if (use_src1) { subbuf_y = { d_Y, y_buf_offset, y_sz }; @@ -4037,6 +4071,28 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + } else if (op == GGML_OP_ROPE) { + const int mode = ((int32_t *) dst->op_params)[2]; + const bool is_neox = mode & 2; + + if (is_neox) { + // Empty src2 is possible in rope, but the shader needs a buffer + vk_subbuffer subbuf_z; + if (use_src2) { + subbuf_z = { d_Z, z_buf_offset, z_sz }; + } else { + subbuf_z = { d_X, 0, d_X->size }; + } + + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + } else { + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + } + } else if (use_src2) { + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (use_src1) { ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); @@ -4047,6 +4103,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c } else { GGML_ASSERT(op != GGML_OP_SOFT_MAX); GGML_ASSERT(op != GGML_OP_ARGSORT); + GGML_ASSERT(!use_src2); ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03); @@ -4088,7 +4145,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c } static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); } static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4096,7 +4153,7 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); - ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_GET_ROWS, { + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, @@ -4111,7 +4168,7 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, cons const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); - ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_ADD, { + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ADD, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, @@ -4126,7 +4183,7 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, cons const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); - ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_MUL, { + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_MUL, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, @@ -4141,7 +4198,7 @@ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, co const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_SCALE, { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, @@ -4154,7 +4211,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, cons const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_SQR, { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, @@ -4168,7 +4225,7 @@ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, co const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_CLAMP, { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, @@ -4183,7 +4240,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons const uint32_t dst_type_size = ggml_type_size(dst->type); const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_CPY, { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, @@ -4195,21 +4252,21 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); } static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); } static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }); } static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { int32_t * op_params = (int32_t *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }); } static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4228,7 +4285,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_SOFT_MAX, { + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, { ncols, src1 != nullptr ? nrows_y : (uint32_t)0, scale, max_bias, @@ -4237,11 +4294,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, }); } -static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { -#pragma message("TODO: implement phi3 frequency factors support") -#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225") - GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet"); - +static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; // const int n_ctx = ((int32_t *) dst->op_params)[3]; @@ -4264,12 +4317,13 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con if (is_neox) { const float theta_scale = powf(freq_base, -2.0f/n_dims); const float inv_ndims = -1.0f / n_dims; - ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, { + ggml_vk_op_f32(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], - freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims + freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims, + src2 != nullptr, }); } else { - ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, { + ggml_vk_op_f32(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f} }); @@ -4292,7 +4346,7 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, std::cerr << ((ggml_sort_order) op_params[0]) << " " << GGML_SORT_ORDER_ASC << std::endl; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_ARGSORT, { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { ncols, ncols_pad, op_params[0], @@ -5408,6 +5462,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod const ggml_tensor * src0 = node->src[0]; const ggml_tensor * src1 = node->src[1]; + const ggml_tensor * src2 = node->src[2]; switch (node->op) { case GGML_OP_UNARY: @@ -5524,7 +5579,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod break; case GGML_OP_ROPE: - ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, node); + ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node); break; case GGML_OP_ARGSORT: @@ -6500,7 +6555,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vectorop) << " gpu=" << (tensor->extra != nullptr) << " backend=" << tensor->backend << std::endl; + std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << std::endl; done.push_back(tensor); @@ -6550,7 +6605,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) { void * tensor_data = tensor->data; - if (tensor->backend == GGML_BACKEND_TYPE_GPU) { + if (ggml_backend_buffer_is_vk(tensor->buffer)) { const size_t tensor_size = ggml_nbytes(tensor); tensor_data = malloc(tensor_size); @@ -6561,12 +6616,12 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso } std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl; - std::cerr << "tensor=" << tensor << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl; + std::cerr << "tensor=" << tensor << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl; if (tensor->src[0] != nullptr) { - std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " backend=" << tensor->src[0]->backend << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl; + std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl; } if (tensor->src[1] != nullptr) { - std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " backend=" << tensor->src[1]->backend << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl; + std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl; } std::cerr << std::endl << "Result:" << std::endl; ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0); @@ -6577,43 +6632,11 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso std::vector done; ggml_vk_print_graph_origin(tensor, done); - if (tensor->backend == GGML_BACKEND_TYPE_GPU) { + if (ggml_backend_buffer_is_vk(tensor->buffer)) { free(tensor_data); } } -static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) { - return; - GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU); - if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) { - return; - } - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float val = 0.0f; - if (tensor->type == GGML_TYPE_F32) { - val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]); - } else if (tensor->type == GGML_TYPE_F16) { - val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0])); - } - if (std::isnan(val)) { - std::cerr << "ERROR: TENSOR CHECK " << name << ": Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " val=" << val << std::endl; - std::cerr << "tensor=" << tensor << " tensor->type=" << ggml_type_name(tensor->type) << " tensor->backend: " << tensor->backend << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl; - std::cerr << std::endl; - ggml_vk_print_tensor_area(tensor, tensor->data, i0, i1, i2, i3); - std::cerr << std::endl; - std::vector done; - ggml_vk_print_graph_origin(tensor, done); - GGML_ASSERT(false); - } - } - } - } - } -} - void * comp_result; size_t comp_size; size_t comp_nb[GGML_MAX_DIMS]; @@ -6637,6 +6660,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ ggml_tensor * src0 = tensor->src[0]; ggml_tensor * src1 = tensor->src[1]; + ggml_tensor * src2 = tensor->src[2]; struct ggml_init_params iparams = { /*.mem_size =*/ 1024*1024*1024, @@ -6666,10 +6690,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ src0_buffer = malloc(src0_size); src0_clone->data = src0_buffer; - if (src0->backend == GGML_BACKEND_TYPE_CPU) { + if (ggml_backend_buffer_is_host(src0->buffer)) { memcpy(src0_clone->data, src0->data, src0_size); memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS); - } else if (src0->backend == GGML_BACKEND_TYPE_GPU) { + } else if (ggml_backend_buffer_is_vk(src0->buffer)) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra; vk_buffer buffer_gpu = extra->buffer_gpu.lock(); uint64_t offset = extra->offset; @@ -6700,8 +6724,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { ggml_vk_print_tensor(ctx, src0, "src0"); } - - ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src0", src0_clone); } if (src1 != nullptr) { src1_clone = ggml_dup_tensor(ggml_ctx, src1); @@ -6710,10 +6732,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ src1_buffer = malloc(src1_size); src1_clone->data = src1_buffer; - if (src1->backend == GGML_BACKEND_TYPE_CPU) { + if (ggml_backend_buffer_is_host(src1->buffer)) { memcpy(src1_clone->data, src1->data, src1_size); memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS); - } else if (src1->backend == GGML_BACKEND_TYPE_GPU) { + } else if (ggml_backend_buffer_is_vk(src1->buffer)) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra; vk_buffer buffer_gpu = extra->buffer_gpu.lock(); uint64_t offset = extra->offset; @@ -6744,12 +6766,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { ggml_vk_print_tensor(ctx, src1, "src1"); std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl; - std::cerr << "src1_clone=" << tensor << " src1_clone->backend: " << src1_clone->backend << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl; + std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl; if (src1->src[0] != nullptr) { - std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " backend=" << src1->src[0]->backend << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl; + std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl; } if (src1->src[1] != nullptr) { - std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " backend=" << src1->src[1]->backend << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl; + std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl; } std::cerr << std::endl << "Result:" << std::endl; ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0); @@ -6760,8 +6782,64 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ std::vector done; ggml_vk_print_graph_origin(src1_clone, done); } + } + if (src2 != nullptr) { + src2_clone = ggml_dup_tensor(ggml_ctx, src2); - ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src1", src1_clone); + src2_size = ggml_nbytes(src2); + + src2_buffer = malloc(src2_size); + src2_clone->data = src2_buffer; + if (ggml_backend_buffer_is_host(src2->buffer)) { + memcpy(src2_clone->data, src2->data, src2_size); + memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS); + } else if (ggml_backend_buffer_is_vk(src2->buffer)) { + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra; + vk_buffer buffer_gpu = extra->buffer_gpu.lock(); + uint64_t offset = extra->offset; + if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) { + for (int i3 = 0; i3 < src2->ne[3]; i3++) { + for (int i2 = 0; i2 < src2->ne[2]; i2++) { + const int idx = i3*src2->ne[2] + i2; + ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]); + } + } + + src2_clone->nb[0] = src2->nb[0]; + src2_clone->nb[1] = src2->nb[1]; + for (int i = 2; i < GGML_MAX_DIMS; i++) { + src2_clone->nb[i] = src2_clone->nb[i - 1]*src2_clone->ne[i - 1]; + } + } else { + if (offset + src2_size >= buffer_gpu->size) { + src2_size = buffer_gpu->size - offset; + } + ggml_vk_buffer_read(ctx, buffer_gpu, offset, src2_clone->data, src2_size); + memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS); + } + } else { + GGML_ASSERT(false); + } + + if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { + ggml_vk_print_tensor(ctx, src2, "src2"); + std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl; + std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl; + if (src2->src[0] != nullptr) { + std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl; + } + if (src2->src[1] != nullptr) { + std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl; + } + std::cerr << std::endl << "Result:" << std::endl; + ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0); + std::cerr << std::endl; + std::cerr << std::endl << "Result:" << std::endl; + ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 1, 0); + std::cerr << std::endl; + std::vector done; + ggml_vk_print_graph_origin(src2_clone, done); + } } if (tensor->op == GGML_OP_MUL_MAT) { @@ -6799,7 +6877,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ float attn_factor = ((float *) tensor->op_params)[8]; float beta_fast = ((float *) tensor->op_params)[9]; float beta_slow = ((float *) tensor->op_params)[10]; - tensor_clone = ggml_rope_custom(ggml_ctx, src0_clone, src1_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); } else if (tensor->op == GGML_OP_UNARY) { switch (ggml_get_unary_op(tensor)) { case GGML_UNARY_OP_SILU: @@ -6847,7 +6925,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8); - ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone); if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone"); } @@ -6888,7 +6965,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ void * tensor_data = tensor->data; - if (tensor->backend == GGML_BACKEND_TYPE_GPU) { + if (ggml_backend_buffer_is_vk(tensor->buffer)) { size_t tensor_size = ggml_nbytes(tensor); tensor_data = malloc(tensor_size); @@ -6936,12 +7013,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) { std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl; - std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl; + std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl; if (src0 != nullptr) { - std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl; + std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl; } if (src1 != nullptr) { - std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl; + std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl; } std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl; std::cerr << std::endl << "Result:" << std::endl; @@ -6977,12 +7054,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl; - std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl; + std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl; if (src0 != nullptr) { - std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl; + std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl; } if (src1 != nullptr) { - std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl; + std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl; } std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl; std::cerr << std::endl << "Result:" << std::endl; @@ -7001,12 +7078,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ if (avg_err > 0.05 || std::isnan(avg_err)) { std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl; - std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl; + std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl; if (src0 != nullptr) { - std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl; + std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl; } if (src1 != nullptr) { - std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl; + std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl; } std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl; std::cerr << std::endl << "Result:" << std::endl; @@ -7018,14 +7095,14 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ ggml_vk_print_graph_origin(tensor, done); GGML_ASSERT(false); } else { - std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " backend=" << tensor->backend << " avg_err=" << avg_err << std::endl; + std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl; } free(comp_result); comp_result = nullptr; comp_size = 0; - if (tensor->backend == GGML_BACKEND_TYPE_GPU) { + if (ggml_backend_buffer_is_vk(tensor->buffer)) { free(tensor_data); } } diff --git a/ggml_vk_generate_shaders.py b/ggml_vk_generate_shaders.py index 8096c03b7..a8f7373df 100644 --- a/ggml_vk_generate_shaders.py +++ b/ggml_vk_generate_shaders.py @@ -2609,7 +2609,8 @@ layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in; layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; layout (binding = 1) readonly buffer Y {int data_b[];}; -layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; +layout (binding = 2) readonly buffer Z {float data_freq_factors[];}; +layout (binding = 3) writeonly buffer D {D_TYPE data_d[];}; layout (push_constant) uniform parameter { uint ncols; @@ -2622,6 +2623,7 @@ layout (push_constant) uniform parameter { float corr_dims[4]; float theta_scale; float inv_ndims; + uint has_freq_facs; } p; float rope_yarn_ramp(const float low, const float high, const uint i0) { @@ -2671,7 +2673,8 @@ void main() { const float cur_rot = p.inv_ndims * ic - ib; const int pos = data_b[i2]; - const float theta_base = pos*p.freq_scale*pow(p.theta_scale, col/2.0f); + const float freq_factor = p.has_freq_facs != 0 ? data_freq_factors[ic/2] : 1.0f; + const float theta_base = pos*p.freq_scale*pow(p.theta_scale, col/2.0f) / freq_factor; float cos_theta, sin_theta; rope_yarn(theta_base, uint(cur_rot), cos_theta, sin_theta); From e84b71c2c6da6e69c8f815168ea836f9716a325e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 May 2024 10:00:21 +0300 Subject: [PATCH 22/34] ggml : drop support for QK_K=64 (#7473) * ggml : drop support for QK_K=64 ggml-ci * opencl : restore QK_K=256 define --- CMakeLists.txt | 5 - Makefile | 4 - ci/run.sh | 3 +- ggml-common.h | 54 - ggml-cuda/convert.cu | 138 -- ggml-cuda/dmmv.cu | 151 --- ggml-cuda/mmq.cu | 6 - ggml-cuda/vecdotq.cuh | 126 -- ggml-metal.m | 17 - ggml-metal.metal | 398 +----- ggml-opencl.cpp | 2 +- ggml-quants.c | 2672 +------------------------------------ ggml-sycl.cpp | 472 +------ ggml.c | 12 - gguf-py/gguf/constants.py | 3 +- llama.cpp | 12 +- 16 files changed, 26 insertions(+), 4049 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c09d834fb..ef02ff669 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -124,7 +124,6 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)") option(LLAMA_KOMPUTE "llama: use Kompute" OFF) option(LLAMA_RPC "llama: use RPC" OFF) -option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) option(LLAMA_SYCL "llama: use SYCL" OFF) option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF) set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device") @@ -384,10 +383,6 @@ if (LLAMA_LLAMAFILE) set(GGML_SOURCES_LLAMAFILE sgemm.cpp) endif() -if (LLAMA_QKK_64) - add_compile_definitions(GGML_QKK_64) -endif() - if (LLAMA_CUBLAS) message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead") set(LLAMA_CUDA ON) diff --git a/Makefile b/Makefile index 6b7c853b3..fe63cbd60 100644 --- a/Makefile +++ b/Makefile @@ -389,10 +389,6 @@ else MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d endif -ifdef LLAMA_QKK_64 - MK_CPPFLAGS += -DGGML_QKK_64 -endif - ifndef LLAMA_NO_ACCELERATE # Mac OS - include Accelerate framework. # `-framework Accelerate` works both with Apple Silicon and Mac Intel diff --git a/ci/run.sh b/ci/run.sh index d5972480b..79dcd0772 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -606,7 +606,8 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then if [ -z ${GG_BUILD_CUDA} ]; then - test $ret -eq 0 && gg_run open_llama_3b_v2 + #test $ret -eq 0 && gg_run open_llama_3b_v2 + date # dummy else test $ret -eq 0 && gg_run open_llama_7b_v2 fi diff --git a/ggml-common.h b/ggml-common.h index 43c7978a0..77e6bfba4 100644 --- a/ggml-common.h +++ b/ggml-common.h @@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2; // QK = number of values after dequantization // QK_K = super-block size -#ifdef GGML_QKK_64 -#define QK_K 64 -#define K_SCALE_SIZE 4 -#else #define QK_K 256 #define K_SCALE_SIZE 12 -#endif // GGML_QKK_64 #if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL) // QR = QK / number of values before dequantization @@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2; #define QI4_NL (QK4_NL / (4*QR4_NL)) #define QR4_NL 2 -#if QK_K == 64 -#define QI4_XS QI4_NL -#define QR4_XS QR4_NL -#else #define QI4_XS (QK_K / (4*QR4_XS)) #define QR4_XS 8 -#endif #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP @@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro // weight is represented as x = a * q // 16 blocks of 16 elements each // Effectively 3.4375 bits per weight -#ifdef GGML_QKK_64 -typedef struct { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits - uint8_t scales[2]; - ggml_half d; // super-block scale -} block_q3_K; -static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding"); -#else typedef struct { uint8_t hmask[QK_K/8]; // quants - high bit uint8_t qs[QK_K/4]; // quants - low 2 bits @@ -244,20 +225,11 @@ typedef struct { ggml_half d; // super-block scale } block_q3_K; static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding"); -#endif // 4-bit quantization // 8 blocks of 32 elements each // weight is represented as x = a * q + b // Effectively 4.5 bits per weight -#ifdef GGML_QKK_64 -typedef struct { - ggml_half d[2]; // super-block scales/mins - uint8_t scales[2]; // 4-bit block scales/mins - uint8_t qs[QK_K/2]; // 4--bit quants -} block_q4_K; -static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding"); -#else typedef struct { union { struct { @@ -270,21 +242,11 @@ typedef struct { uint8_t qs[QK_K/2]; // 4--bit quants } block_q4_K; static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding"); -#endif // 5-bit quantization // 8 blocks of 32 elements each // weight is represented as x = a * q + b // Effectively 5.5 bits per weight -#ifdef GGML_QKK_64 -typedef struct { - ggml_half d; // super-block scale - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -} block_q5_K; -static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); -#else typedef struct { union { struct { @@ -298,7 +260,6 @@ typedef struct { uint8_t qs[QK_K/2]; // quants, low 4 bits } block_q5_K; static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); -#endif // 6-bit quantization // weight is represented as x = a * q @@ -356,11 +317,7 @@ typedef struct { static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); // 3.4375 bpw -#if QK_K == 64 -#define IQ3S_N_SCALE 2 -#else #define IQ3S_N_SCALE QK_K/64 -#endif typedef struct { ggml_half d; uint8_t qs[QK_K/4]; @@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron typedef struct { uint8_t qs[QK_K/8]; // grid index, low 8 bits uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8) -#if QK_K == 64 - ggml_half d; -#endif uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64) } block_iq1_m; -#if QK_K == 64 -static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding"); -#else static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding"); -#endif // Used by IQ1_M quants typedef union { @@ -406,9 +356,6 @@ typedef struct { } block_iq4_nl; static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding"); -#if QK_K == 64 -#define block_iq4_xs block_iq4_nl -#else typedef struct { ggml_half d; uint16_t scales_h; @@ -416,7 +363,6 @@ typedef struct { uint8_t qs[QK_K/2]; } block_iq4_xs; static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding"); -#endif #endif // GGML_COMMON_DECL #endif // GGML_COMMON_DECL diff --git a/ggml-cuda/convert.cu b/ggml-cuda/convert.cu index 830e2d756..c0a444707 100644 --- a/ggml-cuda/convert.cu +++ b/ggml-cuda/convert.cu @@ -131,7 +131,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t const block_q2_K * x = (const block_q2_K *) vx; const int64_t tid = threadIdx.x; -#if QK_K == 256 const int64_t n = tid/32; const int64_t l = tid - 32*n; const int64_t is = 8*n + l/16; @@ -145,17 +144,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); -#else - const int64_t is = tid/16; // 0 or 1 - const int64_t il = tid%16; // 0...15 - const uint8_t q = x[i].qs[il] >> (2*is); - dst_t * y = yy + i*QK_K + 16*is + il; - float dall = __low2half(x[i].dm); - float dmin = __high2half(x[i].dm); - y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); - y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); -#endif - } template @@ -164,7 +152,6 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t const int64_t i = blockIdx.x; const block_q3_K * x = (const block_q3_K *) vx; -#if QK_K == 256 const int64_t r = threadIdx.x/4; const int64_t tid = r/2; const int64_t is0 = r%2; @@ -188,31 +175,8 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t const uint8_t * hm = x[i].hmask; for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); -#else - const int64_t tid = threadIdx.x; - const int64_t is = tid/16; // 0 or 1 - const int64_t il = tid%16; // 0...15 - const int64_t im = il/8; // 0...1 - const int64_t in = il%8; // 0...7 - - dst_t * y = yy + i*QK_K + 16*is + il; - - const uint8_t q = x[i].qs[il] >> (2*is); - const uint8_t h = x[i].hmask[in] >> (2*is + im); - const float d = (float)x[i].d; - - if (is == 0) { - y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); - y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); - } else { - y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); - y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); - } -#endif - } -#if QK_K == 256 static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { if (j < 4) { d = q[j] & 63; m = q[j + 4] & 63; @@ -221,7 +185,6 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); } } -#endif template static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { @@ -229,7 +192,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t const int64_t i = blockIdx.x; -#if QK_K == 256 // assume 32 threads const int64_t tid = threadIdx.x; const int64_t il = tid/8; @@ -253,15 +215,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t y[l + 0] = d1 * (q[l] & 0xF) - m1; y[l +32] = d2 * (q[l] >> 4) - m2; } -#else - const int64_t tid = threadIdx.x; - const uint8_t * q = x[i].qs; - dst_t * y = yy + i*QK_K; - const float d = (float)x[i].dm[0]; - const float m = (float)x[i].dm[1]; - y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); - y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); -#endif } template @@ -270,7 +223,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t const int64_t i = blockIdx.x; -#if QK_K == 256 // assume 64 threads - this is very slightly better than the one below const int64_t tid = threadIdx.x; const int64_t il = tid/16; // il is in 0...3 @@ -297,18 +249,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t hm <<= 1; y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; -#else - const int64_t tid = threadIdx.x; - const uint8_t q = x[i].qs[tid]; - const int64_t im = tid/8; // 0...3 - const int64_t in = tid%8; // 0...7 - const int64_t is = tid/16; // 0 or 1 - const uint8_t h = x[i].qh[in] >> im; - const float d = x[i].d; - dst_t * y = yy + i*QK_K + tid; - y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); - y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16)); -#endif } template @@ -316,7 +256,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t const block_q6_K * x = (const block_q6_K *) vx; const int64_t i = blockIdx.x; -#if QK_K == 256 // assume 64 threads - this is very slightly better than the one below const int64_t tid = threadIdx.x; @@ -336,24 +275,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); -#else - - // assume 32 threads - const int64_t tid = threadIdx.x; - const int64_t ip = tid/16; // 0 or 1 - const int64_t il = tid - 16*ip; // 0...15 - - dst_t * y = yy + i*QK_K + 16*ip + il; - - const float d = x[i].d; - - const uint8_t ql = x[i].ql[16*ip + il]; - const uint8_t qh = x[i].qh[il] >> (2*ip); - const int8_t * sc = x[i].scales; - - y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32); - y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32); -#endif } template @@ -363,7 +284,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds const block_iq2_xxs * x = (const block_iq2_xxs *) vx; const int64_t tid = threadIdx.x; -#if QK_K == 256 const int64_t il = tid/8; // 0...3 const int64_t ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -374,10 +294,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f; const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); -#else - NO_DEVICE_CODE; -#endif - } template @@ -387,7 +303,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst const block_iq2_xs * x = (const block_iq2_xs *) vx; const int64_t tid = threadIdx.x; -#if QK_K == 256 const int64_t il = tid/8; // 0...3 const int64_t ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -396,10 +311,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); -#else - NO_DEVICE_CODE; -#endif - } template @@ -409,7 +320,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_ const block_iq2_s * x = (const block_iq2_s *) vx; const int64_t tid = threadIdx.x; -#if QK_K == 256 const int64_t il = tid/8; // 0...3 const int64_t ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -417,10 +327,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_ const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); -#else - NO_DEVICE_CODE; -#endif - } template @@ -430,7 +336,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds const block_iq3_xxs * x = (const block_iq3_xxs *) vx; const int64_t tid = threadIdx.x; -#if QK_K == 256 const int64_t il = tid/8; // 0...3 const int64_t ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -445,10 +350,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); } -#else - NO_DEVICE_CODE; -#endif - } template @@ -458,7 +359,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_ const block_iq3_s * x = (const block_iq3_s *) vx; const int64_t tid = threadIdx.x; -#if QK_K == 256 const int64_t il = tid/8; // 0...3 const int64_t ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -471,10 +371,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_ y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); } -#else - NO_DEVICE_CODE; -#endif - } template @@ -484,7 +380,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_ const block_iq1_s * x = (const block_iq1_s *) vx; const int64_t tid = threadIdx.x; -#if QK_K == 256 const int64_t il = tid/8; // 0...3 const int64_t ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -497,10 +392,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_ for (int j = 0; j < 8; ++j) { y[j] = d * (q[j] + delta); } -#else - NO_DEVICE_CODE; -#endif - } template @@ -510,7 +401,6 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_ const block_iq1_m * x = (const block_iq1_m *) vx; const int64_t tid = threadIdx.x; -#if QK_K == 256 const int64_t il = tid/8; // 0...3 const int64_t ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -527,13 +417,8 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_ for (int j = 0; j < 8; ++j) { y[j] = d * (q[j] + delta); } -#else - NO_DEVICE_CODE; -#endif - } - template static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) { @@ -550,10 +435,8 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; } - } -#if QK_K != 64 template static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { const int64_t i = blockIdx.x; @@ -570,7 +453,6 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; } } -#endif template static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) { @@ -592,21 +474,13 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * template static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { const int nb = k / QK_K; -#if QK_K == 256 dequantize_block_q2_K<<>>(vx, y); -#else - dequantize_block_q2_K<<>>(vx, y); -#endif } template static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { const int nb = k / QK_K; -#if QK_K == 256 dequantize_block_q3_K<<>>(vx, y); -#else - dequantize_block_q3_K<<>>(vx, y); -#endif } template @@ -632,21 +506,13 @@ static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k template static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { const int nb = k / QK_K; -#if QK_K == 256 dequantize_block_q5_K<<>>(vx, y); -#else - dequantize_block_q5_K<<>>(vx, y); -#endif } template static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { const int nb = k / QK_K; -#if QK_K == 256 dequantize_block_q6_K<<>>(vx, y); -#else - dequantize_block_q6_K<<>>(vx, y); -#endif } template @@ -700,11 +566,7 @@ static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t template static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { const int nb = (k + QK_K - 1) / QK_K; -#if QK_K == 64 - dequantize_block_iq4_nl<<>>(vx, y); -#else dequantize_block_iq4_xs<<>>(vx, y); -#endif } template diff --git a/ggml-cuda/dmmv.cu b/ggml-cuda/dmmv.cu index 7313e3e17..47d4d5d9e 100644 --- a/ggml-cuda/dmmv.cu +++ b/ggml-cuda/dmmv.cu @@ -22,7 +22,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, float tmp = 0; // partial sum for thread in warp -#if QK_K == 256 const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 @@ -71,37 +70,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, tmp += dall * sum1 - dmin * sum2; } -#else - const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 - const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 - const int offset = tid * K_QUANTS_PER_ITERATION; - - uint32_t uaux[2]; - const uint8_t * d = (const uint8_t *)uaux; - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + offset; - const uint8_t * q = x[i].qs + offset; - const uint32_t * s = (const uint32_t *)x[i].scales; - - uaux[0] = s[0] & 0x0f0f0f0f; - uaux[1] = (s[0] >> 4) & 0x0f0f0f0f; - - const float2 dall = __half22float2(x[i].dm); - - float sum1 = 0, sum2 = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - const uint8_t ql = q[l]; - sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) - + y[l+16] * d[1] * ((ql >> 2) & 3) - + y[l+32] * d[2] * ((ql >> 4) & 3) - + y[l+48] * d[3] * ((ql >> 6) & 3); - sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7]; - } - tmp += dall.x * sum1 - dall.y * sum2; - } -#endif // sum up partial sums and write back result tmp = warp_reduce_sum(tmp); @@ -123,8 +91,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, float tmp = 0; // partial sum for thread in warp -#if QK_K == 256 - const uint16_t kmask1 = 0x0303; const uint16_t kmask2 = 0x0f0f; @@ -175,34 +141,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, tmp += d * sum; } -#else - - const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 - const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 - const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14 - const int in = offset/8; // 0 or 1 - const int im = offset%8; // 0...7 - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + offset; - const uint8_t * q = x[i].qs + offset; - const uint8_t * s = x[i].scales; - - const float dall = (float)x[i].d; - - float sum = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - const uint8_t hl = x[i].hmask[im+l] >> in; - const uint8_t ql = q[l]; - sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) - + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4)) - + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4)) - + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4)); - } - tmp += sum; - } -#endif // sum up partial sums and write back result tmp = warp_reduce_sum(tmp); @@ -221,7 +159,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const block_q4_K * x = (const block_q4_K *)vx + ib0; -#if QK_K == 256 const uint16_t kmask1 = 0x3f3f; const uint16_t kmask2 = 0x0f0f; const uint16_t kmask3 = 0xc0c0; @@ -306,36 +243,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, #endif } -#else - const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 - const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); - - const int step = tid * K_QUANTS_PER_ITERATION; - - uint16_t aux16[2]; - const uint8_t * s = (const uint8_t *)aux16; - - float tmp = 0; - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - const uint8_t * q = x[i].qs + step; - const float * y = yy + i*QK_K + step; - const uint16_t * a = (const uint16_t *)x[i].scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; - const float d = (float)x[i].dm[0]; - const float m = (float)x[i].dm[1]; - float sum = 0.f; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { - sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) - + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) - + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3]) - + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]); - } - tmp += sum; - } - -#endif // sum up partial sums and write back result tmp = warp_reduce_sum(tmp); @@ -355,7 +262,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, float tmp = 0; // partial sum for thread in warp -#if QK_K == 256 const uint16_t kmask1 = 0x3f3f; const uint16_t kmask2 = 0x0f0f; const uint16_t kmask3 = 0xc0c0; @@ -426,30 +332,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; } -#else - const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 - const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); - const int step = tid * K_QUANTS_PER_ITERATION; - const int im = step/8; - const int in = step%8; - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - const uint8_t * q = x[i].qs + step; - const int8_t * s = x[i].scales; - const float * y = yy + i*QK_K + step; - const float d = x[i].d; - float sum = 0.f; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { - const uint8_t h = x[i].qh[in+j] >> im; - sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) - + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) - + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16)) - + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16)); - } - tmp += sum; - } -#endif - // sum up partial sums and write back result tmp = warp_reduce_sum(tmp); @@ -470,8 +352,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const block_q6_K * x = (const block_q6_K *)vx + ib0; -#if QK_K == 256 - const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 @@ -526,37 +406,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, } -#else - - const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7 - const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3 - - const int step = tid * K_QUANTS_PER_ITERATION; - - float tmp = 0; // partial sum for thread in warp - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + step; - const uint8_t * ql = x[i].ql + step; - const uint8_t * qh = x[i].qh + step; - const int8_t * s = x[i].scales; - - const float d = x[i+0].d; - - float sum = 0; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { - sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) - + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) - + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32) - + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32); - } - tmp += sum; - - } - -#endif - // sum up partial sums and write back result tmp = warp_reduce_sum(tmp); diff --git a/ggml-cuda/mmq.cu b/ggml-cuda/mmq.cu index 933d799ce..c0a66d9b6 100644 --- a/ggml-cuda/mmq.cu +++ b/ggml-cuda/mmq.cu @@ -826,11 +826,7 @@ template static __device__ __forceinlin const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; -#if QK_K == 256 x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; -#else - x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]}; -#endif } #pragma unroll @@ -933,9 +929,7 @@ template static __device__ __forceinlin const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; -#if QK_K == 256 x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; -#endif } #pragma unroll diff --git a/ggml-cuda/vecdotq.cuh b/ggml-cuda/vecdotq.cuh index 86b87fa93..5ebdddcc7 100644 --- a/ggml-cuda/vecdotq.cuh +++ b/ggml-cuda/vecdotq.cuh @@ -712,7 +712,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( static __device__ __forceinline__ float vec_dot_q4_K_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#ifndef GGML_QKK_64 const block_q4_K * bq4_K = (const block_q4_K *) vbq; int v[2]; @@ -754,58 +753,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( } return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); - -#else - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_q4_K * bq4_K = (const block_q4_K *) vbq; - - float sumf_d = 0.0f; - float sumf_m = 0.0f; - - uint16_t aux16[2]; - const uint8_t * s = (const uint8_t *)aux16; - - const uint16_t * a = (const uint16_t *)bq4_K->scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; - - const float dall = bq4_K->dm[0]; - const float dmin = bq4_K->dm[1]; - - const float d8_1 = __low2float(bq8_1[0].ds); - const float d8_2 = __low2float(bq8_1[1].ds); - - const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); - const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); - const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); - const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); - - const int * q4 = (const int *)bq4_K->qs + (iqs/2); - const int v1 = q4[0]; - const int v2 = q4[4]; - - const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0)); - const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0)); - const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0)); - const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0)); - - sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]); - sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]); - - return dall * sumf_d - dmin * sumf_m; - -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A - -#endif } static __device__ __forceinline__ float vec_dot_q5_K_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#ifndef GGML_QKK_64 const block_q5_K * bq5_K = (const block_q5_K *) vbq; int vl[2]; @@ -847,48 +799,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( } return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); - -#else - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_q5_K * bq5_K = (const block_q5_K *) vbq; - - const int8_t * s = bq5_K->scales; - - const float d = bq5_K->d; - - const float d8_1 = __low2half(bq8_1[0].ds); - const float d8_2 = __low2half(bq8_1[1].ds); - - const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); - const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); - const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); - const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); - - const int * ql = (const int *)bq5_K->qs + (iqs/2); - const int vl1 = ql[0]; - const int vl2 = ql[4]; - - const int step = 4 * (iqs/2); // 0, 4, 8, 12 - const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6 - const int in = step%8; // 0, 4, 0, 4 - const int vh = (*((const int *)(bq5_K->qh + in))) >> im; - - const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f); - const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f); - const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f); - const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f); - - const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1]) - + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]); - - return d * sumf_d; - -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A - -#endif } static __device__ __forceinline__ float vec_dot_q6_K_q8_1( @@ -919,7 +829,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if QK_K == 256 const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq; #if QR2_XXS == 8 @@ -960,15 +869,11 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( } return d * (sumi1 + sumi2); #endif -#else - NO_DEVICE_CODE; -#endif } static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics -#if QK_K == 256 const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq; const int ib32 = iqs; @@ -1002,17 +907,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( GGML_UNUSED(ksigns64); NO_DEVICE_CODE; #endif -#else - GGML_UNUSED(ksigns64); - NO_DEVICE_CODE; -#endif } // TODO static __device__ __forceinline__ float vec_dot_iq2_s_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics -#if QK_K == 256 const block_iq2_s * bq2 = (const block_iq2_s *) vbq; const int ib32 = iqs; @@ -1048,16 +948,11 @@ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1( GGML_UNUSED(ksigns64); NO_DEVICE_CODE; #endif -#else - GGML_UNUSED(ksigns64); - NO_DEVICE_CODE; -#endif } static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics -#if QK_K == 256 const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq; const int ib32 = iqs; @@ -1082,16 +977,12 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( #else NO_DEVICE_CODE; #endif -#else - NO_DEVICE_CODE; -#endif } // TODO: don't use lookup table for signs static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics -#if QK_K == 256 const block_iq3_s * bq2 = (const block_iq3_s *) vbq; const int ib32 = iqs; @@ -1114,14 +1005,10 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( #else NO_DEVICE_CODE; #endif -#else - NO_DEVICE_CODE; -#endif } static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if QK_K == 256 const block_iq1_s * bq1 = (const block_iq1_s *) vbq; const int ib32 = iqs; @@ -1149,14 +1036,10 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( const float d = d1q * __low2float (bq8_1[ib32].ds); const float m = d1q * __high2float(bq8_1[ib32].ds); return d * sumi + m * delta; -#else - NO_DEVICE_CODE; -#endif } static __device__ __forceinline__ float vec_dot_iq1_m_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if QK_K == 256 const block_iq1_m * bq1 = (const block_iq1_m *) vbq; const int ib32 = iqs; @@ -1192,9 +1075,6 @@ static __device__ __forceinline__ float vec_dot_iq1_m_q8_1( scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds); return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1)); -#else - NO_DEVICE_CODE; -#endif } #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics @@ -1250,9 +1130,7 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1( static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if QK_K == 256 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq; const uint8_t * values = (const uint8_t *)kvalues_iq4nl; @@ -1270,10 +1148,6 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1( sumi2 = __dp4a(v2, q8[j+4], sumi2); } return d * (sumi1 + sumi2); - -#else - NO_DEVICE_CODE; -#endif #else return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs); #endif diff --git a/ggml-metal.m b/ggml-metal.m index 5d5ad20ad..c9e570dbf 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -381,10 +381,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { // dictionary of preprocessor macros NSMutableDictionary * prep = [NSMutableDictionary dictionary]; -#ifdef GGML_QKK_64 - prep[@"GGML_QKK_64"] = @(1); -#endif - MTLCompileOptions* options = [MTLCompileOptions new]; options.preprocessorMacros = prep; @@ -1773,11 +1769,7 @@ static enum ggml_status ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q3_K) { -#ifdef GGML_QKK_64 - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; -#else [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; -#endif } else if (src0t == GGML_TYPE_Q5_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; @@ -2018,12 +2010,7 @@ static enum ggml_status ggml_metal_graph_compute( { nth0 = 4; nth1 = 16; - #if QK_K == 64 - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline; - #else pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline; - #endif - } break; default: { @@ -2088,11 +2075,7 @@ static enum ggml_status ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q3_K) { -#ifdef GGML_QKK_64 - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; -#else [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; -#endif } else if (src0t == GGML_TYPE_Q5_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; diff --git a/ggml-metal.metal b/ggml-metal.metal index c5eb25280..8ff70d7a7 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -3386,7 +3386,6 @@ void kernel_mul_mv_q2_K_f32_impl( const int step = sizeof(block_q2_K) * nb; -#if QK_K == 256 const int ix = tiisg/8; // 0...3 const int it = tiisg%8; // 0...7 const int iq = it/4; // 0 or 1 @@ -3438,57 +3437,6 @@ void kernel_mul_mv_q2_K_f32_impl( y4 += 4 * QK_K; } -#else - const int ix = tiisg/2; // 0...15 - const int it = tiisg%2; // 0...1 - - device const float * y4 = y + ix * QK_K + 8 * it; - - for (int ib = ix; ib < nb; ib += 16) { - - float4 sumy = {0.f, 0.f, 0.f, 0.f}; - for (int i = 0; i < 8; ++i) { - yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0]; - yl[i+ 8] = y4[i+16]; sumy[1] += yl[i+ 8]; - yl[i+16] = y4[i+32]; sumy[2] += yl[i+16]; - yl[i+24] = y4[i+48]; sumy[3] += yl[i+24]; - } - - device const uint8_t * sc = (device const uint8_t *)x[ib].scales; - device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it; - device const half * dh = &x[ib].d; - - for (int row = 0; row < N_DST; row++) { - - float4 acc1 = {0.f, 0.f, 0.f, 0.f}; - float4 acc2 = {0.f, 0.f, 0.f, 0.f}; - for (int i = 0; i < 8; i += 2) { - acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003); - acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300); - acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c); - acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00); - acc1[2] += yl[i+16] * (qs[i/2] & 0x0030); - acc2[2] += yl[i+17] * (qs[i/2] & 0x3000); - acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0); - acc2[3] += yl[i+25] * (qs[i/2] & 0xc000); - } - - float dall = dh[0]; - float dmin = dh[1]; - sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f + - (acc1[1] + 1.f/256.f * acc2[1]) * (sc[1] & 0xF) * 1.f/ 4.f + - (acc1[2] + 1.f/256.f * acc2[2]) * (sc[2] & 0xF) * 1.f/16.f + - (acc1[3] + 1.f/256.f * acc2[3]) * (sc[3] & 0xF) * 1.f/64.f) - - dmin * (sumy[0] * (sc[0] >> 4) + sumy[1] * (sc[1] >> 4) + sumy[2] * (sc[2] >> 4) + sumy[3] * (sc[3] >> 4)); - - qs += step/2; - sc += step; - dh += step/2; - } - - y4 += 16 * QK_K; - } -#endif for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); @@ -3526,7 +3474,6 @@ kernel void kernel_mul_mv_q2_K_f32( kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } -#if QK_K == 256 void kernel_mul_mv_q3_K_f32_impl( device const void * src0, device const float * src1, @@ -3685,84 +3632,6 @@ void kernel_mul_mv_q3_K_f32_impl( } } } -#else -void kernel_mul_mv_q3_K_f32_impl( - device const void * src0, - device const float * src1, - device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - threadgroup int8_t * shared_values [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - - const int nb = ne00/QK_K; - - const int64_t r0 = tgpig.x; - const int64_t r1 = tgpig.y; - const int64_t im = tgpig.z; - - const int row = 2 * r0 + sgitg; - - const uint i12 = im%ne12; - const uint i13 = im/ne12; - - const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); - - device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0; - device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1; - - const int ix = tiisg/4; - const int il = 4 * (tiisg%4);// 0, 4, 8, 12 - const int iq = il/8; // 0, 0, 1, 1 - const int in = il%8; // 0, 4, 0, 4 - - float2 sum = {0.f, 0.f}; - - for (int i = ix; i < nb; i += 8) { - - const float d_all = (float)(x[i].d); - - device const uint16_t * q = (device const uint16_t *)(x[i].qs + il); - device const uint16_t * h = (device const uint16_t *)(x[i].hmask + in); - device const uint16_t * s = (device const uint16_t *)(x[i].scales); - device const float * y = yy + i * QK_K + il; - - const float d1 = d_all * ((int32_t)(s[0] & 0x000F) - 8); - const float d2 = d_all * ((int32_t)(s[0] & 0x00F0) - 128) * 1.f/64.f; - const float d3 = d_all * ((int32_t)(s[0] & 0x0F00) - 2048) * 1.f/4096.f; - const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f; - - for (int l = 0; l < 4; l += 2) { - const uint16_t hm = h[l/2] >> iq; - sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 : 4)) - + y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16)) - + y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 0 : 64)) - + y[l+48] * d4 * ((int32_t)(q[l/2] & 0x00c0) - ((hm & 0x0040) ? 0 : 256)); - sum[1] += y[l+ 1] * d1 * ((int32_t)(q[l/2] & 0x0300) - ((hm & 0x0100) ? 0 : 1024)) - + y[l+17] * d2 * ((int32_t)(q[l/2] & 0x0c00) - ((hm & 0x0400) ? 0 : 4096)) - + y[l+33] * d3 * ((int32_t)(q[l/2] & 0x3000) - ((hm & 0x1000) ? 0 : 16384)) - + y[l+49] * d4 * ((int32_t)(q[l/2] & 0xc000) - ((hm & 0x4000) ? 0 : 65536)); - } - - } - const float sumf = sum[0] + sum[1] * 1.f/256.f; - - const float tot = simd_sum(sumf); - if (tiisg == 0) { - dst[r1*ne0 + im*ne0*ne1 + row] = tot; - } - -} -#endif [[host_name("kernel_mul_mv_q3_K_f32")]] kernel void kernel_mul_mv_q3_K_f32( @@ -3792,7 +3661,6 @@ kernel void kernel_mul_mv_q3_K_f32( kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } -#if QK_K == 256 void kernel_mul_mv_q4_K_f32_impl( device const void * src0, device const float * src1, @@ -3906,103 +3774,6 @@ void kernel_mul_mv_q4_K_f32_impl( } } } -#else -void kernel_mul_mv_q4_K_f32_impl( - device const void * src0, - device const float * src1, - device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - threadgroup int8_t * shared_values [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - - const int ix = tiisg/4; // 0...7 - const int it = tiisg%4; // 0...3 - - const int nb = ne00/QK_K; - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - const int first_row = r0 * N_DST; - const int ib_row = first_row * nb; - - const uint i12 = im%ne12; - const uint i13 = im/ne12; - - const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); - - device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; - - float yl[8]; - float yh[8]; - float sumf[N_DST]={0.f}, all_sum; - - const int step = sizeof(block_q4_K) * nb / 2; - - device const float * y4 = y + ix * QK_K + 8 * it; - - uint16_t sc16[4]; - - for (int ib = ix; ib < nb; ib += 8) { - - float2 sumy = {0.f, 0.f}; - for (int i = 0; i < 8; ++i) { - yl[i] = y4[i+ 0]; sumy[0] += yl[i]; - yh[i] = y4[i+32]; sumy[1] += yh[i]; - } - - device const uint16_t * sc = (device const uint16_t *)x[ib].scales; - device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it; - device const half * dh = x[ib].d; - - for (int row = 0; row < N_DST; row++) { - - sc16[0] = sc[0] & 0x000f; - sc16[1] = sc[0] & 0x0f00; - sc16[2] = sc[0] & 0x00f0; - sc16[3] = sc[0] & 0xf000; - - float2 acc1 = {0.f, 0.f}; - float2 acc2 = {0.f, 0.f}; - for (int i = 0; i < 8; i += 2) { - acc1[0] += yl[i+0] * (qs[i/2] & 0x000F); - acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00); - acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0); - acc2[1] += yh[i+1] * (qs[i/2] & 0xF000); - } - - float dall = dh[0]; - float dmin = dh[1]; - sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] + - (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) - - dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f); - - qs += step; - sc += step; - dh += step; - } - - y4 += 8 * QK_K; - } - - for (int row = 0; row < N_DST; ++row) { - all_sum = simd_sum(sumf[row]); - if (tiisg == 0) { - dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; - } - } -} -#endif [[host_name("kernel_mul_mv_q4_K_f32")]] kernel void kernel_mul_mv_q4_K_f32( @@ -4070,8 +3841,6 @@ void kernel_mul_mv_q5_K_f32_impl( const int step = sizeof(block_q5_K) * nb; -#if QK_K == 256 -# float yl[16], yh[16]; const uint16_t kmask1 = 0x3f3f; @@ -4154,54 +3923,6 @@ void kernel_mul_mv_q5_K_f32_impl( y1 += 4 * QK_K; } -#else - float yl[8], yh[8]; - - const int il = 4 * (tiisg/8); // 0, 4, 8, 12 - const int ix = tiisg%8; - const int iq = il/8; // 0, 0, 1, 1 - const int in = il%8; // 0, 4, 0, 4 - - device const float * y = yy + ix*QK_K + il; - - for (int i = ix; i < nb; i += 8) { - - for (int l = 0; l < 4; ++l) { - yl[l+0] = y[l+ 0]; - yl[l+4] = y[l+16]; - yh[l+0] = y[l+32]; - yh[l+4] = y[l+48]; - } - - device const half * dh = &x[i].d; - device const uint8_t * q = x[i].qs + il; - device const uint8_t * h = x[i].qh + in; - device const int8_t * s = x[i].scales; - - for (int row = 0; row < 2; ++row) { - - const float d = dh[0]; - - float2 acc = {0.f, 0.f}; - for (int l = 0; l < 4; ++l) { - const uint8_t hl = h[l] >> iq; - acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16)) - + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16)); - acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256)) - + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256)); - } - sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]); - - q += step; - h += step; - s += step; - dh += step/2; - - } - - y += 8 * QK_K; - } -#endif for (int row = 0; row < 2; ++row) { const float tot = simd_sum(sumf[row]); @@ -4280,7 +4001,6 @@ void kernel_mul_mv_q6_K_f32_impl( float sumf = 0; -#if QK_K == 256 const int tid = tiisg/2; const int ix = tiisg%2; const int ip = tid/8; // 0 or 1 @@ -4316,30 +4036,6 @@ void kernel_mul_mv_q6_K_f32_impl( } -#else - const int ix = tiisg/4; - const int il = 4*(tiisg%4); - - for (int i = ix; i < nb; i += 8) { - device const float * y = yy + i * QK_K + il; - device const uint8_t * ql = x[i].ql + il; - device const uint8_t * qh = x[i].qh + il; - device const int8_t * s = x[i].scales; - - const float d = x[i].d; - - float4 sums = {0.f, 0.f, 0.f, 0.f}; - for (int l = 0; l < 4; ++l) { - sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32); - sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32); - sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) >> 0)) - 32); - sums[3] += y[l+48] * ((int8_t)((ql[l+16] >> 4) | ((qh[l] & kmask4) >> 2)) - 32); - } - sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]); - } - -#endif - const float tot = simd_sum(sumf); if (tiisg == 0) { dst[r1*ne0 + im*ne0*ne1 + row] = tot; @@ -5173,9 +4869,7 @@ void kernel_mul_mv_iq1_m_f32_impl( device const float * y4 = y + 32 * ix; -#if QK_K != 64 iq1m_scale_t scale; -#endif for (int ib32 = ix; ib32 < nb32; ib32 += 32) { @@ -5196,10 +4890,7 @@ void kernel_mul_mv_iq1_m_f32_impl( device const uint16_t * sc = (device const uint16_t *)xr->scales; for (int row = 0; row < N_DST; row++) { - -#if QK_K != 64 scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); -#endif constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700))); @@ -5215,14 +4906,9 @@ void kernel_mul_mv_iq1_m_f32_impl( } const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); -#if QK_K == 64 - const float d = (float) *((device const half *)(sc - 1)); - sumf[row] += d * ((sum[0] + delta1) * (2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1) + - (sum[1] + delta2) * (2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1)); -#else + sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) + (sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1)); -#endif sc += nb*sizeof(block_iq1_m)/2; qs += nb*sizeof(block_iq1_m); @@ -5334,7 +5020,6 @@ void kernel_mul_mv_iq4_nl_f32_impl( } } -#if QK_K != 64 void kernel_mul_mv_iq4_xs_f32_impl( device const void * src0, device const float * src1, @@ -5429,7 +5114,6 @@ void kernel_mul_mv_iq4_xs_f32_impl( } } } -#endif [[host_name("kernel_mul_mv_iq1_s_f32")]] kernel void kernel_mul_mv_iq1_s_f32( @@ -5542,11 +5226,7 @@ kernel void kernel_mul_mv_iq4_xs_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { -#if QK_K == 64 - kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); -#else kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); -#endif } //============================= templates and their specializations ============================= @@ -5672,10 +5352,9 @@ void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg float dl, ml; uint8_t sc = xb->scales[il]; -#if QK_K == 256 q = q + 32*(il/8) + 16*(il&1); il = (il/2)%4; -#endif + half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4); @@ -5691,7 +5370,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg device const uint8_t * h = (device const uint8_t *)xb->hmask; device const int8_t * scales = (device const int8_t *)xb->scales; -#if QK_K == 256 q = q + 32 * (il/8) + 16 * (il&1); h = h + 16 * (il&1); uint8_t m = 1 << (il/2); @@ -5712,17 +5390,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg for (int i = 0; i < 16; ++i) { reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml); } -#else - float kcoef = il&1 ? 1.f/16.f : 1.f; - uint16_t kmask = il&1 ? 0xF0 : 0x0F; - float dl = d_all * ((scales[il/2] & kmask) * kcoef - 8); - float coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); - uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); - uint8_t m = 1<<(il*2); - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 0 : 4.f/coef)); - } -#endif } static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) { @@ -5734,7 +5401,6 @@ template void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) { device const uchar * q = xb->qs; -#if QK_K == 256 short is = (il/4) * 2; q = q + (il/4) * 32 + 16 * (il&1); il = il & 3; @@ -5743,16 +5409,7 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg const float min = xb->dmin; const float dl = d * sc[0]; const float ml = min * sc[1]; -#else - (void) get_scale_min_k4_just2; - q = q + 16 * (il&1); - device const uint8_t * s = xb->scales; - device const half2 * dh = (device const half2 *)xb->d; - const float2 d = (float2)dh[0]; - const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h; - const float ml = il<2 ? d[1] * (s[0]>>4) : d[1] * (s[1]>>4); -#endif const ushort mask = il<2 ? 0x0F : 0xF0; for (int i = 0; i < 16; ++i) { reg[i/4][i%4] = dl * (q[i] & mask) - ml; @@ -5764,7 +5421,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg device const uint8_t * q = xb->qs; device const uint8_t * qh = xb->qh; -#if QK_K == 256 short is = (il/4) * 2; q = q + 32 * (il/4) + 16 * (il&1); qh = qh + 16 * (il&1); @@ -5781,17 +5437,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg for (int i = 0; i < 16; ++i) { reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml; } -#else - q = q + 16 * (il&1); - device const int8_t * s = xb->scales; - const float dl = xb->d * s[il]; - uint8_t m = 1<<(il*2); - const float coef = il<2 ? 1.f : 1.f/16.f; - const ushort mask = il<2 ? 0x0F : 0xF0; - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef)); - } -#endif } template @@ -5801,15 +5446,11 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg device const uint8_t * qh = (device const uint8_t *)xb->qh; device const int8_t * scales = (device const int8_t *)xb->scales; -#if QK_K == 256 ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1); qh = qh + 32*(il/8) + 16*(il&1); float sc = scales[(il%2) + 2 * ((il/2))]; il = (il/2) & 3; -#else - ql = ql + 16 * (il&1); - float sc = scales[il]; -#endif + const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F; const float coef = il>1 ? 1.f/16.f : 1.f; @@ -5966,20 +5607,15 @@ void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & const int ib32 = il/2; il = il%2; device const uint16_t * sc = (device const uint16_t *)xb->scales; -#if QK_K == 64 - const float d = xb->d; -#else + iq1m_scale_t scale; scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); const float d = scale.f16; -#endif + device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; device const uint8_t * qh = xb->qh + 2*ib32 + il; -#if QK_K == 64 - const float dl = d * (2*((sc[ib32/2] >> (8*(ib32%2)+4*il)) & 0xf) + 1); -#else + const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1); -#endif const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); @@ -6009,9 +5645,6 @@ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 template void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) { -#if QK_K == 64 - dequantize_iq4_nl(xb, il, reg); -#else // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 const int ib32 = il/2; il = il%2; @@ -6028,7 +5661,6 @@ void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; } -#endif } template @@ -6533,11 +6165,7 @@ kernel void kernel_mul_mm_id( sgitg); } -#if QK_K == 256 #define QK_NL 16 -#else -#define QK_NL 4 -#endif // // get rows @@ -6577,11 +6205,7 @@ template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_r template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq1_m")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows; -#if QK_K == 64 -template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows; -#else template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows; -#endif // // matrix-matrix multiplication @@ -6609,11 +6233,7 @@ template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_m template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm; -#if QK_K == 64 -template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm; -#else template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm; -#endif // // indirect matrix-matrix multiplication @@ -6641,11 +6261,7 @@ template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -#if QK_K == 64 -template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -#else template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -#endif // // matrix-vector multiplication @@ -6854,7 +6470,5 @@ template [[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] kernel kernel_mul_mv_id_t template [[host_name("kernel_mul_mv_id_iq3_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; template [[host_name("kernel_mul_mv_id_iq2_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; -#if QK_K != 64 template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; -#endif diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 922f24837..e28566a7b 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1,4 +1,4 @@ -#include "ggml.h" +#include "ggml.h" #include "ggml-opencl.h" #include "ggml-backend-impl.h" diff --git a/ggml-quants.c b/ggml-quants.c index ed40ca74a..88f58a339 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1888,7 +1888,6 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f return scale; } -#if QK_K == 256 static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; @@ -1897,7 +1896,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * *m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); } } -#endif //========================- 2-bit (de)-quantization @@ -1961,20 +1959,13 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict } } -#if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); } } -#else - for (int l = 0; l < 16; ++l) { - y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); - } -#endif x += QK_K; - } } @@ -1989,7 +1980,6 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6 const uint8_t * q = x[i].qs; -#if QK_K == 256 int is = 0; float dl, ml; for (int n = 0; n < QK_K; n += 128) { @@ -2008,19 +1998,6 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6 } q += 32; } -#else - float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4); - float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4); - float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4); - float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4); - for (int l = 0; l < 16; ++l) { - y[l+ 0] = dl1 * ((int8_t)((q[l] >> 0) & 3)) - ml1; - y[l+16] = dl2 * ((int8_t)((q[l] >> 2) & 3)) - ml2; - y[l+32] = dl3 * ((int8_t)((q[l] >> 4) & 3)) - ml3; - y[l+48] = dl4 * ((int8_t)((q[l] >> 6) & 3)) - ml4; - } - y += QK_K; -#endif } } @@ -2211,36 +2188,9 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri } float dm, mm; -#if QK_K == 64 - float max_scale = 0, max_min = 0; - for (int j = 0; j < QK_K/16; ++j) { - max_scale = MAX(max_scale, scales[j]); - max_min = MAX(max_min, mins[j]); - } - dm = max_scale/15; - mm = max_min/15; - if (max_scale) { - float id = 1/dm; - for (int j = 0; j < QK_K/16; ++j) { - int l = nearest_int(id*scales[j]); - Ls[j] = MAX(0, MIN(15, l)); - } - } else { - memset(Ls, 0, QK_K/16); - } - if (max_min) { - float id = 1/mm; - for (int j = 0; j < QK_K/16; ++j) { - int l = nearest_int(id*mins[j]); - Lm[j] = MAX(0, MIN(15, l)); - } - } else { - memset(Lm, 0, QK_K/16); - } -#else dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw); mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw); -#endif + y[i].d = GGML_FP32_TO_FP16(dm); y[i].dmin = GGML_FP32_TO_FP16(mm); dm = GGML_FP16_TO_FP32(y[i].d); @@ -2263,20 +2213,13 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri } } -#if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); } } -#else - for (int l = 0; l < 16; ++l) { - y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); - } -#endif x += QK_K; - } } @@ -2317,7 +2260,6 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict } } -#if QK_K == 256 memset(y[i].scales, 0, 12); if (max_scale) { float iscale = -32.f/max_scale; @@ -2351,36 +2293,6 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict L[16*j + ii] = l + 4; } } -#else - if (max_scale) { - float iscale = -8.f/max_scale; - for (int j = 0; j < QK_K/16; j+=2) { - int l1 = nearest_int(iscale*scales[j]); - l1 = 8 + MAX(-8, MIN(7, l1)); - int l2 = nearest_int(iscale*scales[j+1]); - l2 = 8 + MAX(-8, MIN(7, l2)); - y[i].scales[j/2] = l1 | (l2 << 4); - } - y[i].d = GGML_FP32_TO_FP16(1/iscale); - } else { - for (int j = 0; j < QK_K/16; j+=2) { - y[i].scales[j/2] = 0; - } - y[i].d = GGML_FP32_TO_FP16(0.f); - } - for (int j = 0; j < QK_K/16; ++j) { - int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4; - float d = GGML_FP16_TO_FP32(y[i].d) * (s - 8); - if (!d) { - continue; - } - for (int ii = 0; ii < 16; ++ii) { - int l = nearest_int(x[16*j + ii]/d); - l = MAX(-4, MIN(3, l)); - L[16*j + ii] = l + 4; - } - } -#endif memset(y[i].hmask, 0, QK_K/8); // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc. @@ -2395,23 +2307,16 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict m = 0; hm <<= 1; } } -#if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); } } -#else - for (int l = 0; l < 16; ++l) { - y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); - } -#endif x += QK_K; } } -#if QK_K == 256 void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2461,49 +2366,12 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6 } } -#else -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) { - assert(k % QK_K == 0); - assert(QK_K == 64); - const int nb = k / QK_K; - - for (int i = 0; i < nb; i++) { - - const float d_all = GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - - const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); - const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); - const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8); - const float d4 = d_all * ((x[i].scales[1] >> 4) - 8); - - for (int l=0; l<8; ++l) { - uint8_t h = hm[l]; - y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4)); - y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4)); - y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4)); - y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4)); - y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4)); - y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4)); - y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4)); - y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4)); - } - y += QK_K; - } -} -#endif void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) { quantize_row_q3_K_reference(x, vy, k); } static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) { -#if QK_K != 256 - (void)quant_weights; - quantize_row_q3_K_reference(x, y, n_per_row); -#else assert(n_per_row % QK_K == 0); const int nb = n_per_row / QK_K; @@ -2585,7 +2453,6 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri x += QK_K; } -#endif } size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { @@ -2617,7 +2484,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict float scales[QK_K/32]; for (int i = 0; i < nb; i++) { - float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { @@ -2637,7 +2503,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } -#if QK_K == 256 float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; float inv_min = max_min > 0 ? 63.f/max_min : 0.f; for (int j = 0; j < QK_K/32; ++j) { @@ -2669,39 +2534,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict L[32*j + ii] = l; } } -#else - const float s_factor = 15.f; - float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f; - float inv_min = max_min > 0 ? s_factor/max_min : 0.f; - int d1 = nearest_int(inv_scale*scales[0]); - int m1 = nearest_int(inv_min*mins[0]); - int d2 = nearest_int(inv_scale*scales[1]); - int m2 = nearest_int(inv_min*mins[1]); - y[i].scales[0] = d1 | (m1 << 4); - y[i].scales[1] = d2 | (m2 << 4); - y[i].d[0] = GGML_FP32_TO_FP16(max_scale/s_factor); - y[i].d[1] = GGML_FP32_TO_FP16(max_min/s_factor); - float sumlx = 0; - int suml2 = 0; - for (int j = 0; j < QK_K/32; ++j) { - const uint8_t sd = y[i].scales[j] & 0xF; - const uint8_t sm = y[i].scales[j] >> 4; - const float d = GGML_FP16_TO_FP32(y[i].d[0]) * sd; - if (!d) continue; - const float m = GGML_FP16_TO_FP32(y[i].d[1]) * sm; - for (int ii = 0; ii < 32; ++ii) { - int l = nearest_int((x[32*j + ii] + m)/d); - l = MAX(0, MIN(15, l)); - L[32*j + ii] = l; - sumlx += (x[32*j + ii] + m)*l*sd; - suml2 += l*l*sd*sd; - } - } - if (suml2) { - y[i].d[0] = GGML_FP32_TO_FP16(sumlx/suml2); - } -#endif uint8_t * q = y[i].qs; for (int j = 0; j < QK_K; j += 64) { for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4); @@ -2709,7 +2542,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } x += QK_K; - } } @@ -2718,11 +2550,8 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6 const int nb = k / QK_K; for (int i = 0; i < nb; i++) { - const uint8_t * q = x[i].qs; -#if QK_K == 256 - const float d = GGML_FP16_TO_FP32(x[i].d); const float min = GGML_FP16_TO_FP32(x[i].dmin); @@ -2737,18 +2566,6 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6 for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2; q += 32; is += 2; } -#else - const float dall = GGML_FP16_TO_FP32(x[i].d[0]); - const float mall = GGML_FP16_TO_FP32(x[i].d[1]); - const float d1 = dall * (x[i].scales[0] & 0xF), m1 = mall * (x[i].scales[0] >> 4); - const float d2 = dall * (x[i].scales[1] & 0xF), m2 = mall * (x[i].scales[1] >> 4); - for (int l = 0; l < 32; ++l) { - y[l+ 0] = d1 * (q[l] & 0xF) - m1; - y[l+32] = d2 * (q[l] >> 4) - m2; - } - y += QK_K; -#endif - } } @@ -2759,10 +2576,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) } static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) { -#if QK_K != 256 - (void)quant_weights; - quantize_row_q4_K_reference(x, y, n_per_row); -#else assert(n_per_row % QK_K == 0); const int64_t nb = n_per_row / QK_K; @@ -2833,7 +2646,6 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri x += QK_K; } -#endif } size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { @@ -2858,21 +2670,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict assert(k % QK_K == 0); const int64_t nb = k / QK_K; -#if QK_K == 256 uint8_t L[QK_K]; float mins[QK_K/32]; float scales[QK_K/32]; float weights[32]; uint8_t Laux[32]; -#else - int8_t L[QK_K]; - float scales[QK_K/16]; -#endif for (int i = 0; i < nb; i++) { - -#if QK_K == 256 - float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { @@ -2944,55 +2748,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict m1 <<= 2; m2 <<= 2; ql += 32; } -#else - float max_scale = 0, amax = 0; - for (int j = 0; j < QK_K/16; ++j) { - scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1, NULL); - float abs_scale = fabsf(scales[j]); - if (abs_scale > amax) { - amax = abs_scale; - max_scale = scales[j]; - } - } - - float iscale = -128.f/max_scale; - for (int j = 0; j < QK_K/16; ++j) { - int l = nearest_int(iscale*scales[j]); - y[i].scales[j] = MAX(-128, MIN(127, l)); - } - y[i].d = GGML_FP32_TO_FP16(1/iscale); - - for (int j = 0; j < QK_K/16; ++j) { - const float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j]; - if (!d) continue; - for (int ii = 0; ii < 16; ++ii) { - int l = nearest_int(x[16*j + ii]/d); - l = MAX(-16, MIN(15, l)); - L[16*j + ii] = l + 16; - } - } - - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; - memset(qh, 0, QK_K/8); - - for (int j = 0; j < 32; ++j) { - int jm = j%8; - int is = j/8; - int l1 = L[j]; - if (l1 > 15) { - l1 -= 16; qh[jm] |= (1 << is); - } - int l2 = L[j + 32]; - if (l2 > 15) { - l2 -= 16; qh[jm] |= (1 << (4 + is)); - } - ql[j] = l1 | (l2 << 4); - } -#endif x += QK_K; - } } @@ -3001,12 +2758,9 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6 const int64_t nb = k / QK_K; for (int i = 0; i < nb; i++) { - const uint8_t * ql = x[i].qs; const uint8_t * qh = x[i].qh; -#if QK_K == 256 - const float d = GGML_FP16_TO_FP32(x[i].d); const float min = GGML_FP16_TO_FP32(x[i].dmin); @@ -3023,21 +2777,6 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6 ql += 32; is += 2; u1 <<= 2; u2 <<= 2; } -#else - float d = GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict s = x[i].scales; - for (int l = 0; l < 8; ++l) { - y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16)); - y[l+ 8] = d * s[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16)); - y[l+16] = d * s[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16)); - y[l+24] = d * s[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16)); - y[l+32] = d * s[2] * ((ql[l+ 0] >> 4) - (qh[l] & 0x10 ? 0 : 16)); - y[l+40] = d * s[2] * ((ql[l+ 8] >> 4) - (qh[l] & 0x20 ? 0 : 16)); - y[l+48] = d * s[3] * ((ql[l+16] >> 4) - (qh[l] & 0x40 ? 0 : 16)); - y[l+56] = d * s[3] * ((ql[l+24] >> 4) - (qh[l] & 0x80 ? 0 : 16)); - } - y += QK_K; -#endif } } @@ -3048,10 +2787,6 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) } static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) { -#if QK_K != 256 - (void)quant_weights; - quantize_row_q5_K_reference(x, y, n_per_row); -#else assert(n_per_row % QK_K == 0); const int64_t nb = n_per_row / QK_K; @@ -3142,7 +2877,6 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri x += QK_K; } -#endif } size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { @@ -3215,7 +2949,6 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict uint8_t * restrict ql = y[i].ql; uint8_t * restrict qh = y[i].qh; -#if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { const uint8_t q1 = L[j + l + 0] & 0xF; @@ -3229,19 +2962,8 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict ql += 64; qh += 32; } -#else - for (int l = 0; l < 32; ++l) { - const uint8_t q1 = L[l + 0] & 0xF; - const uint8_t q2 = L[l + 32] & 0xF; - ql[l] = q1 | (q2 << 4); - } - for (int l = 0; l < 16; ++l) { - qh[l] = (L[l] >> 4) | ((L[l + 16] >> 4) << 2) | ((L[l + 32] >> 4) << 4) | ((L[l + 48] >> 4) << 6); - } -#endif x += QK_K; - } } @@ -3250,14 +2972,12 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6 const int64_t nb = k / QK_K; for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict ql = x[i].ql; const uint8_t * restrict qh = x[i].qh; const int8_t * restrict sc = x[i].scales; -#if QK_K == 256 for (int n = 0; n < QK_K; n += 128) { for (int l = 0; l < 32; ++l) { int is = l/16; @@ -3275,20 +2995,6 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6 qh += 32; sc += 8; } -#else - for (int l = 0; l < 16; ++l) { - const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; - y[l+ 0] = d * sc[0] * q1; - y[l+16] = d * sc[1] * q2; - y[l+32] = d * sc[2] * q3; - y[l+48] = d * sc[3] * q4; - } - y += 64; -#endif - } } @@ -3299,10 +3005,6 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) } static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) { -#if QK_K != 256 - (void)quant_weights; - quantize_row_q6_K_reference(x, y, n_per_row); -#else assert(n_per_row % QK_K == 0); const int64_t nb = n_per_row / QK_K; @@ -3384,7 +3086,6 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri x += QK_K; } -#endif } size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { @@ -3801,30 +3502,21 @@ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, in float delta[4]; uint16_t idx[4]; -#if QK_K != 64 iq1m_scale_t scale; -#endif for (int i = 0; i < nb; i++) { const uint16_t * sc = (const uint16_t *)x[i].scales; -#if QK_K == 64 - const float d = GGML_FP16_TO_FP32(x[i].d); -#else scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); const float d = GGML_FP16_TO_FP32(scale.f16); -#endif + const uint8_t * qs = x[i].qs; const uint8_t * qh = x[i].qh; for (int ib = 0; ib < QK_K/32; ++ib) { -#if QK_K == 64 - const float dl1 = d * (2*((sc[ib/2] >> (8*(ib%2)+0)) & 0xf) + 1); - const float dl2 = d * (2*((sc[ib/2] >> (8*(ib%2)+4)) & 0xf) + 1); -#else const float dl1 = d * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1); const float dl2 = d * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1); -#endif + idx[0] = qs[0] | ((qh[0] << 8) & 0x700); idx[1] = qs[1] | ((qh[0] << 4) & 0x700); idx[2] = qs[2] | ((qh[1] << 8) & 0x700); @@ -3875,9 +3567,6 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); -#if QK_K == 64 - dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k); -#else const int64_t nb = k / QK_K; for (int i = 0; i < nb; i++) { @@ -3897,7 +3586,6 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, qs += 16; } } -#endif } //===================================== Q8_K ============================================== @@ -5849,7 +5537,6 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r #endif } -#if QK_K == 256 void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -6433,410 +6120,6 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r #endif } -#else - -void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q2_K * restrict x = vx; - const block_q8_K * restrict y = vy; - - const int nb = n / QK_K; - -#ifdef __ARM_NEON - const uint8x16_t m3 = vdupq_n_u8(0x3); - - const int32x4_t vzero = vdupq_n_s32(0); - - ggml_int8x16x4_t q2bytes; - - uint32_t aux32[2]; - const uint8_t * scales = (const uint8_t *)aux32; - - float sum = 0; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; - - aux32[0] = sc[0] & 0x0f0f0f0f; - aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; - - sum += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]); - - int isum1 = 0, isum2 = 0; - - const uint8x16_t q2bits = vld1q_u8(q2); - - const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); - - q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits, m3)); - q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 2), m3)); - q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3)); - q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3)); - - isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0]; - isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1]; - isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2]; - isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3]; - - sum += d * (isum1 + isum2); - } - - *s = sum; - -#elif defined __AVX2__ - - const __m256i m3 = _mm256_set1_epi8(3); - - __m256 acc = _mm256_setzero_ps(); - - uint32_t ud, um; - const uint8_t * restrict db = (const uint8_t *)&ud; - const uint8_t * restrict mb = (const uint8_t *)&um; - - float summs = 0; - - // TODO: optimize this - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; - ud = (sc[0] >> 0) & 0x0f0f0f0f; - um = (sc[0] >> 4) & 0x0f0f0f0f; - - int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3]; - summs += dmin * smin; - - const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); - const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3); - const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3); - - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); - - const __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0); - const __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1); - - const __m256i p_0 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p0, 0)); - const __m256i p_1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p0, 1)); - const __m256i p_2 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p1, 0)); - const __m256i p_3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p1, 1)); - - acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0), acc); - acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1), acc); - acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2), acc); - acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3), acc); - } - - *s = hsum_float_8(acc) + summs; - -#elif defined __AVX__ - - const __m128i m3 = _mm_set1_epi8(3); - - __m256 acc = _mm256_setzero_ps(); - - uint32_t ud, um; - const uint8_t * restrict db = (const uint8_t *)&ud; - const uint8_t * restrict mb = (const uint8_t *)&um; - - float summs = 0; - - // TODO: optimize this - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; - ud = (sc[0] >> 0) & 0x0f0f0f0f; - um = (sc[0] >> 4) & 0x0f0f0f0f; - - int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3]; - summs += dmin * smin; - - const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); - const __m128i q2_0 = _mm_and_si128(q2bits, m3); - const __m128i q2_1 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); - const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); - const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); - - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); - - const __m128i p0 = _mm_maddubs_epi16(q2_0, _mm256_extractf128_si256(q8_0, 0)); - const __m128i p1 = _mm_maddubs_epi16(q2_1, _mm256_extractf128_si256(q8_0, 1)); - const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0)); - const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1)); - - const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0)); - const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1)); - const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2)); - const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3)); - - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2)), acc); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3)), acc); - } - - *s = hsum_float_8(acc) + summs; - -#elif defined __riscv_v_intrinsic - - uint32_t aux32[2]; - const uint8_t * scales = (const uint8_t *)aux32; - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; - - aux32[0] = sc[0] & 0x0f0f0f0f; - aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; - - sumf += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]); - - int isum1 = 0; - int isum2 = 0; - - size_t vl = 16; - - vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1); - - // load Q2 - vuint8mf2_t q2_x = __riscv_vle8_v_u8mf2(q2, vl); - - vint8mf2_t q2_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q2_x, 0x03, vl)); - vint8mf2_t q2_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x2, vl), 0x03 , vl)); - vint8mf2_t q2_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x4, vl), 0x03 , vl)); - vint8mf2_t q2_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x6, vl), 0x03 , vl)); - - // load Q8, and take product with Q2 - vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q2_0, __riscv_vle8_v_i8mf2(q8, vl), vl); - vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q2_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl); - vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q2_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl); - vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q2_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl); - - vint16m1_t vs_0 = __riscv_vredsum_vs_i16m1_i16m1(p0, vzero, vl); - vint16m1_t vs_1 = __riscv_vredsum_vs_i16m1_i16m1(p1, vzero, vl); - vint16m1_t vs_2 = __riscv_vredsum_vs_i16m1_i16m1(p2, vzero, vl); - vint16m1_t vs_3 = __riscv_vredsum_vs_i16m1_i16m1(p3, vzero, vl); - - isum1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[0]; - isum2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[1]; - isum1 += __riscv_vmv_x_s_i16m1_i16(vs_2) * scales[2]; - isum2 += __riscv_vmv_x_s_i16m1_i16(vs_3) * scales[3]; - - sumf += d * (isum1 + isum2); - - } - - *s = sumf; - - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0x3); - const vector signed char lowScaleMask = vec_splats((signed char)0xF); - const vector unsigned char v2 = vec_splats((unsigned char)0x2); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - const vector unsigned char v6 = vec_splats((unsigned char)0x6); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - -#pragma GCC unroll 2 - for (int i = 0; i < nb; ++i) { - __builtin_prefetch(x[i].qs, 0, 1); - __builtin_prefetch(y[i].qs, 0, 1); - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin)); - vector float vdmin = vec_mul(vxmin, vyd); - - vector signed short q8ysums0 = vec_xl_len(y[i].bsums, 8); - - vector signed char q2xmins = (vector signed char)vec_xl_len(x[i].scales, 4); - vector signed char vscales = vec_and(q2xmins, lowScaleMask); - - q2xmins = vec_sr(q2xmins, v4); - vector signed short q2xmins0 = vec_unpackh((vector signed char)q2xmins); - - vector signed int prod0 = vec_mule(q2xmins0, q8ysums0); - vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0); - - vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0); - vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1); - - vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs); - vector signed char q2x00 = vec_and(qxs0, lowMask); - vector signed char q2x01 = vec_and(vec_sr(qxs0, v2), lowMask); - vector signed char q2x02 = vec_and(vec_sr(qxs0, v4), lowMask); - vector signed char q2x03 = vec_and(vec_sr(qxs0, v6), lowMask); - - vector signed char q8y00 = vec_xl( 0, y[i].qs); - vector signed char q8y01 = vec_xl( 16, y[i].qs); - vector signed char q8y02 = vec_xl( 32, y[i].qs); - vector signed char q8y03 = vec_xl( 48, y[i].qs); - - vector signed short qv0 = vec_add(vec_mule(q2x00, q8y00), vec_mulo(q2x00, q8y00)); - vector signed short qv1 = vec_add(vec_mule(q2x01, q8y01), vec_mulo(q2x01, q8y01)); - vector signed short qv2 = vec_add(vec_mule(q2x02, q8y02), vec_mulo(q2x02, q8y02)); - vector signed short qv3 = vec_add(vec_mule(q2x03, q8y03), vec_mulo(q2x03, q8y03)); - - vector signed short vscales_h = vec_unpackh(vscales); - vector signed short vs0 = vec_splat(vscales_h, 0); - vector signed short vs1 = vec_splat(vscales_h, 1); - vector signed short vs2 = vec_splat(vscales_h, 2); - vector signed short vs3 = vec_splat(vscales_h, 3); - - vector signed int vsumi0 = vec_add(vec_mule(qv0, vs0), vec_mulo(qv0, vs0)); - vector signed int vsumi1 = vec_add(vec_mule(qv1, vs1), vec_mulo(qv1, vs1)); - vector signed int vsumi2 = vec_add(vec_mule(qv2, vs2), vec_mulo(qv2, vs2)); - vector signed int vsumi3 = vec_add(vec_mule(qv3, vs3), vec_mulo(qv3, vs3)); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined __loongarch_asx - - const __m256i m3 = __lasx_xvreplgr2vr_b(3); - - __m256 acc = (__m256)__lasx_xvldi(0); - - uint32_t ud, um; - const uint8_t * restrict db = (const uint8_t *)&ud; - const uint8_t * restrict mb = (const uint8_t *)&um; - - float summs = 0; - - // TODO: optimize this - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; - ud = (sc[0] >> 0) & 0x0f0f0f0f; - um = (sc[0] >> 4) & 0x0f0f0f0f; - - int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3]; - summs += dmin * smin; - - const __m128i q2bits = __lsx_vld((const __m128i*)q2, 0); - const __m256i q2_0 = __lasx_xvand_v(lasx_insertf128(__lsx_vsrli_h(q2bits, 2), q2bits), m3); - const __m256i q2_1 = __lasx_xvand_v(lasx_insertf128(__lsx_vsrli_h(q2bits, 6), __lsx_vsrli_h(q2bits, 4)), m3); - - const __m256i q8_0 = __lasx_xvld((const __m256i*)(q8+ 0), 0); - const __m256i q8_1 = __lasx_xvld((const __m256i*)(q8+32), 0); - - const __m256i p0 = lasx_maddubs_h(q2_0, q8_0); - const __m256i p1 = lasx_maddubs_h(q2_1, q8_1); - - const __m256i p_0 = lasx_ext16_32(lasx_extracti128(p0, 0)); - const __m256i p_1 = lasx_ext16_32(lasx_extracti128(p0, 1)); - const __m256i p_2 = lasx_ext16_32(lasx_extracti128(p1, 0)); - const __m256i p_3 = lasx_ext16_32(lasx_extracti128(p1, 1)); - - ft_union t0, t1, t2, t3; - t0.f = d * db[0]; - t1.f = d * db[1]; - t2.f = d * db[2]; - t3.f = d * db[3]; - acc = __lasx_xvfmadd_s(__lasx_xvreplgr2vr_w(t0.i), __lasx_xvffint_s_w(p_0), acc); - acc = __lasx_xvfmadd_s(__lasx_xvreplgr2vr_w(t1.i), __lasx_xvffint_s_w(p_1), acc); - acc = __lasx_xvfmadd_s(__lasx_xvreplgr2vr_w(t2.i), __lasx_xvffint_s_w(p_2), acc); - acc = __lasx_xvfmadd_s(__lasx_xvreplgr2vr_w(t3.i), __lasx_xvffint_s_w(p_3), acc); - } - - *s = hsum_float_8(acc) + summs; - -#else - - float sumf = 0; - - int isum[QK_K/16]; - - for (int i = 0; i < nb; ++i) { - - const uint8_t * q2 = x[i].qs; - const int8_t * q8 = y[i].qs; - const uint8_t * sc = x[i].scales; - - int summs = 0; - for (int j = 0; j < QK_K/16; ++j) { - summs += y[i].bsums[j] * (sc[j] >> 4); - } - - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - memset(isum, 0, (QK_K/16)*sizeof(int)); - for (int l = 0; l < 16; ++l) { - isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3); - isum[1] += q8[l+16] * ((q2[l] >> 2) & 3); - isum[2] += q8[l+32] * ((q2[l] >> 4) & 3); - isum[3] += q8[l+48] * ((q2[l] >> 6) & 3); - } - for (int l = 0; l < QK_K/16; ++l) { - isum[l] *= (sc[l] & 0xF); - } - sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs; - } - *s = sumf; -#endif -} -#endif - -#if QK_K == 256 void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); @@ -7616,512 +6899,6 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r } -#else - -void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; - - const int nb = n / QK_K; - -#ifdef __ARM_NEON - const int32x4_t vzero = vdupq_n_s32(0); - - const uint8x16_t m3b = vdupq_n_u8(0x3); - const uint8x16_t mh = vdupq_n_u8(4); - - ggml_int8x16x4_t q3bytes; - - uint16_t aux16[2]; - int8_t * scales = (int8_t *)aux16; - - float sum = 0; - - for (int i = 0; i < nb; ++i) { - - ggml_uint8x16x4_t q3h; - - const uint8x8_t hbits = vld1_u8(x[i].hmask); - const uint8x16_t q3bits = vld1q_u8(x[i].qs); - const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(y[i].qs); - - const uint16_t a = *(const uint16_t *)x[i].scales; - aux16[0] = a & 0x0f0f; - aux16[1] = (a >> 4) & 0x0f0f; - - for (int j = 0; j < 4; ++j) scales[j] -= 8; - - int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]); - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1)); - q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2)); - q3h.val[1] = vandq_u8(mh, htmp); - q3h.val[2] = vandq_u8(mh, vshrq_n_u8(htmp, 2)); - q3h.val[3] = vandq_u8(mh, vshrq_n_u8(htmp, 4)); - - q3bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q3bits, m3b), q3h.val[0])); - q3bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 2), m3b), q3h.val[1])); - q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2])); - q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3])); - - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0]; - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2]; - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1]; - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3]; - - sum += d * isum; - - } - - *s = sum; - -#elif defined __AVX2__ - - const __m256i m3 = _mm256_set1_epi8(3); - const __m256i m1 = _mm256_set1_epi8(1); - - __m256 acc = _mm256_setzero_ps(); - - uint64_t aux64; - - uint16_t aux16[2]; - const int8_t * aux8 = (const int8_t *)aux16; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const uint16_t a = *(const uint16_t *)x[i].scales; - aux16[0] = a & 0x0f0f; - aux16[1] = (a >> 4) & 0x0f0f; - - const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8)); - const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8)); - - memcpy(&aux64, x[i].hmask, 8); - - const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0); - __m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux); - __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4); - q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2); - q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2); - - // load low 2 bits - const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3); - - // prepare low and high bits - const __m256i q3aux = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits); - const __m256i q3l_0 = _mm256_and_si256(q3aux, m3); - const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3); - - // load Q8 quants - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); - - // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, - // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, - // and 2 if the high bit was set) - const __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0); - const __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1); - - __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0); - __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1); - - p16_0 = _mm256_sub_epi16(p16_0, q8s_0); - p16_1 = _mm256_sub_epi16(p16_1, q8s_1); - - // multiply with scales - p16_0 = _mm256_madd_epi16(scale_0, p16_0); - p16_1 = _mm256_madd_epi16(scale_1, p16_1); - - p16_0 = _mm256_add_epi32(p16_0, p16_1); - - // multiply with block scale and accumulate - acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16_0), acc); - - } - - *s = hsum_float_8(acc); - -#elif defined __AVX__ - - const __m128i m3 = _mm_set1_epi8(3); - const __m128i m1 = _mm_set1_epi8(1); - - __m256 acc = _mm256_setzero_ps(); - - uint64_t aux64; - - uint16_t aux16[2]; - const int8_t * aux8 = (const int8_t *)aux16; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const uint16_t a = *(const uint16_t *)x[i].scales; - aux16[0] = a & 0x0f0f; - aux16[1] = (a >> 4) & 0x0f0f; - - const __m128i scale_0 = _mm_set1_epi16(aux8[0] - 8); - const __m128i scale_1 = _mm_set1_epi16(aux8[2] - 8); - const __m128i scale_2 = _mm_set1_epi16(aux8[1] - 8); - const __m128i scale_3 = _mm_set1_epi16(aux8[3] - 8); - - memcpy(&aux64, x[i].hmask, 8); - - __m128i q3h_0 = _mm_set_epi64x(aux64 >> 1, aux64 >> 0); - __m128i q3h_1 = _mm_srli_epi16(q3h_0, 2); - __m128i q3h_2 = _mm_srli_epi16(q3h_0, 4); - __m128i q3h_3 = _mm_srli_epi16(q3h_0, 6); - q3h_0 = _mm_slli_epi16(_mm_andnot_si128(q3h_0, m1), 2); - q3h_1 = _mm_slli_epi16(_mm_andnot_si128(q3h_1, m1), 2); - q3h_2 = _mm_slli_epi16(_mm_andnot_si128(q3h_2, m1), 2); - q3h_3 = _mm_slli_epi16(_mm_andnot_si128(q3h_3, m1), 2); - - // load low 2 bits - const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3); - - // prepare low and high bits - const __m128i q3l_0 = _mm_and_si128(q3bits, m3); - const __m128i q3l_1 = _mm_and_si128(_mm_srli_epi16(q3bits, 2), m3); - const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits, 4), m3); - const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits, 6), m3); - - // load Q8 quants - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); - - // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm_maddubs_epi16, - // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, - // and 2 if the high bit was set) - const __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, _mm256_extractf128_si256(q8_0, 0)); - const __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, _mm256_extractf128_si256(q8_0, 1)); - const __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, _mm256_extractf128_si256(q8_1, 0)); - const __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, _mm256_extractf128_si256(q8_1, 1)); - - __m128i p16_0 = _mm_maddubs_epi16(q3l_0, _mm256_extractf128_si256(q8_0, 0)); - __m128i p16_1 = _mm_maddubs_epi16(q3l_1, _mm256_extractf128_si256(q8_0, 1)); - __m128i p16_2 = _mm_maddubs_epi16(q3l_2, _mm256_extractf128_si256(q8_1, 0)); - __m128i p16_3 = _mm_maddubs_epi16(q3l_3, _mm256_extractf128_si256(q8_1, 1)); - - p16_0 = _mm_sub_epi16(p16_0, q8s_0); - p16_1 = _mm_sub_epi16(p16_1, q8s_1); - p16_2 = _mm_sub_epi16(p16_2, q8s_2); - p16_3 = _mm_sub_epi16(p16_3, q8s_3); - - // multiply with scales - p16_0 = _mm_madd_epi16(scale_0, p16_0); - p16_1 = _mm_madd_epi16(scale_1, p16_1); - p16_2 = _mm_madd_epi16(scale_2, p16_2); - p16_3 = _mm_madd_epi16(scale_3, p16_3); - - p16_0 = _mm_add_epi32(p16_0, p16_2); - p16_1 = _mm_add_epi32(p16_1, p16_3); - __m256i p16 = MM256_SET_M128I(p16_1, p16_0); - - // multiply with block scale and accumulate - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc); - - } - - *s = hsum_float_8(acc); - -#elif defined __riscv_v_intrinsic - - uint16_t aux16[2]; - int8_t * scales = (int8_t *)aux16; - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const uint16_t a = *(const uint16_t *)x[i].scales; - aux16[0] = a & 0x0f0f; - aux16[1] = (a >> 4) & 0x0f0f; - - for (int j = 0; j < 4; ++j) scales[j] -= 8; - - int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]); - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); - - // load qh - vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(x[i].hmask, 8); - vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8)); - - size_t vl = 16; - - // extend and combine both qh_x1 and qh_x2 - vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl); - - vuint8mf2_t qh_0 = __riscv_vand_vx_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl); - vuint8mf2_t qh_1 = __riscv_vand_vx_u8mf2(qh_x, 0x4, vl); - vuint8mf2_t qh_2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl); - vuint8mf2_t qh_3 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), 0x4, vl); - - // load Q3 - vuint8mf2_t q3_x = __riscv_vle8_v_u8mf2(q3, vl); - - vuint8mf2_t q3h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q3_x, 0x3, vl), qh_0, vl); - vuint8mf2_t q3h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 2, vl), 0x3, vl), qh_1, vl); - vuint8mf2_t q3h_2 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 4, vl), 0x3, vl), qh_2, vl); - vuint8mf2_t q3h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 0x6, vl), qh_3, vl); - - vint8mf2_t q3_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_0); - vint8mf2_t q3_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_1); - vint8mf2_t q3_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_2); - vint8mf2_t q3_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_3); - - // load Q8 and take product with Q3 - vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q3_0, __riscv_vle8_v_i8mf2(q8, vl), vl); - vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q3_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl); - vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q3_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl); - vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q3_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl); - - vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl); - vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl); - vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl); - vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl); - - isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scales[0]; - isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scales[2]; - isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scales[1]; - isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scales[3]; - - sumf += d * isum; - - } - - *s = sumf; - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0x3); - const vector signed char v1 = vec_splats((signed char)0x1); - const vector unsigned char v2 = vec_splats((unsigned char)0x2); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - const vector unsigned char v6 = vec_splats((unsigned char)0x6); - const vector signed char off = vec_splats((signed char)0x8); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - -#pragma GCC unroll 2 - for (int i = 0; i < nb; ++i) { - __builtin_prefetch(x[i].qs, 0, 1); - __builtin_prefetch(y[i].qs, 0, 1); - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - uint16_t aux16[2]; - int8_t * scales = (int8_t *)aux16; - - const uint16_t a = *(const uint16_t *)x[i].scales; - aux16[0] = a & 0x0f0f; - aux16[1] = (a >> 4) & 0x0f0f; - - vector signed char vscales = (vector signed char)vec_xl_len(scales, 8); - vector signed char qxhs0 = (vector signed char)vec_xl_len(x[i].hmask, 8); - qxhs0 = vec_or(qxhs0, vec_sr(vec_sld(qxhs0, qxhs0, 8), (vector unsigned char)v1)); - - vscales = vec_sub(vscales, off); - - vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs); - vector signed char qxs00 = vec_and(qxs0, lowMask); - vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask); - vector signed char qxs10 = vec_and(vec_sr(qxs0, v4), lowMask); - vector signed char qxs11 = vec_and(vec_sr(qxs0, v6), lowMask); - - //the 3rd bit - vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2); - vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2); - vector signed char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v4)), v2); - vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v6)), v2); - qxhs0 = vec_sr(qxhs0, v4); - - vector signed char q3x00 = vec_sub(qxs00, qxh00); - vector signed char q3x01 = vec_sub(qxs01, qxh01); - vector signed char q3x10 = vec_sub(qxs10, qxh02); - vector signed char q3x11 = vec_sub(qxs11, qxh03); - - vector signed char q8y00 = vec_xl( 0, y[i].qs); - vector signed char q8y01 = vec_xl( 16, y[i].qs); - vector signed char q8y10 = vec_xl( 32, y[i].qs); - vector signed char q8y11 = vec_xl( 48, y[i].qs); - - vector signed short vscales_h = vec_unpackh(vscales); - vector signed short vs0 = vec_splat(vscales_h, 0); - vector signed short vs1 = vec_splat(vscales_h, 1); - vector signed short vs2 = vec_splat(vscales_h, 2); - vector signed short vs3 = vec_splat(vscales_h, 3); - - vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00)); - vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10)); - vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01)); - vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11)); - - vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0)); - vector signed int vsumi1 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1)); - vector signed int vsumi2 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2)); - vector signed int vsumi3 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3)); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined __loongarch_asx - - const __m256i m3 = __lasx_xvreplgr2vr_b(3); - const __m256i m1 = __lasx_xvreplgr2vr_b(1); - - __m256 acc = (__m256)__lasx_xvldi(0); - - uint64_t aux64; - - uint16_t aux16[2]; - const int8_t * aux8 = (const int8_t *)aux16; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const __m256i scale_0 = lasx_insertf128(__lasx_xvreplgr2vr_h(aux8[2] - 8), __lasx_xvreplgr2vr_h(aux8[0] - 8)); - const __m256i scale_1 = lasx_insertf128(__lasx_xvreplgr2vr_h(aux8[3] - 8), __lasx_xvreplgr2vr_h(aux8[1] - 8)); - - memcpy(&aux64, x[i].hmask, 8); - - __m128i haux = __lsx_vinsgr2vr_d(haux, aux64, 0); - haux = __lsx_vinsgr2vr_d(haux, aux64 >> 1, 1); - __m256i q3h_0 = lasx_insertf128(__lsx_vsrli_h(haux, 2), haux); - __m256i q3h_1 = __lasx_xvsrli_h(q3h_0, 4); - q3h_0 = __lasx_xvslli_h(__lasx_xvandn_v(q3h_0, m1), 2); - q3h_1 = __lasx_xvslli_h(__lasx_xvandn_v(q3h_1, m1), 2); - - // load low 2 bits - const __m128i q3bits = __lsx_vld((const __m128i*)q3, 0); - - // prepare low and high bits - const __m256i q3aux = lasx_insertf128(__lsx_vsrli_h(q3bits, 2), q3bits); - const __m256i q3l_0 = __lasx_xvand_v(q3aux, m3); - const __m256i q3l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q3aux, 4), m3); - - // load Q8 quants - const __m256i q8_0 = __lasx_xvld((const __m256i*)(q8+ 0), 0); - const __m256i q8_1 = __lasx_xvld((const __m256i*)(q8+32), 0); - - // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use lasx_maddubs_h, - // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, - // and 2 if the high bit was set) - const __m256i q8s_0 = lasx_maddubs_h(q3h_0, q8_0); - const __m256i q8s_1 = lasx_maddubs_h(q3h_1, q8_1); - - __m256i p16_0 = lasx_maddubs_h(q3l_0, q8_0); - __m256i p16_1 = lasx_maddubs_h(q3l_1, q8_1); - - p16_0 = __lasx_xvsub_h(p16_0, q8s_0); - p16_1 = __lasx_xvsub_h(p16_1, q8s_1); - - // multiply with scales - p16_0 = lasx_madd_h(scale_0, p16_0); - p16_1 = lasx_madd_h(scale_1, p16_1); - - p16_0 = __lasx_xvadd_w(p16_0, p16_1); - - // multiply with block scale and accumulate - acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(p16_0), acc); - } - - *s = hsum_float_8(acc); - -#else - - int8_t aux8[QK_K]; - int16_t aux16[8]; - float sums [8]; - int32_t aux32[8]; - int32_t scales[4]; - memset(sums, 0, 8*sizeof(float)); - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; - for (int l = 0; l < 8; ++l) { - a[l+ 0] = (int8_t)((q3[l+0] >> 0) & 3) - (hm[l] & 0x01 ? 0 : 4); - a[l+ 8] = (int8_t)((q3[l+8] >> 0) & 3) - (hm[l] & 0x02 ? 0 : 4); - a[l+16] = (int8_t)((q3[l+0] >> 2) & 3) - (hm[l] & 0x04 ? 0 : 4); - a[l+24] = (int8_t)((q3[l+8] >> 2) & 3) - (hm[l] & 0x08 ? 0 : 4); - a[l+32] = (int8_t)((q3[l+0] >> 4) & 3) - (hm[l] & 0x10 ? 0 : 4); - a[l+40] = (int8_t)((q3[l+8] >> 4) & 3) - (hm[l] & 0x20 ? 0 : 4); - a[l+48] = (int8_t)((q3[l+0] >> 6) & 3) - (hm[l] & 0x40 ? 0 : 4); - a[l+56] = (int8_t)((q3[l+8] >> 6) & 3) - (hm[l] & 0x80 ? 0 : 4); - } - - scales[0] = (x[i].scales[0] & 0xF) - 8; - scales[1] = (x[i].scales[0] >> 4) - 8; - scales[2] = (x[i].scales[1] & 0xF) - 8; - scales[3] = (x[i].scales[1] >> 4) - 8; - - memset(aux32, 0, 8*sizeof(int32_t)); - for (int j = 0; j < QK_K/16; ++j) { - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] += q8[l] * a[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux32[l] += scales[j] * aux16[l]; - } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - } - for (int l = 0; l < 8; ++l) sumf += sums[l]; - *s = sumf; - -#endif - -} -#endif - -#if QK_K == 256 void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); @@ -8679,381 +7456,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r *s = sumf; #endif } -#else -void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - const block_q4_K * restrict x = vx; - const block_q8_K * restrict y = vy; - - const int nb = n / QK_K; - -#ifdef __ARM_NEON - const uint8x16_t m4b = vdupq_n_u8(0xf); - - const int32x4_t mzero = vdupq_n_s32(0); - - float sumf = 0; - - ggml_int8x16x2_t q4bytes; - ggml_int8x16x4_t q8bytes; - - float sum_mins = 0.f; - - uint16_t aux16[2]; - const uint8_t * restrict scales = (const uint8_t *)aux16; - - for (int i = 0; i < nb; ++i) { - - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const uint16_t * restrict a = (const uint16_t *)x[i].scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; - - const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]); - sum_mins += y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * summi; - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]); - - const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); - - q8bytes = ggml_vld1q_s8_x4(q8); - q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); - q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); - - const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); - const int32_t sumi1 = vaddvq_s32(p1) * scales[0]; - - q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); - q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); - - const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]); - const int32_t sumi2 = vaddvq_s32(p2) * scales[1]; - - sumf += d * (sumi1 + sumi2); - } - - *s = sumf - sum_mins; - -#elif defined __AVX2__ - - const __m256i m4 = _mm256_set1_epi8(0xF); - - __m256 acc = _mm256_setzero_ps(); - - float summs = 0; - - uint16_t aux16[2]; - const uint8_t * scales = (const uint8_t *)aux16; - - for (int i = 0; i < nb; ++i) { - - const float d = GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d; - const float m = GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d; - const __m256 vd = _mm256_set1_ps(d); - - const uint16_t * a = (const uint16_t *)x[i].scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; - - summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); - const __m256i q4l = _mm256_and_si256(q4bits, m4); - const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4); - - const __m256i q8l = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8h = _mm256_loadu_si256((const __m256i*)(q8+32)); - - const __m256i p16l = _mm256_maddubs_epi16(q4l, q8l); - const __m256i p16h = _mm256_maddubs_epi16(q4h, q8h); - - const __m256i p32l = _mm256_madd_epi16(_mm256_set1_epi16(scales[0]), p16l); - acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(p32l), acc); - - const __m256i p32h = _mm256_madd_epi16(_mm256_set1_epi16(scales[1]), p16h); - acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(p32h), acc); - - } - - *s = hsum_float_8(acc) - summs; - -#elif defined __AVX__ - - const __m128i m4 = _mm_set1_epi8(0xF); - - __m256 acc = _mm256_setzero_ps(); - - float summs = 0; - - uint16_t aux16[2]; - const uint8_t * scales = (const uint8_t *)aux16; - - for (int i = 0; i < nb; ++i) { - - const float d = GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d; - const float m = GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d; - const __m256 vd = _mm256_set1_ps(d); - - const uint16_t * a = (const uint16_t *)x[i].scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; - - summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); - const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0); - const __m128i q4bits_1 = _mm256_extractf128_si256(q4bits, 1); - const __m128i q4_0 = _mm_and_si128(q4bits_0, m4); - const __m128i q4_1 = _mm_and_si128(q4bits_1, m4); - const __m128i q4_2 = _mm_and_si128(_mm_srli_epi16(q4bits_0, 4), m4); - const __m128i q4_3 = _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4); - - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); - - const __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0)); - const __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1)); - const __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0)); - const __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1)); - - const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0); - const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1); - acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc); - - const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2); - const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3); - acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc); - - } - - *s = hsum_float_8(acc) - summs; - -#elif defined __riscv_v_intrinsic - - uint16_t s16[2]; - const uint8_t * restrict scales = (const uint8_t *)s16; - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const uint16_t * restrict b = (const uint16_t *)x[i].scales; - s16[0] = b[0] & 0x0f0f; - s16[1] = (b[0] >> 4) & 0x0f0f; - - sumf -= y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]); - - size_t vl = 32; - - vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1); - - // load Q4 - vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl); - - // load Q8 and multiply it with lower Q4 nibble - vint8m1_t q4_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl)); - vint16m2_t va_0 = __riscv_vwmul_vv_i16m2(q4_a, __riscv_vle8_v_i8m1(q8, vl), vl); - vint16m1_t aux1 = __riscv_vredsum_vs_i16m2_i16m1(va_0, vzero, vl); - - sumf += d*scales[0]*__riscv_vmv_x_s_i16m1_i16(aux1); - - // load Q8 and multiply it with upper Q4 nibble - vint8m1_t q4_s = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl)); - vint16m2_t va_1 = __riscv_vwmul_vv_i16m2(q4_s, __riscv_vle8_v_i8m1(q8+32, vl), vl); - vint16m1_t aux2 = __riscv_vredsum_vs_i16m2_i16m1(va_1, vzero, vl); - - sumf += d*scales[1]*__riscv_vmv_x_s_i16m1_i16(aux2); - - } - - *s = sumf; - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0xF); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - -#pragma GCC unroll 2 - for (int i = 0; i < nb; ++i) { - __builtin_prefetch(x[i].qs, 0, 1); - __builtin_prefetch(y[i].qs, 0, 1); - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d[1])); - vector float vyd = vec_splats(y[i].d); - vector float vd= vec_mul(vxd, vyd); - - uint16_t s16[2]; - const uint8_t * scales = (const uint8_t *)s16; - - const uint16_t * restrict b = (const uint16_t *)x[i].scales; - s16[0] = b[0] & 0x0f0f; - s16[1] = (b[0] >> 4) & 0x0f0f; - - vector signed char utmps = (vector signed char)vec_xl_len(scales, 4); - vector signed short vscales = (vector signed short)vec_unpackh(utmps); - vector signed short q4xmins0 = vec_mergeh(vscales, vscales); - q4xmins0 = vec_sld(q4xmins0, q4xmins0, 8); - - vector signed short q8ysums0 = vec_xl_len((const int16_t *)(y[i].bsums), 8); - - vector signed int prod0 = vec_mule(q4xmins0, q8ysums0); - vector signed int prod1 = vec_mulo(q4xmins0, q8ysums0); - - vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vd, vsumf0); - vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vd, vsumf1); - - vd = vec_mul(vyd, vec_splats(GGML_FP16_TO_FP32(x[i].d[0]))); - - vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs); - vector signed char qxs1 = (vector signed char)vec_xl(16, x[i].qs); - vector signed char q4x00 = vec_and(qxs0, lowMask); - vector signed char q4x01 = vec_sr(qxs0, v4); - vector signed char q4x10 = vec_and(qxs1, lowMask); - vector signed char q4x11 = vec_sr(qxs1, v4); - - vector signed char q8y00 = vec_xl( 0, y[i].qs); - vector signed char q8y10 = vec_xl(16, y[i].qs); - vector signed char q8y01 = vec_xl(32, y[i].qs); - vector signed char q8y11 = vec_xl(48, y[i].qs); - - vector signed short qv00 = vec_add(vec_mule(q4x00, q8y00), vec_mulo(q4x00, q8y00)); - vector signed short qv01 = vec_add(vec_mule(q4x01, q8y01), vec_mulo(q4x01, q8y01)); - vector signed short qv10 = vec_add(vec_mule(q4x10, q8y10), vec_mulo(q4x10, q8y10)); - vector signed short qv11 = vec_add(vec_mule(q4x11, q8y11), vec_mulo(q4x11, q8y11)); - - vector signed short vs0 = vec_splat(vscales, 0); - vector signed short vs1 = vec_splat(vscales, 1); - - vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0)); - vector signed int vsumi1 = vec_add(vec_mule(qv10, vs0), vec_mulo(qv10, vs0)); - vector signed int vsumi2 = vec_add(vec_mule(qv01, vs1), vec_mulo(qv01, vs1)); - vector signed int vsumi3 = vec_add(vec_mule(qv11, vs1), vec_mulo(qv11, vs1)); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined __loongarch_asx - - const __m256i m4 = __lasx_xvreplgr2vr_b(0xF); - - __m256 acc = (__m256)__lasx_xvldi(0); - - float summs = 0; - - uint16_t aux16[2]; - const uint8_t * scales = (const uint8_t *)aux16; - - for (int i = 0; i < nb; ++i) { - - const float d = GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d; - const float m = GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d; - const __m256 vd = __lasx_xvreplfr2vr_s(d); - - const uint16_t * a = (const uint16_t *)x[i].scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; - - summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const __m256i q4bits = __lasx_xvld((const __m256i*)q4, 0); - const __m256i q4l = __lasx_xvand_v(q4bits, m4); - const __m256i q4h = __lasx_xvand_v(__lasx_xvsrli_h(q4bits, 4), m4); - - const __m256i q8l = __lasx_xvld((const __m256i*)(q8+ 0), 0); - const __m256i q8h = __lasx_xvld((const __m256i*)(q8+32), 0); - - const __m256i p16l = lasx_maddubs_h(q4l, q8l); - const __m256i p16h = lasx_maddubs_h(q4h, q8h); - - const __m256i p32l = lasx_madd_h(__lasx_xvreplgr2vr_h(scales[0]), p16l); - acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(p32l), acc); - - const __m256i p32h = lasx_madd_h(__lasx_xvreplgr2vr_h(scales[1]), p16h); - acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(p32h), acc); - } - - *s = hsum_float_8(acc) - summs; - -#else - - uint8_t aux8[QK_K]; - int16_t aux16[16]; - float sums [8]; - memset(sums, 0, 8*sizeof(float)); - - uint16_t s16[2]; - const uint8_t * restrict scales = (const uint8_t *)s16; - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - uint8_t * restrict a = aux8; - for (int l = 0; l < 32; ++l) a[l+ 0] = q4[l] & 0xF; - for (int l = 0; l < 32; ++l) a[l+32] = q4[l] >> 4; - - const uint16_t * restrict b = (const uint16_t *)x[i].scales; - s16[0] = b[0] & 0x0f0f; - s16[1] = (b[0] >> 4) & 0x0f0f; - - sumf -= y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]); - - for (int j = 0; j < QK_K/32; ++j) { - for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; - q8 += 16; a += 16; - for (int l = 0; l < 16; ++l) aux16[l] += q8[l] * a[l]; - q8 += 16; a += 16; - const float dl = d * scales[j]; - for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[l+8]); - } - } - for (int l = 0; l < 8; ++l) sumf += sums[l]; - *s = sumf; -#endif -} -#endif - -#if QK_K == 256 void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); @@ -9151,12 +7554,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r float summs = 0.f; - for (int i = 0; i < nb; ++i) { - + for (int i = 0; i < nb; ++i) { const uint8_t * restrict q5 = x[i].qs; const int8_t * restrict q8 = y[i].qs; -#if QK_K == 256 const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); @@ -9166,10 +7567,6 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); utmp[2] = uaux; utmp[0] &= kmask1; -#else - // TODO - const float d = 0, dmin = 0; -#endif const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); @@ -9552,15 +7949,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * restrict q5 = x[i].qs; const int8_t * restrict q8 = y[i].qs; -#if QK_K == 256 const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); -#else - // TODO - const float d = 0, dmin = 0; -#endif const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0])); @@ -9679,402 +8071,6 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r #endif } -#else - -void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; - - const int nb = n / QK_K; - -#ifdef __ARM_NEON - const uint8x16_t m4b = vdupq_n_u8(0xf); - const uint8x16_t mh = vdupq_n_u8(16); - const int32x4_t mzero = vdupq_n_s32(0); - - ggml_int8x16x4_t q5bytes; - ggml_uint8x16x4_t q5h; - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const int8_t * sc = x[i].scales; - - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - - const uint8x8_t qhbits = vld1_u8(qh); - - const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); - const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); - - const uint8x16_t htmp = vcombine_u8(qhbits, vshr_n_u8(qhbits, 1)); - q5h.val[0] = vbicq_u8(mh, vshlq_n_u8(htmp, 4)); - q5h.val[1] = vbicq_u8(mh, vshlq_n_u8(htmp, 2)); - q5h.val[2] = vbicq_u8(mh, htmp); - q5h.val[3] = vbicq_u8(mh, vshrq_n_u8(htmp, 2)); - - q5bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q5bits.val[0], m4b)), vreinterpretq_s8_u8(q5h.val[0])); - q5bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q5bits.val[1], m4b)), vreinterpretq_s8_u8(q5h.val[1])); - q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2])); - q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3])); - - int32_t sumi1 = sc[0] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0])); - int32_t sumi2 = sc[1] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1])); - int32_t sumi3 = sc[2] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2])); - int32_t sumi4 = sc[3] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3])); - - sumf += d * (sumi1 + sumi2 + sumi3 + sumi4); - } - - *s = sumf; - -#elif defined __AVX2__ - - const __m256i m4 = _mm256_set1_epi8(0xF); - const __m256i mone = _mm256_set1_epi8(1); - - __m256 acc = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); - - const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0])); - const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2])); - - int64_t aux64; - memcpy(&aux64, x[i].qh, 8); - const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64); - const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128); - - const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4); - const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4); - - const __m256i q5l_0 = _mm256_and_si256(q5bits, m4); - const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4); - - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); - - const __m256i p16_0 = _mm256_madd_epi16(scale_l, _mm256_maddubs_epi16(q5l_0, q8_0)); - const __m256i p16_1 = _mm256_madd_epi16(scale_h, _mm256_maddubs_epi16(q5l_1, q8_1)); - const __m256i s16_0 = _mm256_madd_epi16(scale_l, _mm256_maddubs_epi16(q5h_0, q8_0)); - const __m256i s16_1 = _mm256_madd_epi16(scale_h, _mm256_maddubs_epi16(q5h_1, q8_1)); - - const __m256i dot = _mm256_sub_epi32(_mm256_add_epi32(p16_0, p16_1), _mm256_add_epi32(s16_0, s16_1)); - - acc = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(dot), acc); - - } - - *s = hsum_float_8(acc); - -#elif defined __AVX__ - - const __m128i m4 = _mm_set1_epi8(0xF); - const __m128i mone = _mm_set1_epi8(1); - - __m256 acc = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); - - const __m128i scale_0 = _mm_set1_epi16(x[i].scales[0]); - const __m128i scale_1 = _mm_set1_epi16(x[i].scales[1]); - const __m128i scale_2 = _mm_set1_epi16(x[i].scales[2]); - const __m128i scale_3 = _mm_set1_epi16(x[i].scales[3]); - - int64_t aux64; - memcpy(&aux64, x[i].qh, 8); - const __m128i haux128_0 = _mm_set_epi64x(aux64 >> 1, aux64); - const __m128i haux128_1 = _mm_srli_epi16(haux128_0, 2); - - const __m128i q5h_0 = _mm_slli_epi16(_mm_andnot_si128(haux128_0, mone), 4); - const __m128i q5h_1 = _mm_slli_epi16(_mm_andnot_si128(haux128_1, mone), 4); - const __m128i q5h_2 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_0, 4), mone), 4); - const __m128i q5h_3 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_1, 4), mone), 4); - - const __m128i q5l_0 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 0), m4); - const __m128i q5l_1 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 1), m4); - const __m128i q5l_2 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 0), 4), m4); - const __m128i q5l_3 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 1), 4), m4); - - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); - - const __m128i p16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5l_0, _mm256_extractf128_si256(q8_0, 0))); - const __m128i p16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5l_1, _mm256_extractf128_si256(q8_0, 1))); - const __m128i p16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5l_2, _mm256_extractf128_si256(q8_1, 0))); - const __m128i p16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5l_3, _mm256_extractf128_si256(q8_1, 1))); - const __m128i s16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5h_0, _mm256_extractf128_si256(q8_0, 0))); - const __m128i s16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5h_1, _mm256_extractf128_si256(q8_0, 1))); - const __m128i s16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5h_2, _mm256_extractf128_si256(q8_1, 0))); - const __m128i s16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5h_3, _mm256_extractf128_si256(q8_1, 1))); - - const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2)); - const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3)); - - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc); - - } - - *s = hsum_float_8(acc); - -#elif defined __riscv_v_intrinsic - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const int8_t * sc = x[i].scales; - - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - - vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); - - // load qh - vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(qh, 8); - vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8)); - - size_t vl = 16; - - // combine both qh_1 and qh_2 - vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl); - - vuint8mf2_t qh_h0 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl); - vuint8mf2_t qh_h1 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), vl), 16, vl); - vuint8mf2_t qh_h2 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(qh_x, vl), 16, vl); - vuint8mf2_t qh_h3 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl); - - vint8mf2_t qh_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h0); - vint8mf2_t qh_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h1); - vint8mf2_t qh_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h2); - vint8mf2_t qh_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h3); - - // load q5 - vuint8mf2_t q5_x1 = __riscv_vle8_v_u8mf2(q5, vl); - vuint8mf2_t q5_x2 = __riscv_vle8_v_u8mf2(q5+16, vl); - - vint8mf2_t q5s_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x1, 0xF, vl)); - vint8mf2_t q5s_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x2, 0xF, vl)); - vint8mf2_t q5s_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x1, 0x4, vl)); - vint8mf2_t q5s_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x2, 0x4, vl)); - - vint8mf2_t q5_0 = __riscv_vsub_vv_i8mf2(q5s_0, qh_0, vl); - vint8mf2_t q5_1 = __riscv_vsub_vv_i8mf2(q5s_1, qh_1, vl); - vint8mf2_t q5_2 = __riscv_vsub_vv_i8mf2(q5s_2, qh_2, vl); - vint8mf2_t q5_3 = __riscv_vsub_vv_i8mf2(q5s_3, qh_3, vl); - - // load Q8 and multiply it with Q5 - vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q5_0, __riscv_vle8_v_i8mf2(q8, vl), vl); - vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q5_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl); - vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q5_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl); - vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q5_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl); - - vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl); - vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl); - vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl); - vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl); - - int32_t sumi1 = sc[0] * __riscv_vmv_x_s_i32m1_i32(vs_0); - int32_t sumi2 = sc[1] * __riscv_vmv_x_s_i32m1_i32(vs_1); - int32_t sumi3 = sc[2] * __riscv_vmv_x_s_i32m1_i32(vs_2); - int32_t sumi4 = sc[3] * __riscv_vmv_x_s_i32m1_i32(vs_3); - - sumf += d * (sumi1 + sumi2 + sumi3 + sumi4); - - } - - *s = sumf; - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0xF); - const vector unsigned char v1 = vec_splats((unsigned char)0x1); - const vector unsigned char v2 = vec_splats((unsigned char)0x2); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - -#pragma GCC unroll 2 - for (int i = 0; i < nb; ++i) { - __builtin_prefetch(x[i].qs, 0, 1); - __builtin_prefetch(y[i].qs, 0, 1); - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd= vec_mul(vxd, vyd); - - vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs); - vector signed char qxs1 = (vector signed char)vec_xl(16, x[i].qs); - vector signed char qxs00 = (vector signed char)vec_and(qxs0, lowMask); - vector signed char qxs01 = (vector signed char)vec_sr(qxs0, v4); - vector signed char qxs10 = (vector signed char)vec_and(qxs1, lowMask); - vector signed char qxs11 = (vector signed char)vec_sr(qxs1, v4); - - vector signed char qxhs = (vector signed char)vec_xl_len(x[i].qh, 8); - vector signed char qxhs0 = vec_or(qxhs, vec_sr(vec_sld(qxhs, qxhs, 8), v1)); - vector signed char qxhs1 = vec_sr(qxhs0, v2); - vector signed char qxh00 = vec_sl(vec_andc((vector signed char)v1, qxhs0), v4); - vector signed char qxh10 = vec_sl(vec_andc((vector signed char)v1, qxhs1), v4); - vector signed char qxh01 = vec_sl(vec_andc((vector signed char)v1, vec_sr(qxhs0, v4)), v4); - vector signed char qxh11 = vec_sl(vec_andc((vector signed char)v1, vec_sr(qxhs1, v4)), v4); - - vector signed char q5x00 = vec_sub(qxs00, qxh00); - vector signed char q5x10 = vec_sub(qxs10, qxh10); - vector signed char q5x01 = vec_sub(qxs01, qxh01); - vector signed char q5x11 = vec_sub(qxs11, qxh11); - - vector signed char q8y00 = vec_xl( 0, y[i].qs); - vector signed char q8y10 = vec_xl(16, y[i].qs); - vector signed char q8y01 = vec_xl(32, y[i].qs); - vector signed char q8y11 = vec_xl(48, y[i].qs); - - vector signed short qv00 = vec_add(vec_mule(q5x00, q8y00), vec_mulo(q5x00, q8y00)); - vector signed short qv01 = vec_add(vec_mule(q5x01, q8y01), vec_mulo(q5x01, q8y01)); - vector signed short qv10 = vec_add(vec_mule(q5x10, q8y10), vec_mulo(q5x10, q8y10)); - vector signed short qv11 = vec_add(vec_mule(q5x11, q8y11), vec_mulo(q5x11, q8y11)); - - vector signed short vs = (vector signed short)vec_unpackh(vec_xl_len(x[i].scales, 4)); - vector signed short vs0 = vec_splat(vs, 0); - vector signed short vs1 = vec_splat(vs, 1); - vector signed short vs2 = vec_splat(vs, 2); - vector signed short vs3 = vec_splat(vs, 3); - - vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0)); - vector signed int vsumi1 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1)); - vector signed int vsumi2 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2)); - vector signed int vsumi3 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3)); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined __loongarch_asx - - const __m256i m4 = __lasx_xvreplgr2vr_b(0xF); - const __m256i mone = __lasx_xvreplgr2vr_b(1); - - __m256 acc = (__m256)__lasx_xvldi(0); - - for (int i = 0; i < nb; ++i) { - - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); - - const __m256i scale_l = lasx_insertf128(__lsx_vreplgr2vr_h(x[i].scales[1]), __lsx_vreplgr2vr_h(x[i].scales[0])); - const __m256i scale_h = lasx_insertf128(__lsx_vreplgr2vr_h(x[i].scales[3]), __lsx_vreplgr2vr_h(x[i].scales[2])); - - int64_t aux64; - memcpy(&aux64, x[i].qh, 8); - __m128i haux128 = __lsx_vinsgr2vr_d(haux128, aux64, 0); - haux128 = __lsx_vinsgr2vr_d(haux128, aux64 >> 1, 1); - const __m256i haux256 = lasx_insertf128(__lsx_vsrli_h(haux128, 2), haux128); - - const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvandn_v(haux256, mone), 4); - const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvandn_v(__lasx_xvsrli_h(haux256, 4), mone), 4); - - const __m256i q5l_0 = __lasx_xvand_v(q5bits, m4); - const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4); - - const __m256i q8_0 = __lasx_xvld((const __m256i*)(q8+ 0), 0); - const __m256i q8_1 = __lasx_xvld((const __m256i*)(q8+32), 0); - - const __m256i p16_0 = lasx_madd_h(scale_l, lasx_maddubs_h(q5l_0, q8_0)); - const __m256i p16_1 = lasx_madd_h(scale_h, lasx_maddubs_h(q5l_1, q8_1)); - const __m256i s16_0 = lasx_madd_h(scale_l, lasx_maddubs_h(q5h_0, q8_0)); - const __m256i s16_1 = lasx_madd_h(scale_h, lasx_maddubs_h(q5h_1, q8_1)); - - const __m256i dot = __lasx_xvsub_w(__lasx_xvadd_w(p16_0, p16_1), __lasx_xvadd_w(s16_0, s16_1)); - - acc = __lasx_xvfmadd_s((__m256)__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(dot), acc); - } - - *s = hsum_float_8(acc); - -#else - - int8_t aux8[QK_K]; - int16_t aux16[16]; - float sums [8]; - memset(sums, 0, 8*sizeof(float)); - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; - for (int l = 0; l < 32; ++l) { - a[l+ 0] = q4[l] & 0xF; - a[l+32] = q4[l] >> 4; - } - for (int is = 0; is < 8; ++is) { - uint8_t m = 1 << is; - for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16); - } - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict sc = x[i].scales; - - for (int j = 0; j < QK_K/16; ++j) { - const float dl = d * sc[j]; - for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[8+l]); - q8 += 16; a += 16; - } - } - for (int l = 0; l < 8; ++l) sumf += sums[l]; - *s = sumf; -#endif -} -#endif - - -#if QK_K == 256 void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); @@ -10733,446 +8729,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r #endif } -#else - -void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q6_K * restrict x = vx; - const block_q8_K * restrict y = vy; - - const int nb = n / QK_K; - -#ifdef __ARM_NEON - float sum = 0; - - const uint8x16_t m4b = vdupq_n_u8(0xF); - const int8x16_t m32s = vdupq_n_s8(32); - const int32x4_t vzero = vdupq_n_s32(0); - - const uint8x16_t mone = vdupq_n_u8(3); - - ggml_int8x16x4_t q6bytes; - ggml_uint8x16x4_t q6h; - - for (int i = 0; i < nb; ++i) { - - const float d_all = GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - - const int8_t * restrict scale = x[i].scales; - - int32_t isum = 0; - - uint8x16_t qhbits = vld1q_u8(qh); - ggml_uint8x16x2_t q6bits = ggml_vld1q_u8_x2(q6); - ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); - - q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits), 4); - uint8x16_t shifted = vshrq_n_u8(qhbits, 2); - q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4); - shifted = vshrq_n_u8(qhbits, 4); - q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4); - shifted = vshrq_n_u8(qhbits, 6); - q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4); - - q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s); - q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s); - q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s); - q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s); - - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + - vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + - vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + - vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; - - sum += isum * d_all * y[i].d; - - } - *s = sum; - -#elif defined __AVX2__ - - const __m256i m4 = _mm256_set1_epi8(0xF); - const __m256i m2 = _mm256_set1_epi8(3); - const __m256i m32s = _mm256_set1_epi8(32); - - __m256 acc = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - - const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); - const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); - const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]); - const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]); - - __m256i sumi = _mm256_setzero_si256(); - - const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1); - const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3); - - const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); - const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh); - - const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4); - const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4); - - const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0); - const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1); - - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); - - __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0); - __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1); - - __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0); - __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1); - - p16_0 = _mm256_sub_epi16(p16_0, q8s_0); - p16_1 = _mm256_sub_epi16(p16_1, q8s_1); - - p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0); - p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1); - - sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1)); - - acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); - } - - *s = hsum_float_8(acc); - -#elif defined __AVX__ - - const __m128i m4 = _mm_set1_epi8(0xF); - const __m128i m2 = _mm_set1_epi8(3); - const __m128i m32s = _mm_set1_epi8(32); - - __m256 acc = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - - const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); - const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); - const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]); - const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]); - - __m128i sumi_0 = _mm_setzero_si128(); - __m128i sumi_1 = _mm_setzero_si128(); - - const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1); - const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3); - - const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); - const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh); - - const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH, m2), 4); - const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 2), m2), 4); - const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 4), m2), 4); - const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 6), m2), 4); - - const __m128i q4_0 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 0), m4), q4h_0); - const __m128i q4_1 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 1), m4), q4h_1); - const __m128i q4_2 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 0), 4), m4), q4h_2); - const __m128i q4_3 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 1), 4), m4), q4h_3); - - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); - - __m128i q8s_0 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 0)); - __m128i q8s_1 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 1)); - __m128i q8s_2 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 0)); - __m128i q8s_3 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 1)); - - __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0)); - __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1)); - __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0)); - __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1)); - - p16_0 = _mm_sub_epi16(p16_0, q8s_0); - p16_1 = _mm_sub_epi16(p16_1, q8s_1); - p16_2 = _mm_sub_epi16(p16_2, q8s_2); - p16_3 = _mm_sub_epi16(p16_3, q8s_3); - - p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0); - p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1); - p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2); - p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3); - - sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); - sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); - - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc); - } - - *s = hsum_float_8(acc); - -#elif defined __riscv_v_intrinsic - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - const float d_all = GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - - const int8_t * restrict scale = x[i].scales; - - int32_t isum = 0; - - size_t vl = 16; - - vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); - - // load Q6 - vuint8mf2_t q6_0 = __riscv_vle8_v_u8mf2(q6, vl); - vuint8mf2_t q6_1 = __riscv_vle8_v_u8mf2(q6+16, vl); - - // load qh - vuint8mf2_t qh_x = __riscv_vle8_v_u8mf2(qh, vl); - - vuint8mf2_t qh0 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl); - qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl); - vuint8mf2_t qh1 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl); - qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl); - vuint8mf2_t qh2 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl); - qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl); - vuint8mf2_t qh3 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl); - - vuint8mf2_t q6h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_0, 0xF, vl), qh0, vl); - vuint8mf2_t q6h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_1, 0xF, vl), qh1, vl); - vuint8mf2_t q6h_2 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_0, 0x4, vl), qh2, vl); - vuint8mf2_t q6h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_1, 0x4, vl), qh3, vl); - - vint8mf2_t q6v_0 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_0), 32, vl); - vint8mf2_t q6v_1 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_1), 32, vl); - vint8mf2_t q6v_2 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_2), 32, vl); - vint8mf2_t q6v_3 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_3), 32, vl); - - // load Q8 and take product - vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q6v_0, __riscv_vle8_v_i8mf2(q8, vl), vl); - vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q6v_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl); - vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q6v_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl); - vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q6v_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl); - - vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl); - vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl); - vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl); - vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl); - - isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scale[0]; - isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scale[1]; - isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scale[2]; - isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scale[3]; - - sumf += isum * d_all * y[i].d; - - } - - *s = sumf; - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0xF); - const vector unsigned char v2 = vec_splats((unsigned char)0x2); - const vector unsigned char v3 = vec_splats((unsigned char)0x3); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - const vector unsigned char v6 = vec_splats((unsigned char)0x6); - const vector signed char off = vec_splats((signed char)0x20); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - -#pragma GCC unroll 2 - for (int i = 0; i < nb; ++i) { - __builtin_prefetch(x[i].ql, 0, 1); - __builtin_prefetch(x[i].qh, 0, 1); - __builtin_prefetch(y[i].qs, 0, 1); - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd= vec_mul(vxd, vyd); - - vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].ql); - vector signed char qxs1 = (vector signed char)vec_xl(16, x[i].ql); - vector signed char qxs00 = vec_and(qxs0, lowMask); - vector signed char qxs01 = vec_sr(qxs0, v4); - vector signed char qxs10 = vec_and(qxs1, lowMask); - vector signed char qxs11 = vec_sr(qxs1, v4); - - vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh); - - vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4); - vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4); - vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4); - vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4); - - vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off); - vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off); - vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off); - vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off); - - vector signed char q8y00 = vec_xl( 0, y[i].qs); - vector signed char q8y10 = vec_xl(16, y[i].qs); - vector signed char q8y01 = vec_xl(32, y[i].qs); - vector signed char q8y11 = vec_xl(48, y[i].qs); - - vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00)); - vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10)); - vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01)); - vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11)); - - vector signed short vs = (vector signed short)vec_unpackh(vec_xl_len(x[i].scales, 4)); - vector signed short vs0 = vec_splat(vs, 0); - vector signed short vs1 = vec_splat(vs, 1); - vector signed short vs2 = vec_splat(vs, 2); - vector signed short vs3 = vec_splat(vs, 3); - - vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0)); - vector signed int vsumi1 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1)); - vector signed int vsumi2 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2)); - vector signed int vsumi3 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3)); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined __loongarch_asx - - const __m256i m4 = __lasx_xvreplgr2vr_b(0xF); - const __m256i m2 = __lasx_xvreplgr2vr_b(3); - const __m256i m32s = __lasx_xvreplgr2vr_b(32); - - __m256 acc = (__m256)__lasx_xvldi(0); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - - const __m64 scales_1 = __lasx_xvreplgr2vr_b(x[i].scales[0]); - const __m64 scales_2 = __lasx_xvreplgr2vr_b(x[i].scales[1]); - const __m64 scales_3 = __lasx_xvreplgr2vr_b(x[i].scales[2]); - const __m64 scales_4 = __lasx_xvreplgr2vr_b(x[i].scales[3]); - - __m256i sumi = __lasx_xvldi(0); - - __m128i scale_0 = __lsx_vinsgr2vr_d(scale_0, scales_1, 0); - scale_0 = __lsx_vinsgr2vr_d(scale_0, scales_2, 1); - __m128i scale_1 = __lsx_vinsgr2vr_d(scale_1, scales_3, 0); - scale_1 = __lsx_vinsgr2vr_d(scale_1, scales_4, 1); - - const __m256i q4bits1 = __lasx_xvld((const __m256i*)q4, 0); - const __m128i q4bitsH = __lsx_vld((const __m128i*)qh, 0); - - const __m256i q4h_0 = __lasx_xvslli_h(__lasx_xvand_v(lasx_insertf128(__lasx_xvsrli_h(q4bitsH, 2), q4bitsH), m2), 4); - const __m256i q4h_1 = __lasx_xvslli_h(__lasx_xvand_v(lasx_insertf128(__lasx_xvsrli_h(q4bitsH, 6), __lasx_xvsrli_h(q4bitsH, 4)), m2), 4); - - const __m256i q4_0 = __lasx_xvor_v(__lasx_xvand_v(q4bits1, m4), q4h_0); - const __m256i q4_1 = __lasx_xvor_v(__lasx_xvand_v(__lasx_xvsrli_h(q4bits1, 4), m4), q4h_1); - - const __m256i q8_0 = __lasx_xvld((const __m256i*)(q8+ 0), 0); - const __m256i q8_1 = __lasx_xvld((const __m256i*)(q8+32), 0); - - __m256i q8s_0 = lasx_maddubs_h(m32s, q8_0); - __m256i q8s_1 = lasx_maddubs_h(m32s, q8_1); - - __m256i p16_0 = lasx_maddubs_h(q4_0, q8_0); - __m256i p16_1 = lasx_maddubs_h(q4_1, q8_1); - - p16_0 = __lasx_xvsub_h(p16_0, q8s_0); - p16_1 = __lasx_xvsub_h(p16_1, q8s_1); - - p16_0 = lasx_madd_h(lasx_ext8_16(scale_0), p16_0); - p16_1 = lasx_madd_h(lasx_ext8_16(scale_1), p16_1); - - sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1)); - - acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc); - } - - *s = hsum_float_8(acc); - -#else - - int8_t aux8[QK_K]; - int16_t aux16[8]; - float sums [8]; - int32_t aux32[8]; - memset(sums, 0, 8*sizeof(float)); - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; - for (int l = 0; l < 16; ++l) { - a[l+ 0] = (int8_t)((q4[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - a[l+16] = (int8_t)((q4[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - a[l+32] = (int8_t)((q4[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - a[l+48] = (int8_t)((q4[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; - } - int is = 0; - for (int j = 0; j < QK_K/16; ++j) { - int scale = x[i].scales[is++]; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - } - for (int l = 0; l < 8; ++l) sumf += sums[l]; - *s = sumf; -#endif -} - -#endif - #if defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx) static const int8_t keven_signs_q2xs[1024] = { 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, @@ -11564,64 +9120,6 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1); const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2); -#if QK_K == 64 - static const uint8_t k_bit_helper[16] = { - 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, - }; - const __m128i bit_helper = _mm_loadu_si128((const __m128i*)k_bit_helper); - const __m128i m511 = _mm_set1_epi16(511); - typedef union { - __m128i vec_index; - uint16_t index[8]; - } index_t; - - index_t idx; - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const __m128i q2_data = _mm_loadu_si128((const __m128i*)x[i].qs); - idx.vec_index = _mm_and_si128(q2_data, m511); - - const __m128i partial_sign_bits = _mm_srli_epi16(q2_data, 9); - const __m128i partial_sign_bits_upper = _mm_srli_epi16(q2_data, 13); - const __m128i partial_sign_bits_for_counting = _mm_xor_si128(partial_sign_bits, partial_sign_bits_upper); - - const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting); - const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits); - const __m256i full_signs = MM256_SET_M128I(full_sign_bits, full_sign_bits); - - const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs); - const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32)); - - const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[idx.index[3]], iq2xs_grid[idx.index[2]], - iq2xs_grid[idx.index[1]], iq2xs_grid[idx.index[0]]); - const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[idx.index[7]], iq2xs_grid[idx.index[6]], - iq2xs_grid[idx.index[5]], iq2xs_grid[idx.index[4]]); - - __m256i signs; - signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_1); - signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone)); - - signs = _mm256_shuffle_epi8(full_signs, block_sign_shuffle_2); - signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone)); - - const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); - const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); - - const __m256i sc1 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1)); - const __m256i sc2 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1)); - - const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2)); - - accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sum), accumf); - - } - - *s = 0.125f * hsum_float_8(accumf); -#else - static const uint8_t k_bit_helper[32] = { 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, @@ -11719,7 +9217,6 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * } *s = 0.125f * hsum_float_8(accumf); -#endif #elif defined(__loongarch_asx) const __m256i mone = __lasx_xvreplgr2vr_b(1); @@ -11740,62 +9237,6 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * const __m256i block_sign_shuffle_1 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_1, 0); const __m256i block_sign_shuffle_2 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_2, 0); -#if QK_K == 64 - static const uint8_t k_bit_helper[16] = { - 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, - }; - const __m128i bit_helper = __lsx_vld((const __m128i*)k_bit_helper, 0); - const __m128i m511 = __lsx_vreplgr2vr_h(511); - typedef union { - __m128i vec_index; - uint16_t index[8]; - } index_t; - - index_t idx; - __m256 accumf = (__m256)__lasx_xvldi(0); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const __m128i q2_data = __lsx_vld((const __m128i*)x[i].qs, 0); - idx.vec_index = __lsx_vand_v(q2_data, m511); - - const __m128i partial_sign_bits = __lsx_vsrli_h(q2_data, 9); - const __m128i partial_sign_bits_upper = __lsx_vsrli_h(q2_data, 13); - const __m128i partial_sign_bits_for_counting = __lsx_vxor_v(partial_sign_bits, partial_sign_bits_upper); - - const __m128i odd_bits = lsx_shuffle_b(bit_helper, partial_sign_bits_for_counting); - const __m128i full_sign_bits = __lsx_vor_v(partial_sign_bits, odd_bits); - const __m256i full_signs = lasx_insertf128(full_sign_bits, full_sign_bits); - - const __m256i q8_1 = __lasx_xvld((const __m256i *)y[i].qs, 0); - const __m256i q8_2 = __lasx_xvld((const __m256i *)(y[i].qs+32), 0); - - const __m256i q2_1 = lasx_set_d(iq2xs_grid[idx.index[3]], iq2xs_grid[idx.index[2]], - iq2xs_grid[idx.index[1]], iq2xs_grid[idx.index[0]]); - const __m256i q2_2 = lasx_set_d(iq2xs_grid[idx.index[7]], iq2xs_grid[idx.index[6]], - iq2xs_grid[idx.index[5]], iq2xs_grid[idx.index[4]]); - __m256i signs; - signs = lasx_shuffle_b(full_signs, block_sign_shuffle_1); - signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_1 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_1); - - signs = lasx_shuffle_b(full_signs, block_sign_shuffle_2); - signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_2 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_2); - - const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); - const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); - - const __m256i sc1 = lasx_insertf128(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), __lsx_vreplgr2vr_h(2*(x[i].scales[0] & 0xf)+1)); - const __m256i sc2 = lasx_insertf128(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), __lsx_vreplgr2vr_h(2*(x[i].scales[1] & 0xf)+1)); - - const __m256i sum = __lasx_xvadd_w(lasx_madd_h(sc1, dot1), lasx_madd_h(sc2, dot2)); - - accumf = __lasx_vfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sum), accumf); - } - - *s = 0.125f * hsum_float_8(accumf); -#else - static const uint8_t k_bit_helper[32] = { 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, @@ -11893,9 +9334,6 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * } *s = 0.125f * hsum_float_8(accumf); -#endif - - #elif defined(__POWER9_VECTOR__) vector float vsumf0 = vec_splats(0.0f); vector float vsumf1 = vec_splats(0.0f); @@ -12748,10 +10186,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * ggml_int8x16x4_t q8b; vec_index_t idx; -#if QK_K == 256 uint32_t scales32[2]; const uint8_t * scales8 = (const uint8_t *)scales32; -#endif float sumf = 0; for (int i = 0; i < nb; ++i) { @@ -12761,11 +10197,9 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * const uint16_t * restrict signs = (const uint16_t *)x[i].signs; const int8_t * restrict q8 = y[i].qs; -#if QK_K == 256 memcpy(scales32, x[i].scales, 4); scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101; scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101; -#endif int sumi1 = 0, sumi2 = 0; for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { @@ -12806,13 +10240,9 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]); const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]); -#if QK_K == 256 + sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0]; sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4]; -#else - sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf)); - sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4)); -#endif } sumf += d*(sumi1 + sumi2); } @@ -13476,17 +10906,10 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void const int nb = n / QK_K; -#if QK_K != 64 iq1m_scale_t scale; -#endif #if defined __ARM_NEON - -#if QK_K == 64 - const int32x4_t mask = vdupq_n_s32(0xf); -#else const int32x4_t mask = vdupq_n_s32(0x7); -#endif const int32x4_t mone = vdupq_n_s32(1); const int32x4_t mzero = vdupq_n_s32(0); @@ -13510,9 +10933,7 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void const uint8_t * qh = x[i].qh; const uint16_t * sc = (const uint16_t *)x[i].scales; -#if QK_K != 64 scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); -#endif int32x4_t sumi1 = mzero; int32x4_t sumi2 = mzero; @@ -13541,11 +10962,8 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3])); const int32x4_t p34 = vpaddq_s32(p3, p4); -#if QK_K == 64 - int32x4_t scales_4 = ggml_vld1q_u32(sc[0] >> 0, sc[0] >> 4, sc[0] >> 8, sc[0] >> 12); -#else int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9); -#endif + scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone); sumi1 = vmlaq_s32(sumi1, scales_4, p12); @@ -13555,22 +10973,14 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void } -#if QK_K == 64 - sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2)); -#else sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2)); -#endif } *s = sumf; #elif defined __AVX2__ -#if QK_K == 64 - const __m256i mask = _mm256_set1_epi16(0xf); -#else const __m256i mask = _mm256_set1_epi16(0x7); -#endif const __m256i mone = _mm256_set1_epi16(1); __m256 accum1 = _mm256_setzero_ps(); @@ -13582,9 +10992,7 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void const uint8_t * qh = x[i].qh; const uint16_t * sc = (const uint16_t *)x[i].scales; -#if QK_K != 64 scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); -#endif __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); @@ -13614,13 +11022,10 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void const __m256i dot3 = mul_add_epi8(delta1, q8b_1); const __m256i dot4 = mul_add_epi8(delta2, q8b_2); -#if QK_K == 64 - __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 4), _mm_set1_epi16(sc[0] >> 0)); - __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 12), _mm_set1_epi16(sc[0] >> 8)); -#else + __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0)); __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6)); -#endif + scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone); scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone); const __m256i p1 = _mm256_madd_epi16(dot1, scale1); @@ -13634,14 +11039,10 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void qs += 8; qh += 4; } -#if QK_K == 64 - const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)); -#else const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16)); -#endif + accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1); accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2); - } *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2); @@ -13658,9 +11059,7 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void const uint8_t * qh = x[i].qh; const uint16_t * sc = (const uint16_t *)x[i].scales; -#if QK_K != 64 scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); -#endif int sumi1 = 0, sumi2 = 0; for (int ib = 0; ib < QK_K/32; ++ib) { @@ -13680,24 +11079,17 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void sum1[l/2] += lsum1; sum2[l/2] += lsum2*delta[l]; } -#if QK_K == 64 - const int ls1 = 2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1; - const int ls2 = 2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1; -#else + const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1; const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1; -#endif + sumi1 += sum1[0] * ls1 + sum1[1] * ls2; sumi2 += sum2[0] * ls1 + sum2[1] * ls2; qs += 4; qh += 2; } -#if QK_K == 64 - sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); -#else sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); -#endif } *s = sumf; @@ -13885,9 +11277,6 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * UNUSED(by); UNUSED(bs); assert(n % QK_K == 0); -#if QK_K == 64 - ggml_vec_dot_iq4_nl_q8_0(n, s, bs, vx, bx, vy, by, nrc); -#else const block_iq4_xs * restrict x = vx; const block_q8_K * restrict y = vy; @@ -14180,7 +11569,6 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * } *s = sumf; #endif -#endif } // ================================ IQ2 quantization ============================================= @@ -15998,10 +13386,6 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy const float * xx; for (int ibl = 0; ibl < nbl; ++ibl) { - -#if QK_K == 64 - y[ibl].d = GGML_FP32_TO_FP16(0.f); -#endif memset(y[ibl].qs, 0, QK_K/8); memset(y[ibl].qh, 0, QK_K/16); memset(y[ibl].scales, 0, QK_K/32); @@ -16176,22 +13560,13 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy } uint16_t * sc = (uint16_t *)y[ibl].scales; -#if QK_K == 64 - float d = max_scale/31; -#else float d = max_scale/15; -#endif float id = 1/d; float sumqx_f = 0, sumq2_f = 0; for (int ib = 0; ib < QK_K/block_size; ++ib) { int l = nearest_int(0.5f*(id*scales[ib+0]-1)); -#if QK_K == 64 - l = MAX(0, MIN(15, l)); - sc[ib/4] |= (l << 4*(ib%4)); -#else l = MAX(0, MIN(7, l)); sc[ib/4] |= (l << 3*(ib%4)); -#endif y[ibl].qh[ib] |= masks[shifts[ib]]; const float * xb = xbl + block_size*ib; if (quant_weights) { @@ -16214,14 +13589,10 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy } if (sumq2_f > 0) d = sumqx_f/sumq2_f; s.f16 = GGML_FP32_TO_FP16(d*1.1125f); // 1.1125f is another fudge factor. Don't ask me why it is needed. -#if QK_K == 64 - y[ibl].d = s.f16; -#else sc[0] |= ((s.u16 & 0x000f) << 12); sc[1] |= ((s.u16 & 0x00f0) << 8); sc[2] |= ((s.u16 & 0x0f00) << 4); sc[3] |= ((s.u16 & 0xf000) << 0); -#endif } } @@ -16410,9 +13781,6 @@ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * rest } size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { -#if QK_K == 64 - return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights); -#else GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -16430,7 +13798,6 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t qrow += nblock*sizeof(block_iq4_xs); } return nrow * nblock * sizeof(block_iq4_xs); -#endif } void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) { @@ -16842,19 +14209,11 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte } break; case GGML_TYPE_Q4_K: { - #ifdef GGML_QKK_64 - VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d[0], d[1]); - #else VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d, dmin); - #endif } break; case GGML_TYPE_Q5_K: { - #ifdef GGML_QKK_64 - VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_K, data, nb); - #else VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_K, data, nb, d, dmin); - #endif } break; case GGML_TYPE_Q6_K: { @@ -16877,18 +14236,12 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { const block_iq1_m * q = (const block_iq1_m *) data; for (size_t i = 0; i < nb; ++i) { - #if QK_K == 64 - if (!validate_fp16(q[i].d, i)) { - return false; - } - #else iq1m_scale_t scale; const uint16_t * sc = (const uint16_t *)q[i].scales; scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); if (!validate_fp16(scale.f16, i)) { return false; } - #endif } } break; case GGML_TYPE_IQ2_XXS: @@ -16913,12 +14266,9 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_s, data, nb); } break; case GGML_TYPE_IQ4_XS: - #if QK_K != 64 { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_xs, data, nb); } break; - #endif - // with QK_K == 64, iq4_xs is iq4_nl case GGML_TYPE_IQ4_NL: { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index f486b6c0a..496ec61c3 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -4197,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri const block_q2_K * x = (const block_q2_K *) vx; const int tid = item_ct1.get_local_id(2); -#if QK_K == 256 const int n = tid/32; const int l = tid - 32*n; const int is = 8*n + l/16; @@ -4211,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); -#else - const int is = tid/16; // 0 or 1 - const int il = tid%16; // 0...15 - const uint8_t q = x[i].qs[il] >> (2*is); - dst_t * y = yy + i*QK_K + 16*is + il; - - float dall = x[i].dm[0]; - float dmin = x[i].dm[1]; - y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); - y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); -#endif - } template @@ -4232,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri const int i = item_ct1.get_group(2); const block_q3_K * x = (const block_q3_K *) vx; -#if QK_K == 256 const int r = item_ct1.get_local_id(2) / 4; const int tid = r/2; const int is0 = r%2; @@ -4256,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri const uint8_t * hm = x[i].hmask; for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); -#else - const int tid = item_ct1.get_local_id(2); - const int is = tid/16; // 0 or 1 - const int il = tid%16; // 0...15 - const int im = il/8; // 0...1 - const int in = il%8; // 0...7 - - dst_t * y = yy + i*QK_K + 16*is + il; - - const uint8_t q = x[i].qs[il] >> (2*is); - const uint8_t h = x[i].hmask[in] >> (2*is + im); - const float d = (float)x[i].d; - - if (is == 0) { - y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); - y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); - } else { - y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); - y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); - } -#endif - } -#if QK_K == 256 static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { if (j < 4) { d = q[j] & 63; m = q[j + 4] & 63; @@ -4289,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8 m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); } } -#endif template static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy, @@ -4298,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri const int i = item_ct1.get_group(2); -#if QK_K == 256 // assume 32 threads const int tid = item_ct1.get_local_id(2); const int il = tid/8; @@ -4322,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri y[l + 0] = d1 * (q[l] & 0xF) - m1; y[l +32] = d2 * (q[l] >> 4) - m2; } -#else - const int tid = item_ct1.get_local_id(2); - const uint8_t * q = x[i].qs; - dst_t * y = yy + i*QK_K; - const float d = (float)x[i].dm[0]; - const float m = (float)x[i].dm[1]; - y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); - y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); -#endif } template @@ -4340,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri const int i = item_ct1.get_group(2); -#if QK_K == 256 // assume 64 threads - this is very slightly better than the one below const int tid = item_ct1.get_local_id(2); const int il = tid/16; // il is in 0...3 @@ -4367,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri hm <<= 1; y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; -#else - const int tid = item_ct1.get_local_id(2); - const uint8_t q = x[i].qs[tid]; - const int im = tid/8; // 0...3 - const int in = tid%8; // 0...7 - const int is = tid/16; // 0 or 1 - const uint8_t h = x[i].qh[in] >> im; - const float d = x[i].d; - dst_t * y = yy + i*QK_K + tid; - y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); - y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16)); -#endif } template @@ -4387,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri const block_q6_K * x = (const block_q6_K *) vx; const int i = item_ct1.get_group(2); -#if QK_K == 256 // assume 64 threads - this is very slightly better than the one below const int tid = item_ct1.get_local_id(2); @@ -4407,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); -#else - - // assume 32 threads - const int tid = item_ct1.get_local_id(2); - const int ip = tid/16; // 0 or 1 - const int il = tid - 16*ip; // 0...15 - - dst_t * y = yy + i*QK_K + 16*ip + il; - - const float d = x[i].d; - - const uint8_t ql = x[i].ql[16*ip + il]; - const uint8_t qh = x[i].qh[il] >> (2*ip); - const int8_t * sc = x[i].scales; - - y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32); - y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32); -#endif } template @@ -4438,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res const block_iq2_xxs * x = (const block_iq2_xxs *) vx; const int tid = item_ct1.get_local_id(2); -#if QK_K == 256 const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -4449,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f; const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127]; for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f); -#else - assert(false); -#endif - } template @@ -4466,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest const block_iq2_xs * x = (const block_iq2_xs *) vx; const int tid = item_ct1.get_local_id(2); -#if QK_K == 256 const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -4475,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); -#else - assert(false); -#endif - } template @@ -4490,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy, const block_iq2_s * x = (const block_iq2_s *) vx; const int tid = item_ct1.get_local_id(2); -#if QK_K == 256 const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -4498,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy, const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; #pragma unroll - for (int j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) { y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); -#else - assert(false); - -#endif - + } } template @@ -4518,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res const block_iq3_xxs * x = (const block_iq3_xxs *) vx; const int tid = item_ct1.get_local_id(2); -#if QK_K == 256 const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -4533,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); } -#else - assert(false); -#endif - } template @@ -4549,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy, const block_iq3_s * x = (const block_iq3_s *) vx; const int tid = item_ct1.get_local_id(2); -#if QK_K == 256 const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -4563,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy, y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); } -#else - assert(false); -#endif - } template @@ -4579,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy, const block_iq1_s * x = (const block_iq1_s *) vx; const int tid = item_ct1.get_local_id(2); -#if QK_K == 256 const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -4593,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy, for (int j = 0; j < 8; ++j) { y[j] = d * (q[j] + delta); } -#else - assert(false); -#endif - } template @@ -4609,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy, const block_iq1_m * x = (const block_iq1_m *) vx; const int tid = item_ct1.get_local_id(2); -#if QK_K == 256 const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -4627,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy, for (int j = 0; j < 8; ++j) { y[j] = d * (q[j] + delta); } -#else - assert(false); -#endif - } template @@ -4704,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, float tmp = 0; // partial sum for thread in warp -#if QK_K == 256 const int tid = item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15 const int ix = @@ -4755,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, tmp += dall * sum1 - dmin * sum2; } -#else - const int tid = item_ct1.get_local_id(2) / - (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7 - const int ix = item_ct1.get_local_id(2) % - (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3 - const int offset = tid * K_QUANTS_PER_ITERATION; - - uint32_t uaux[2]; - const uint8_t * d = (const uint8_t *)uaux; - - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + offset; - const uint8_t * q = x[i].qs + offset; - const uint32_t * s = (const uint32_t *)x[i].scales; - - uaux[0] = s[0] & 0x0f0f0f0f; - uaux[1] = (s[0] >> 4) & 0x0f0f0f0f; - - const sycl::float2 dall = - x[i].dm.convert(); - - float sum1 = 0, sum2 = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - const uint8_t ql = q[l]; - sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) - + y[l+16] * d[1] * ((ql >> 2) & 3) - + y[l+32] * d[2] * ((ql >> 4) & 3) - + y[l+48] * d[3] * ((ql >> 6) & 3); - sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7]; - } - tmp += dall.x() * sum1 - dall.y() * sum2; - } - -#endif // sum up partial sums and write back result #pragma unroll @@ -4828,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, float tmp = 0; // partial sum for thread in warp -#if QK_K == 256 - const uint16_t kmask1 = 0x0303; const uint16_t kmask2 = 0x0f0f; @@ -4882,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, tmp += d * sum; } -#else - - const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 - const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 - const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14 - const int in = offset/8; // 0 or 1 - const int im = offset%8; // 0...7 - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + offset; - const uint8_t * q = x[i].qs + offset; - const uint8_t * s = x[i].scales; - - const float dall = (float)x[i].d; - - float sum = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - const uint8_t hl = x[i].hmask[im+l] >> in; - const uint8_t ql = q[l]; - sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) - + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4)) - + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4)) - + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4)); - } - tmp += sum; - } -#endif // sum up partial sums and write back result #pragma unroll @@ -4944,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, const block_q4_K * x = (const block_q4_K *)vx + ib0; -#if QK_K == 256 const uint16_t kmask1 = 0x3f3f; const uint16_t kmask2 = 0x0f0f; const uint16_t kmask3 = 0xc0c0; @@ -5033,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, #endif } -#else - const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 - const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); - - const int step = tid * K_QUANTS_PER_ITERATION; - - uint16_t aux16[2]; - const uint8_t * s = (const uint8_t *)aux16; - - float tmp = 0; - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - const uint8_t * q = x[i].qs + step; - const float * y = yy + i*QK_K + step; - const uint16_t * a = (const uint16_t *)x[i].scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; - const float d = (float)x[i].dm[0]; - const float m = (float)x[i].dm[1]; - float sum = 0.f; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { - sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) - + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) - + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3]) - + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]); - } - tmp += sum; - } - -#endif // sum up partial sums and write back result #pragma unroll @@ -5097,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, float tmp = 0; // partial sum for thread in warp -#if QK_K == 256 const uint16_t kmask1 = 0x3f3f; const uint16_t kmask2 = 0x0f0f; const uint16_t kmask3 = 0xc0c0; @@ -5174,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, dmin * smin; } -#else - const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 - const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); - const int step = tid * K_QUANTS_PER_ITERATION; - const int im = step/8; - const int in = step%8; - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - const uint8_t * q = x[i].qs + step; - const int8_t * s = x[i].scales; - const float * y = yy + i*QK_K + step; - const float d = x[i].d; - float sum = 0.f; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { - const uint8_t h = x[i].qh[in+j] >> im; - sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) - + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) - + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16)) - + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16)); - } - tmp += sum; - } -#endif - // sum up partial sums and write back result #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { @@ -5224,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa const block_q6_K * x = (const block_q6_K *)vx + ib0; -#if QK_K == 256 - const int tid = item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 const int ix = @@ -5282,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa } -#else - - const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7 - const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3 - - const int step = tid * K_QUANTS_PER_ITERATION; - - float tmp = 0; // partial sum for thread in warp - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + step; - const uint8_t * ql = x[i].ql + step; - const uint8_t * qh = x[i].qh + step; - const int8_t * s = x[i].scales; - - const float d = x[i+0].d; - - float sum = 0; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { - sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) - + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) - + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32) - + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32); - } - tmp += sum; - - } - -#endif - // sum up partial sums and write back result #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { @@ -6857,7 +6586,6 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs) { -#ifndef GGML_QKK_64 const block_q4_K * bq4_K = (const block_q4_K *) vbq; int v[2]; @@ -6899,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq, } return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); - -#else - -#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics - const block_q4_K * bq4_K = (const block_q4_K *) vbq; - - float sumf_d = 0.0f; - float sumf_m = 0.0f; - - uint16_t aux16[2]; - const uint8_t * s = (const uint8_t *)aux16; - - const uint16_t * a = (const uint16_t *)bq4_K->scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; - - const float dall = bq4_K->dm[0]; - const float dmin = bq4_K->dm[1]; - - const float d8_1 = bq8_1[0].ds[0]; - const float d8_2 = bq8_1[1].ds[1]; - - const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); - const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); - const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); - const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); - - const int * q4 = (const int *)bq4_K->qs + (iqs/2); - const int v1 = q4[0]; - const int v2 = q4[4]; - - const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0)); - const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0)); - const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0)); - const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0)); - - sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]); - sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]); - - return dall * sumf_d - dmin * sumf_m; - -#else - bad_arch(); -#endif // __SYCL_ARCH__ >= VER_4VEC - -#endif } template @@ -7003,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql, const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; -#if QK_K == 256 x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; -#else - x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]}; -#endif } #pragma unroll @@ -7050,7 +6728,6 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs) { -#ifndef GGML_QKK_64 const block_q5_K * bq5_K = (const block_q5_K *) vbq; int vl[2]; @@ -7092,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq, } return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); - -#else - -#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics - const block_q5_K * bq5_K = (const block_q5_K *) vbq; - - const int8_t * s = bq5_K->scales; - - const float d = bq5_K->d; - - const float d8_1 = bq8_1[0].ds[0]; - const float d8_2 = bq8_1[1].ds[1]; - - const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); - const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); - const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); - const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); - - const int * ql = (const int *)bq5_K->qs + (iqs/2); - const int vl1 = ql[0]; - const int vl2 = ql[4]; - - const int step = 4 * (iqs/2); // 0, 4, 8, 12 - const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6 - const int in = step%8; // 0, 4, 0, 4 - const int vh = (*((const int *)(bq5_K->qh + in))) >> im; - - const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f); - const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f); - const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f); - const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f); - - const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1]) - + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]); - - return d * sumf_d; - -#else - bad_arch(); -#endif // __SYCL_ARCH__ >= VER_4VEC - -#endif } template @@ -7205,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql, const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; -#if QK_K == 256 x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; -#endif } #pragma unroll @@ -7387,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs, const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs, const uint8_t *kmask_iq2xs) { -#if QK_K == 256 const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq; #if QR2_XXS == 8 @@ -7428,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq, } return d * (sumi1 + sumi2); #endif -#else - assert(false); - return 0.f; -#endif } static __dpct_inline__ float @@ -7440,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq, const uint64_t *iq2xs_grid, const uint64_t *ksigns64) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics -#if QK_K == 256 const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq; const int ib32 = iqs; @@ -7478,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq, assert(false); return 0.f; #endif -#else - assert(false); - return 0.f; -#endif } static __dpct_inline__ float vec_dot_iq2_s_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs) { -#if QK_K == 256 const block_iq2_s * bq2 = (const block_iq2_s *) vbq; const int ib32 = iqs; @@ -7531,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq, } const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f; return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); -#else - assert(false); -#endif } static __dpct_inline__ float @@ -7542,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq, const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics -#if QK_K == 256 const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq; const int ib32 = iqs; @@ -7570,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq, assert(false); return 0.f; #endif -#else - assert(false); - return 0.f; -#endif } static __dpct_inline__ float vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs, const uint32_t *iq3s_grid) { -#if QK_K == 256 const block_iq3_s * bq2 = (const block_iq3_s *) vbq; const int ib32 = iqs; @@ -7609,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) * bq8_1[ib32].ds[0]; return d * sumi; -#else - assert(false); -#endif } static __dpct_inline__ float vec_dot_iq1_s_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs, const uint32_t *iq1s_grid_gpu) { -#if QK_K == 256 const block_iq1_s * bq1 = (const block_iq1_s *) vbq; const int ib32 = iqs; @@ -7637,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq, const float d = d1q * bq8_1[ib32].ds[0]; const float m = d1q * bq8_1[ib32].ds[1]; return d * sumi + m * delta; -#else - assert(false); -#endif } static __dpct_inline__ float vec_dot_iq1_m_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs) { -#if QK_K == 256 const block_iq1_m * bq1 = (const block_iq1_m *) vbq; const int ib32 = iqs; @@ -7670,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq, scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); const float d = (float)scale.f16 * bq8_1[ib32].ds[0]; return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1)); -#else - assert(false); -#endif } static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4, @@ -7720,7 +7322,6 @@ static __dpct_inline__ float vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs) { -#if QK_K == 256 const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq; const uint8_t * values = (const uint8_t *)kvalues_iq4nl; @@ -7738,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq, sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2); } return d * (sumi1 + sumi2); -#else - assert(false); -#endif } template static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; -#if QK_K == 256 { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); @@ -10215,27 +9812,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k, dequantize_block_q2_K(vx, y, item_ct1); }); } -#else - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q2_K(vx, y, item_ct1); - }); - } - -#endif } template static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; -#if QK_K == 256 { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); @@ -10247,19 +9829,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k, dequantize_block_q3_K(vx, y, item_ct1); }); } -#else - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q3_K(vx, y, item_ct1); - }); - } -#endif } template @@ -10320,7 +9889,6 @@ template static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; -#if QK_K == 256 { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); @@ -10332,27 +9900,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k, dequantize_block_q5_K(vx, y, item_ct1); }); } -#else - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q5_K(vx, y, item_ct1); - }); - } - -#endif } template static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; -#if QK_K == 256 { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); @@ -10364,20 +9917,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k, dequantize_block_q6_K(vx, y, item_ct1); }); } -#else - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q6_K(vx, y, item_ct1); - }); - } - -#endif } template @@ -10529,9 +10068,6 @@ template static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = (k + QK_K - 1) / QK_K; -#if QK_K == 64 - dequantize_row_iq4_nl_sycl(vx, y, k, stream); -#else { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); @@ -10546,7 +10082,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k, }); }); } -#endif } @@ -12051,8 +11586,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, const int nrows_y, const int nrows_dst, dpct::queue_ptr stream) try { -#if QK_K == 256 - int id; SYCL_CHECK( CHECK_TRY_ERROR(id = get_current_device_id())); @@ -12167,7 +11700,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, }); } } -#endif } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ diff --git a/ggml.c b/ggml.c index d316e3d31..673c47748 100644 --- a/ggml.c +++ b/ggml.c @@ -871,22 +871,14 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { }, [GGML_TYPE_IQ4_XS] = { .type_name = "iq4_xs", -#if QK_K == 64 - .blck_size = QK4_NL, -#else .blck_size = QK_K, -#endif .type_size = sizeof(block_iq4_xs), .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq4_xs, .from_float = quantize_row_iq4_xs, .from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference, .vec_dot = ggml_vec_dot_iq4_xs_q8_K, -#if QK_K == 64 - .vec_dot_type = GGML_TYPE_Q8_0, -#else .vec_dot_type = GGML_TYPE_Q8_K, -#endif .nrows = 1, }, [GGML_TYPE_Q8_K] = { @@ -22117,11 +22109,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; -#if QK_K == 64 - case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; -#else case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; -#endif case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 42df2e4d0..67e23dcc1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -905,9 +905,8 @@ class GGUFValueType(IntEnum): raise ValueError(f"Unknown type: {type(val)}") -# Note: Does not support GGML_QKK_64 -QK_K = 256 # Items here are (block size, type size) +QK_K = 256 GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = { GGMLQuantizationType.F32: (1, 4), GGMLQuantizationType.F16: (1, 2), diff --git a/llama.cpp b/llama.cpp index 34137c7ad..37b3d58c6 100644 --- a/llama.cpp +++ b/llama.cpp @@ -26,13 +26,9 @@ #ifdef GGML_USE_METAL # include "ggml-metal.h" #endif -#ifndef QK_K -# ifdef GGML_QKK_64 -# define QK_K 64 -# else -# define QK_K 256 -# endif -#endif + +// TODO: replace with ggml API call +#define QK_K 256 #ifdef __has_include #if __has_include() @@ -14308,8 +14304,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; - else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && - (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; if (qs.model.type == MODEL_70B) { // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with From d48c88cbd563b6cf0ce972e2f56796896e240736 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 May 2024 10:00:44 +0300 Subject: [PATCH 23/34] ggml : remove ggml_flash_attn and ggml_flash_ff (#7463) ggml-ci --- examples/finetune/finetune.cpp | 3 +- .../train-text-from-scratch.cpp | 3 +- ggml.c | 676 +----------------- ggml.h | 18 +- tests/test-grad0.cpp | 116 +-- 5 files changed, 47 insertions(+), 769 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 992426c1b..22425730f 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -643,7 +643,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch); struct ggml_tensor * t16; if (enable_flash_attn) { - t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch); + GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported"); + //t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch); } else { struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch); struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 45bdfa8f5..e2f85c682 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -341,7 +341,8 @@ static struct ggml_tensor * llama_build_train_graphs( struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); struct ggml_tensor * t16; if (enable_flash_attn) { - t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported"); + //t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); } else { struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch); struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch); diff --git a/ggml.c b/ggml.c index 673c47748..9e72b7a76 100644 --- a/ggml.c +++ b/ggml.c @@ -2670,9 +2670,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "ARGSORT", "LEAKY_RELU", - "FLASH_ATTN", "FLASH_ATTN_EXT", - "FLASH_FF", "FLASH_ATTN_BACK", "SSM_CONV", "SSM_SCAN", @@ -2698,7 +2696,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76"); +static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -2760,9 +2758,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "argsort(x)", "leaky_relu(x)", - "flash_attn(x)", "flash_attn_ext(x)", - "flash_ff(x)", "flash_attn_back(x)", "ssm_conv(x)", "ssm_scan(x)", @@ -2788,7 +2784,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76"); +static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -6948,38 +6944,6 @@ struct ggml_tensor * ggml_top_k( return result; } -// ggml_flash_attn - -struct ggml_tensor * ggml_flash_attn( - struct ggml_context * ctx, - struct ggml_tensor * q, - struct ggml_tensor * k, - struct ggml_tensor * v, - bool masked) { - GGML_ASSERT(ggml_can_mul_mat(k, q)); - // TODO: check if vT can be multiplied by (k*qT) - - bool is_node = false; - - if (q->grad || k->grad || v->grad) { - is_node = true; - } - - //struct ggml_tensor * result = ggml_dup_tensor(ctx, q); - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne); - - int32_t t = masked ? 1 : 0; - ggml_set_op_params(result, &t, sizeof(t)); - - result->op = GGML_OP_FLASH_ATTN; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = q; - result->src[1] = k; - result->src[2] = v; - - return result; -} - // ggml_flash_attn_ext struct ggml_tensor * ggml_flash_attn_ext( @@ -7039,38 +7003,6 @@ void ggml_flash_attn_ext_set_prec( ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second } -// ggml_flash_ff - -struct ggml_tensor * ggml_flash_ff( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b0, - struct ggml_tensor * b1, - struct ggml_tensor * c0, - struct ggml_tensor * c1) { - GGML_ASSERT(ggml_can_mul_mat(b0, a)); - // TODO: more checks - - bool is_node = false; - - if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) { - is_node = true; - } - - //struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne); - - result->op = GGML_OP_FLASH_FF; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b0; - result->src[2] = b1; - result->src[3] = c0; - result->src[4] = c1; - - return result; -} - // ggml_flash_attn_back struct ggml_tensor * ggml_flash_attn_back( @@ -7080,6 +7012,8 @@ struct ggml_tensor * ggml_flash_attn_back( struct ggml_tensor * v, struct ggml_tensor * d, bool masked) { + GGML_ASSERT(false && "TODO: adapt to ggml_flash_attn_ext() changes"); + GGML_ASSERT(ggml_can_mul_mat(k, q)); // TODO: check if vT can be multiplied by (k*qT) @@ -15709,400 +15643,6 @@ static void ggml_compute_forward_argsort( } } -// ggml_compute_forward_flash_attn - -static void ggml_compute_forward_flash_attn_f32( - const struct ggml_compute_params * params, - const bool masked, - struct ggml_tensor * dst) { - - const struct ggml_tensor * q = dst->src[0]; - const struct ggml_tensor * k = dst->src[1]; - const struct ggml_tensor * v = dst->src[2]; - - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - GGML_TENSOR_LOCALS(int64_t, neq, q, ne) - GGML_TENSOR_LOCALS(size_t, nbq, q, nb) - GGML_TENSOR_LOCALS(int64_t, nek, k, ne) - GGML_TENSOR_LOCALS(size_t, nbk, k, nb) - GGML_TENSOR_LOCALS(int64_t, nev, v, ne) - GGML_TENSOR_LOCALS(size_t, nbv, v, nb) - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) - GGML_TENSOR_LOCALS(size_t, nb, dst, nb) - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t D = neq0; - const int64_t N = neq1; - const int64_t P = nek1 - N; - const int64_t M = P + N; - - const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); - - GGML_ASSERT(ne0 == D); - GGML_ASSERT(ne1 == N); - GGML_ASSERT(P >= 0); - - GGML_ASSERT(nbq0 == sizeof(float)); - GGML_ASSERT(nbk0 == sizeof(float)); - GGML_ASSERT(nbv0 == sizeof(float)); - - GGML_ASSERT(neq0 == D); - GGML_ASSERT(nek0 == D); - GGML_ASSERT(nev1 == D); - - GGML_ASSERT(neq1 == N); - GGML_ASSERT(nek1 == N + P); - GGML_ASSERT(nev1 == D); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - if (params->type == GGML_TASK_TYPE_INIT) { - return; - } - - if (params->type == GGML_TASK_TYPE_FINALIZE) { - return; - } - - // parallelize by q rows using ggml_vec_dot_f32 - - // total rows in q - const int nr = neq1*neq2*neq3; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - const float scale = 1.0f/sqrtf(D); - - //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); - - for (int ir = ir0; ir < ir1; ++ir) { - // q indices - const int iq3 = ir/(neq2*neq1); - const int iq2 = (ir - iq3*neq2*neq1)/neq1; - const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); - - float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32); - - for (int i = M; i < Mup; ++i) { - S[i] = -INFINITY; - } - - const int64_t masked_begin = masked ? (P + iq1 + 1) : M; - for (int64_t ic = 0; ic < masked_begin; ++ic) { - // k indices - const int ik3 = iq3; - const int ik2 = iq2 % nek2; - const int ik1 = ic; - - // S indices - const int i1 = ik1; - - ggml_vec_dot_f32(neq0, - S + i1, 0, - (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); - } - - // scale - ggml_vec_scale_f32(masked_begin, S, scale); - - for (int64_t i = masked_begin; i < M; i++) { - S[i] = -INFINITY; - } - - // softmax - // exclude known -INF S[..] values from max and loop - // dont forget to set their SW values to zero - { - float max = -INFINITY; - ggml_vec_max_f32(masked_begin, &max, S); - - ggml_float sum = 0.0; - { -#ifdef GGML_SOFT_MAX_ACCELERATE - max = -max; - vDSP_vsadd(S, 1, &max, S, 1, Mup); - vvexpf(S, S, &Mup); - ggml_vec_sum_f32(Mup, &sum, S); -#else - sum = ggml_vec_soft_max_f32(Mup, S, S, max); -#endif - } - - assert(sum > 0.0); - - sum = 1.0/sum; - ggml_vec_scale_f32(masked_begin, S, sum); - -#ifndef NDEBUG - for (int i = 0; i < masked_begin; ++i) { - assert(!isnan(S[i])); - assert(!isinf(S[i])); - } -#endif - } - - for (int64_t ic = 0; ic < nev1; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; - - // v indices - const int iv2 = iq2 % nev2; - const int iv3 = iq3; - - ggml_vec_dot_f32(masked_begin, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0, - (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0, - S, 0, 1); - } - } -} - -static void ggml_compute_forward_flash_attn_f16( - const struct ggml_compute_params * params, - const bool masked, - struct ggml_tensor * dst) { - - const struct ggml_tensor * q = dst->src[0]; - const struct ggml_tensor * k = dst->src[1]; - const struct ggml_tensor * v = dst->src[2]; - - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - GGML_TENSOR_LOCALS(int64_t, neq, q, ne) - GGML_TENSOR_LOCALS(size_t, nbq, q, nb) - GGML_TENSOR_LOCALS(int64_t, nek, k, ne) - GGML_TENSOR_LOCALS(size_t, nbk, k, nb) - GGML_TENSOR_LOCALS(int64_t, nev, v, ne) - GGML_TENSOR_LOCALS(size_t, nbv, v, nb) - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) - GGML_TENSOR_LOCALS(size_t, nb, dst, nb) - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t D = neq0; - const int64_t N = neq1; - const int64_t P = nek1 - N; - const int64_t M = P + N; - - const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); - - GGML_ASSERT(ne0 == D); - GGML_ASSERT(ne1 == N); - GGML_ASSERT(P >= 0); - - GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t)); - - GGML_ASSERT(neq0 == D); - GGML_ASSERT(nek0 == D); - GGML_ASSERT(nev1 == D); - - GGML_ASSERT(neq1 == N); - GGML_ASSERT(nek1 == N + P); - GGML_ASSERT(nev1 == D); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - if (params->type == GGML_TASK_TYPE_INIT) { - return; - } - - if (params->type == GGML_TASK_TYPE_FINALIZE) { - return; - } - - // parallelize by q rows using ggml_vec_dot_f32 - - // total rows in q - const int nr = neq1*neq2*neq3; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - const float scale = 1.0f/sqrtf(D); - - //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); - - for (int ir = ir0; ir < ir1; ++ir) { - // q indices - const int iq3 = ir/(neq2*neq1); - const int iq2 = (ir - iq3*neq2*neq1)/neq1; - const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); - - float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32); - - for (int i = M; i < Mup; ++i) { - S[i] = -INFINITY; - } - - if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) { - for (int64_t ic = 0; ic < nek1; ++ic) { - // k indices - const int ik3 = iq3; - const int ik2 = iq2 % nek2; - const int ik1 = ic; - - // S indices - const int i1 = ik1; - - ggml_vec_dot_f16(neq0, - S + i1, 0, - (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, - (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); - } - } else { - for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) { - // k indices - const int ik3 = iq3; - const int ik2 = iq2 % nek2; - const int ik1 = ic; - - // S indices - const int i1 = ik1; - - ggml_vec_dot_f16_unroll(neq0, nbk1, - S + i1, - ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); - } - } - - // scale - ggml_vec_scale_f32(nek1, S, scale); - - if (masked) { - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = -INFINITY; - } - } - } - - // softmax - // todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero. - // dont forget to set their S values to zero - { - float max = -INFINITY; - ggml_vec_max_f32(M, &max, S); - - ggml_float sum = 0.0; - { -#ifdef GGML_SOFT_MAX_ACCELERATE - max = -max; - vDSP_vsadd(S, 1, &max, S, 1, Mup); - vvexpf(S, S, &Mup); - ggml_vec_sum_f32(Mup, &sum, S); -#else - sum = ggml_vec_soft_max_f32(Mup, S, S, max); -#endif - } - - assert(sum > 0.0); - - sum = 1.0/sum; - ggml_vec_scale_f32(M, S, sum); - -#ifndef NDEBUG - for (int i = 0; i < M; ++i) { - assert(!isnan(S[i])); - assert(!isinf(S[i])); - } -#endif - } - - ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup); - - for (int64_t i = 0; i < M; i++) { - S16[i] = GGML_FP32_TO_FP16(S[i]); - } - - // todo: exclude known zero S[..] values from dot (reducing nev0 and increasing begin of v and S16). - if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) { - for (int64_t ic = 0; ic < nev1; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; - - // v indices - const int iv2 = iq2 % nev2; - const int iv3 = iq3; - - ggml_vec_dot_f16(nev0, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0, - (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0, - S16, 0, 1); - } - } else { - for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; - - // v indices - const int iv2 = iq2 % nev2; - const int iv3 = iq3; - - ggml_vec_dot_f16_unroll(nev0, nbv1, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), - S16); - } - } - } -} - -static void ggml_compute_forward_flash_attn( - const struct ggml_compute_params * params, - const bool masked, - struct ggml_tensor * dst) { - - const struct ggml_tensor * q = dst->src[0]; - - switch (q->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_flash_attn_f16(params, masked, dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_flash_attn_f32(params, masked, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - // ggml_compute_forward_flash_attn_ext static void ggml_compute_forward_flash_attn_ext_f16( @@ -16336,165 +15876,6 @@ static void ggml_compute_forward_flash_attn_ext( } } -// ggml_compute_forward_flash_ff - -static void ggml_compute_forward_flash_ff_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * a = dst->src[0]; // F16 - const struct ggml_tensor * b0 = dst->src[1]; // F16 fc_w - const struct ggml_tensor * b1 = dst->src[2]; // F32 fc_b - const struct ggml_tensor * c0 = dst->src[3]; // F16 proj_w - const struct ggml_tensor * c1 = dst->src[4]; // F32 proj_b - - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - GGML_TENSOR_LOCALS(int64_t, nea, a, ne) - GGML_TENSOR_LOCALS(size_t, nba, a, nb) - GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne) - GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb) - GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne) - GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb) - GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne) - GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb) - GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne) - GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb) - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) - GGML_TENSOR_LOCALS(size_t, nb, dst, nb) - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t D = nea0; - //const int64_t N = nea1; - const int64_t M = neb01; - - GGML_ASSERT(ne0 == nea0); - GGML_ASSERT(ne1 == nea1); - GGML_ASSERT(ne2 == nea2); - - GGML_ASSERT(nba0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nbb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nbb10 == sizeof(float)); - GGML_ASSERT(nbc00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nbc10 == sizeof(float)); - - GGML_ASSERT(neb00 == D); - GGML_ASSERT(neb01 == M); - GGML_ASSERT(neb10 == M); - GGML_ASSERT(neb11 == 1); - - GGML_ASSERT(nec00 == M); - GGML_ASSERT(nec01 == D); - GGML_ASSERT(nec10 == D); - GGML_ASSERT(nec11 == 1); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - if (params->type == GGML_TASK_TYPE_INIT) { - return; - } - - if (params->type == GGML_TASK_TYPE_FINALIZE) { - return; - } - - // parallelize by a rows using ggml_vec_dot_f32 - - // total rows in a - const int nr = nea1*nea2*nea3; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int ir = ir0; ir < ir1; ++ir) { - // a indices - const int ia3 = ir/(nea2*nea1); - const int ia2 = (ir - ia3*nea2*nea1)/nea1; - const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1); - - float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32); - - for (int64_t ic = 0; ic < neb01; ++ic) { - // b0 indices - const int ib03 = ia3; - const int ib02 = ia2; - const int ib01 = ic; - - // S indices - const int i1 = ib01; - - ggml_vec_dot_f16(nea0, - S + i1, 0, - (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0, - (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1); - } - - ggml_vec_add_f32(neb01, S, S, (float *) b1->data); - //ggml_vec_gelu_f32(neb01, S, S); - - ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M); - - for (int64_t i = 0; i < M; i++) { - S16[i] = GGML_FP32_TO_FP16(S[i]); - } - - ggml_vec_gelu_f16(neb01, S16, S16); - - { - // dst indices - const int i1 = ia1; - const int i2 = ia2; - const int i3 = ia3; - - for (int64_t ic = 0; ic < nec01; ++ic) { - - ggml_vec_dot_f16(neb01, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0, - (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0, - S16, 0, 1); - } - - ggml_vec_add_f32(nec01, - (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), - (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), - (float *) c1->data); - } - } -} - -static void ggml_compute_forward_flash_ff( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * b0 = dst->src[1]; - - switch (b0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_flash_ff_f16(params, dst); - } break; - case GGML_TYPE_F32: - { - GGML_ASSERT(false); // TODO - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - // ggml_compute_forward_flash_attn_back static void ggml_compute_forward_flash_attn_back_f32( @@ -18065,21 +17446,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_leaky_relu(params, tensor); } break; - case GGML_OP_FLASH_ATTN: - { - const int32_t t = ggml_get_op_params_i32(tensor, 0); - GGML_ASSERT(t == 0 || t == 1); - const bool masked = t != 0; - ggml_compute_forward_flash_attn(params, masked, tensor); - } break; case GGML_OP_FLASH_ATTN_EXT: { ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor); } break; - case GGML_OP_FLASH_FF: - { - ggml_compute_forward_flash_ff(params, tensor); - } break; case GGML_OP_FLASH_ATTN_BACK: { int32_t t = ggml_get_op_params_i32(tensor, 0); @@ -19086,7 +18456,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_FLASH_ATTN: case GGML_OP_FLASH_ATTN_EXT: { struct ggml_tensor * flash_grad = NULL; @@ -19140,10 +18509,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor zero_table); } } break; - case GGML_OP_FLASH_FF: - { - GGML_ASSERT(false); // not supported - } break; case GGML_OP_FLASH_ATTN_BACK: { GGML_ASSERT(false); // not supported @@ -19830,15 +19195,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_ { n_tasks = n_threads; } break; - case GGML_OP_FLASH_ATTN: case GGML_OP_FLASH_ATTN_EXT: { n_tasks = n_threads; } break; - case GGML_OP_FLASH_FF: - { - n_tasks = n_threads; - } break; case GGML_OP_FLASH_ATTN_BACK: { n_tasks = n_threads; @@ -20235,40 +19595,12 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03; cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12; } break; - case GGML_OP_FLASH_ATTN: - { - const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); - - if (node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 - } else if (node->src[1]->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 - } else if (node->src[1]->type == GGML_TYPE_BF16) { - cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 - } - } break; case GGML_OP_FLASH_ATTN_EXT: { const int64_t ne00 = node->src[0]->ne[0]; // D cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread } break; - case GGML_OP_FLASH_FF: - { - if (node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 - } else if (node->src[1]->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 - } else if (node->src[1]->type == GGML_TYPE_BF16) { - cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 - } - } break; case GGML_OP_FLASH_ATTN_BACK: { const int64_t D = node->src[0]->ne[0]; diff --git a/ggml.h b/ggml.h index 08835042c..be81e0c52 100644 --- a/ggml.h +++ b/ggml.h @@ -481,9 +481,7 @@ extern "C" { GGML_OP_ARGSORT, GGML_OP_LEAKY_RELU, - GGML_OP_FLASH_ATTN, GGML_OP_FLASH_ATTN_EXT, - GGML_OP_FLASH_FF, GGML_OP_FLASH_ATTN_BACK, GGML_OP_SSM_CONV, GGML_OP_SSM_SCAN, @@ -1761,13 +1759,6 @@ extern "C" { struct ggml_tensor * a, int k); - GGML_API struct ggml_tensor * ggml_flash_attn( - struct ggml_context * ctx, - struct ggml_tensor * q, - struct ggml_tensor * k, - struct ggml_tensor * v, - bool masked); - #define GGML_KQ_MASK_PAD 32 // q: [n_embd, n_batch, n_head, 1] @@ -1788,6 +1779,7 @@ extern "C" { struct ggml_tensor * a, enum ggml_prec prec); + // TODO: needs to be adapted to ggml_flash_attn_ext GGML_API struct ggml_tensor * ggml_flash_attn_back( struct ggml_context * ctx, struct ggml_tensor * q, @@ -1796,14 +1788,6 @@ extern "C" { struct ggml_tensor * d, bool masked); - GGML_API struct ggml_tensor * ggml_flash_ff( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b0, - struct ggml_tensor * b1, - struct ggml_tensor * c0, - struct ggml_tensor * c1); - GGML_API struct ggml_tensor * ggml_ssm_conv( struct ggml_context * ctx, struct ggml_tensor * s, diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp index 8ff76c891..21ca43be3 100644 --- a/tests/test-grad0.cpp +++ b/tests/test-grad0.cpp @@ -1515,90 +1515,50 @@ int main(int argc, const char ** argv) { } // flash_attn f32 - { - srand(seed); - const int nargs = 3; + // TODO: adapt to ggml_flash_attn_ext() changes + //{ + // srand(seed); + // const int nargs = 3; - int64_t ne2[4]; + // int64_t ne2[4]; - get_random_dims(ne2, 4); - int64_t D = ne2[0]; - int64_t N = ne2[1]; - int64_t M = ne2[2] + N; - int64_t B = ne2[3]; + // get_random_dims(ne2, 4); + // int64_t D = ne2[0]; + // int64_t N = ne2[1]; + // int64_t M = ne2[2] + N; + // int64_t B = ne2[3]; - for (int masked = 0; masked <= 1; ++masked) { - for (int ndims = 2; ndims <= 4; ++ndims) { - int max_nrep = (ndims >= 3) ? 2 : 1; - for (int nrep = 1; nrep < max_nrep; ++nrep) { - int64_t neq[4] = { D, N, B*nrep, ne[3] }; - int64_t nek[4] = { D, M, B, ne[3] }; - int64_t nev[4] = { M, D, B, ne[3] }; - if (ndims == 2) { - neq[2] = 1; neq[3] = 1; - nek[2] = 1; nek[3] = 1; - nev[2] = 1; nev[3] = 1; - } else if (ndims == 3) { - neq[3] = 1; - nek[3] = 1; - nev[3] = 1; - } - x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f); - x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f); - x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f); - ggml_set_param(ctx0, x[0]); - ggml_set_param(ctx0, x[1]); - ggml_set_param(ctx0, x[2]); + // for (int masked = 0; masked <= 1; ++masked) { + // for (int ndims = 2; ndims <= 4; ++ndims) { + // int max_nrep = (ndims >= 3) ? 2 : 1; + // for (int nrep = 1; nrep < max_nrep; ++nrep) { + // int64_t neq[4] = { D, N, B*nrep, ne[3] }; + // int64_t nek[4] = { D, M, B, ne[3] }; + // int64_t nev[4] = { M, D, B, ne[3] }; + // if (ndims == 2) { + // neq[2] = 1; neq[3] = 1; + // nek[2] = 1; nek[3] = 1; + // nev[2] = 1; nev[3] = 1; + // } else if (ndims == 3) { + // neq[3] = 1; + // nek[3] = 1; + // nev[3] = 1; + // } + // x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f); + // x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f); + // x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f); + // ggml_set_param(ctx0, x[0]); + // ggml_set_param(ctx0, x[1]); + // ggml_set_param(ctx0, x[2]); - struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); + // struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); - check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY); - } - } - } - } + // check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY); + // } + // } + // } + //} - // flash_attn f16, not yet fully implemented - if(0) - { - srand(seed); - const int nargs = 3; - - int64_t ne2[4]; - - get_random_dims(ne2, 4); - int64_t D = ne2[0]; - int64_t N = ne2[1]; - int64_t M = ne2[2] + N; - int64_t B = ne2[3]; - - for (int masked = 0; masked <= 1; ++masked) { - for (int ndims = 2; ndims <= 4; ++ndims) { - int64_t neq[4] = { D, N, B, ne[3] }; - int64_t nek[4] = { D, M, B, ne[3] }; - int64_t nev[4] = { M, D, B, ne[3] }; - if (ndims == 2) { - neq[2] = 1; neq[3] = 1; - nek[2] = 1; nek[3] = 1; - nev[2] = 1; nev[3] = 1; - } else if (ndims == 3) { - neq[3] = 1; - nek[3] = 1; - nev[3] = 1; - } - x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f); - x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f); - x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f); - ggml_set_param(ctx0, x[0]); - ggml_set_param(ctx0, x[1]); - ggml_set_param(ctx0, x[2]); - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); - - check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY); - } - } - } ggml_free(ctx0); } From 152da28ae54139e3754189b9e6e1c28e11277502 Mon Sep 17 00:00:00 2001 From: Brian Date: Thu, 23 May 2024 17:40:43 +1000 Subject: [PATCH 24/34] labeler.yml: add embedding label detector [no ci] (#7482) --- .github/labeler.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/labeler.yml b/.github/labeler.yml index fca60594f..a67f78044 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -62,6 +62,8 @@ server: ggml: - changed-files: - any-glob-to-any-file: + - ggml.c + - ggml.h - ggml-*.c - ggml-*.h - ggml-cuda/** @@ -71,3 +73,6 @@ nix: - "**/*.nix" - .github/workflows/nix-*.yml - .devops/nix/nixpkgs-instances.nix +embedding: + - changed-files: + - any-glob-to-any-file: examples/embedding/ From a61a94e543e3c6877c087e80fca27a0313ce5fd5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 May 2024 12:38:18 +0300 Subject: [PATCH 25/34] llama : rename n_ctx -> cache.size, less confusing (#0) --- llama.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index 37b3d58c6..3e09a2390 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2475,7 +2475,6 @@ static bool llama_kv_cache_init( static bool llama_kv_cache_find_slot( struct llama_kv_cache & cache, const struct llama_batch & batch) { - const uint32_t n_ctx = cache.size; const uint32_t n_tokens = batch.n_tokens; if (cache.recurrent) { @@ -2526,16 +2525,16 @@ static bool llama_kv_cache_find_slot( } // otherwise, one cell per token. - if (n_tokens > n_ctx) { - LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx); + if (n_tokens > cache.size) { + LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size); return false; } uint32_t n_tested = 0; while (true) { - if (cache.head + n_tokens > n_ctx) { - n_tested += n_ctx - cache.head; + if (cache.head + n_tokens > cache.size) { + n_tested += cache.size - cache.head; cache.head = 0; continue; } @@ -2554,7 +2553,7 @@ static bool llama_kv_cache_find_slot( break; } - if (n_tested >= n_ctx) { + if (n_tested >= cache.size) { //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); return false; } From 9b82476ee9e73065a759f8bcc4cf27ec7ab2ed8c Mon Sep 17 00:00:00 2001 From: fairydreaming <166155368+fairydreaming@users.noreply.github.com> Date: Thu, 23 May 2024 11:49:53 +0200 Subject: [PATCH 26/34] Add missing inference support for GPTNeoXForCausalLM (Pythia and GPT-NeoX base models) (#7461) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * convert-hf : add conversion of bloom-style qkv tensor to gpt-style qkv (code borrowed from BloomModel) * llama : add inference support for LLM_ARCH_GPTNEOX * llama : add model types for every Pythia variant and GPT-NeoX Co-authored-by: Stanisław Szymczyk --- convert-hf-to-gguf.py | 38 +++++++ llama.cpp | 236 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 273 insertions(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index daad1c4fc..5a00a5e89 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -673,6 +673,44 @@ class GPTNeoXModel(Model): self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + + tensors: list[tuple[str, Tensor]] = [] + + if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) + data_torch = torch.cat( + ( + qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.weight") + elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name): + qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) + data_torch = torch.cat( + ( + qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.bias") + + tensors.append((self.map_tensor_name(name), data_torch)) + + return tensors + @Model.register("BloomForCausalLM") class BloomModel(Model): diff --git a/llama.cpp b/llama.cpp index 3e09a2390..5ff186a57 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1692,17 +1692,24 @@ static llama_state g_state; // available llama models enum e_model { MODEL_UNKNOWN, + MODEL_14M, MODEL_17M, MODEL_22M, MODEL_33M, + MODEL_70M, MODEL_109M, MODEL_137M, + MODEL_160M, MODEL_335M, + MODEL_410M, MODEL_0_5B, MODEL_1B, + MODEL_1_4B, MODEL_2B, + MODEL_2_8B, MODEL_3B, MODEL_4B, + MODEL_6_9B, MODEL_7B, MODEL_8B, MODEL_12B, @@ -1734,6 +1741,7 @@ static const size_t GiB = 1024*MiB; struct llama_hparams { bool vocab_only; bool rope_finetuned; + bool use_par_res; uint32_t n_vocab; uint32_t n_ctx_train; // context size the model was trained on @@ -3773,17 +3781,24 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { static const char * llama_model_type_name(e_model type) { switch (type) { + case MODEL_14M: return "14M"; case MODEL_17M: return "17M"; case MODEL_22M: return "22M"; case MODEL_33M: return "33M"; + case MODEL_70M: return "70M"; case MODEL_109M: return "109M"; case MODEL_137M: return "137M"; + case MODEL_160M: return "160M"; case MODEL_335M: return "335M"; + case MODEL_410M: return "410M"; case MODEL_0_5B: return "0.5B"; case MODEL_1B: return "1B"; + case MODEL_1_4B: return "1.4B"; case MODEL_2B: return "2B"; + case MODEL_2_8B: return "2.8B"; case MODEL_3B: return "3B"; case MODEL_4B: return "4B"; + case MODEL_6_9B: return "6.9B"; case MODEL_7B: return "7B"; case MODEL_8B: return "8B"; case MODEL_12B: return "12B"; @@ -4282,6 +4297,52 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_GPTNEOX: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res); + switch (hparams.n_layer) { + case 6: + switch (hparams.n_ff) { + case 512: model.type = e_model::MODEL_14M; break; + case 2048: model.type = e_model::MODEL_70M; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 12: + switch (hparams.n_ff) { + case 3072: model.type = e_model::MODEL_160M; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 16: + switch (hparams.n_ff) { + case 8192: model.type = e_model::MODEL_1B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 24: + switch (hparams.n_ff) { + case 4096: model.type = e_model::MODEL_410M; break; + case 8192: model.type = e_model::MODEL_1_4B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 32: + switch (hparams.n_ff) { + case 10240: model.type = e_model::MODEL_2_8B; break; + case 16384: model.type = e_model::MODEL_6_9B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 36: + switch (hparams.n_ff) { + case 20480: model.type = e_model::MODEL_12B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 44: + switch (hparams.n_ff) { + case 24576: model.type = e_model::MODEL_20B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -6033,6 +6094,41 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_GPTNEOX: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); + + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -10560,6 +10656,140 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_gptneox() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // ffn + if (hparams.use_par_res) { + // attention and ffn are computed in parallel + // x = x + attn(ln1(x)) + ffn(ln2(x)) + + struct ggml_tensor * attn_out = cur; + + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "ffn_out", il); + + inpL = ggml_add(ctx0, cur, attn_out); + cb(inpL, "l_out", il); + } else { + // attention and ffn are computed sequentially + // x = x + attn(ln1(x)) + // x = x + ffn(ln2(x)) + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + } + + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -10770,6 +11000,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_olmo(); } break; + case LLM_ARCH_GPTNEOX: + { + result = llm.build_gptneox(); + } break; default: GGML_ASSERT(false); } @@ -15762,7 +15996,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { // these models do not use RoPE case LLM_ARCH_GPT2: case LLM_ARCH_GPTJ: - case LLM_ARCH_GPTNEOX: case LLM_ARCH_MPT: case LLM_ARCH_REFACT: case LLM_ARCH_BLOOM: @@ -15798,6 +16031,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_PHI3: case LLM_ARCH_GEMMA: case LLM_ARCH_STARCODER2: + case LLM_ARCH_GPTNEOX: return LLAMA_ROPE_TYPE_NEOX; // all model arches should be listed explicitly here From dacfcebd6022175848e978f82811a244f1033038 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Thu, 23 May 2024 15:12:43 +0300 Subject: [PATCH 27/34] readme : add GPT-NeoX + Pythia to the list of supported models (#7491) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index f4088c05e..ccd12e211 100644 --- a/README.md +++ b/README.md @@ -127,6 +127,7 @@ Typically finetunes of the base models below are supported as well. - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B) - [x] [OLMo](https://allenai.org/olmo) +- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia) (instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md)) From 55ac3b7aeaf52f19786ed96e885d89521fc0f6c8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 May 2024 15:28:14 +0300 Subject: [PATCH 28/34] ci : use Pythia models instead of OpenLlama (#7470) * ci : start using Pythia models over OpenLlama ggml-ci * ci : disable q2_k ppl tests * ci : use convert-hf-to-gguf.py * ci : update gg_get_model * ci : fix convert outfile name ggml-ci * llama : gptneox arch use F32 attn prec ggml-ci --- ci/run.sh | 424 +++++++++++++++++++++++++++++++++++------------------- llama.cpp | 4 +- 2 files changed, 282 insertions(+), 146 deletions(-) diff --git a/ci/run.sh b/ci/run.sh index 79dcd0772..940299025 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -202,12 +202,15 @@ function gg_sum_test_scripts_release { } function gg_get_model { - local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf" - local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf" - if [[ -s $gguf_3b ]]; then - echo -n "$gguf_3b" - elif [[ -s $gguf_7b ]]; then - echo -n "$gguf_7b" + local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf" + local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf" + local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf" + if [[ -s $gguf_0 ]]; then + echo -n "$gguf_0" + elif [[ -s $gguf_1 ]]; then + echo -n "$gguf_1" + elif [[ -s $gguf_2 ]]; then + echo -n "$gguf_2" else echo >&2 "No model found. Can't run gg_run_ctest_with_model." exit 1 @@ -256,139 +259,6 @@ function gg_sum_ctest_with_model_release { gg_printf '```\n' } -# open_llama_3b_v2 - -function gg_run_open_llama_3b_v2 { - cd ${SRC} - - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json - - gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip - unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ - head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw - - path_models="../models-mnt/open-llama/3B-v2" - path_wiki="../models-mnt/wikitext/wikitext-2-raw" - - rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release - - set -e - - (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - - python3 ../convert.py ${path_models} - - model_f16="${path_models}/ggml-model-f16.gguf" - model_q8_0="${path_models}/ggml-model-q8_0.gguf" - model_q4_0="${path_models}/ggml-model-q4_0.gguf" - model_q4_1="${path_models}/ggml-model-q4_1.gguf" - model_q5_0="${path_models}/ggml-model-q5_0.gguf" - model_q5_1="${path_models}/ggml-model-q5_1.gguf" - model_q2_k="${path_models}/ggml-model-q2_k.gguf" - model_q3_k="${path_models}/ggml-model-q3_k.gguf" - model_q4_k="${path_models}/ggml-model-q4_k.gguf" - model_q5_k="${path_models}/ggml-model-q5_k.gguf" - model_q6_k="${path_models}/ggml-model-q6_k.gguf" - - wiki_test_60="${path_wiki}/wiki.test-60.raw" - - ./bin/quantize ${model_f16} ${model_q8_0} q8_0 - ./bin/quantize ${model_f16} ${model_q4_0} q4_0 - ./bin/quantize ${model_f16} ${model_q4_1} q4_1 - ./bin/quantize ${model_f16} ${model_q5_0} q5_0 - ./bin/quantize ${model_f16} ${model_q5_1} q5_1 - ./bin/quantize ${model_f16} ${model_q2_k} q2_k - ./bin/quantize ${model_f16} ${model_q3_k} q3_k - ./bin/quantize ${model_f16} ${model_q4_k} q4_k - ./bin/quantize ${model_f16} ${model_q5_k} q5_k - ./bin/quantize ${model_f16} ${model_q6_k} q6_k - - (time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - - (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - - function check_ppl { - qnt="$1" - ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) - - if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then - printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl" - return 20 - fi - - printf ' - %s @ %s OK\n' "$qnt" "$ppl" - return 0 - } - - check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - - cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log - - set +e -} - -function gg_sum_open_llama_3b_v2 { - gg_printf '### %s\n\n' "${ci}" - - gg_printf 'OpenLLaMA 3B-v2:\n' - gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" - gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" - gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" - gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" - gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" - gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" - gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)" - gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)" - gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)" - gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)" - gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)" - gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" - gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" - gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" - gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" -} - # open_llama_7b_v2 # requires: GG_BUILD_CUDA @@ -417,7 +287,7 @@ function gg_run_open_llama_7b_v2 { (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - python3 ../convert.py ${path_models} + python3 ../convert.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf model_f16="${path_models}/ggml-model-f16.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf" @@ -526,6 +396,272 @@ function gg_sum_open_llama_7b_v2 { gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" } +# pythia_1.4b + +function gg_run_pythia_1_4b { + cd ${SRC} + + gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json + gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json + gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json + gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json + gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin + + gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip + unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ + head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw + + path_models="../models-mnt/pythia/1.4B" + path_wiki="../models-mnt/wikitext/wikitext-2-raw" + + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release + + set -e + + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + + python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf + + model_f16="${path_models}/ggml-model-f16.gguf" + model_q8_0="${path_models}/ggml-model-q8_0.gguf" + model_q4_0="${path_models}/ggml-model-q4_0.gguf" + model_q4_1="${path_models}/ggml-model-q4_1.gguf" + model_q5_0="${path_models}/ggml-model-q5_0.gguf" + model_q5_1="${path_models}/ggml-model-q5_1.gguf" + model_q2_k="${path_models}/ggml-model-q2_k.gguf" + model_q3_k="${path_models}/ggml-model-q3_k.gguf" + model_q4_k="${path_models}/ggml-model-q4_k.gguf" + model_q5_k="${path_models}/ggml-model-q5_k.gguf" + model_q6_k="${path_models}/ggml-model-q6_k.gguf" + + wiki_test_60="${path_wiki}/wiki.test-60.raw" + + ./bin/quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/quantize ${model_f16} ${model_q2_k} q2_k + ./bin/quantize ${model_f16} ${model_q3_k} q3_k + ./bin/quantize ${model_f16} ${model_q4_k} q4_k + ./bin/quantize ${model_f16} ${model_q5_k} q5_k + ./bin/quantize ${model_f16} ${model_q6_k} q6_k + + (time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + + (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + + function check_ppl { + qnt="$1" + ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) + + if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then + printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl" + return 20 + fi + + printf ' - %s @ %s OK\n' "$qnt" "$ppl" + return 0 + } + + check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model + check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + + cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log + + set +e +} + +function gg_sum_pythia_1_4b { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Pythia 1.4B:\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" + gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" + gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" + gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" + gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)" + gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)" + gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)" + gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)" + gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)" + gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" + gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" + gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" + gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" +} + +# pythia_2_8b +# requires: GG_BUILD_CUDA + +function gg_run_pythia_2_8b { + cd ${SRC} + + gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json + gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json + gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json + gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json + gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin + + gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip + unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ + + path_models="../models-mnt/pythia/2.8B" + path_wiki="../models-mnt/wikitext/wikitext-2-raw" + + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release + + set -e + + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + + python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf + + model_f16="${path_models}/ggml-model-f16.gguf" + model_q8_0="${path_models}/ggml-model-q8_0.gguf" + model_q4_0="${path_models}/ggml-model-q4_0.gguf" + model_q4_1="${path_models}/ggml-model-q4_1.gguf" + model_q5_0="${path_models}/ggml-model-q5_0.gguf" + model_q5_1="${path_models}/ggml-model-q5_1.gguf" + model_q2_k="${path_models}/ggml-model-q2_k.gguf" + model_q3_k="${path_models}/ggml-model-q3_k.gguf" + model_q4_k="${path_models}/ggml-model-q4_k.gguf" + model_q5_k="${path_models}/ggml-model-q5_k.gguf" + model_q6_k="${path_models}/ggml-model-q6_k.gguf" + + wiki_test="${path_wiki}/wiki.test.raw" + + ./bin/quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/quantize ${model_f16} ${model_q2_k} q2_k + ./bin/quantize ${model_f16} ${model_q3_k} q3_k + ./bin/quantize ${model_f16} ${model_q4_k} q4_k + ./bin/quantize ${model_f16} ${model_q5_k} q5_k + ./bin/quantize ${model_f16} ${model_q6_k} q6_k + + (time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + + (time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + + function check_ppl { + qnt="$1" + ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) + + if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then + printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl" + return 20 + fi + + printf ' - %s @ %s OK\n' "$qnt" "$ppl" + return 0 + } + + check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model + check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + + cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log + + set +e +} + +function gg_sum_pythia_2_8b { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Pythia 2.8B:\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" + gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" + gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" + gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" + gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)" + gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)" + gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)" + gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)" + gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)" + gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" + gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" + gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" + gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" +} + # bge-small function gg_run_embd_bge_small { @@ -552,7 +688,7 @@ function gg_run_embd_bge_small { (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - python3 ../convert-hf-to-gguf.py ${path_models} + python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf model_f16="${path_models}/ggml-model-f16.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf" @@ -606,10 +742,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then if [ -z ${GG_BUILD_CUDA} ]; then - #test $ret -eq 0 && gg_run open_llama_3b_v2 - date # dummy + test $ret -eq 0 && gg_run pythia_1_4b else - test $ret -eq 0 && gg_run open_llama_7b_v2 + test $ret -eq 0 && gg_run pythia_2_8b + #test $ret -eq 0 && gg_run open_llama_7b_v2 fi test $ret -eq 0 && gg_run ctest_with_model_debug test $ret -eq 0 && gg_run ctest_with_model_release diff --git a/llama.cpp b/llama.cpp index 5ff186a57..1f9e10eed 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6718,7 +6718,7 @@ static struct ggml_tensor * llm_build_kqv( cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias); - if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) { + if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); } @@ -6727,7 +6727,7 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); - if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) { + if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847 ggml_mul_mat_set_prec(kq, GGML_PREC_F32); From 3015851c5ac7334fb544a23a70a284c117b87044 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 23 May 2024 14:29:26 +0200 Subject: [PATCH 29/34] llama : add getters for n_threads/n_threads_batch (#7464) * llama : add getters for n_threads/n_threads_batch This commit adds two new functions to the llama API. The functions can be used to get the number of threads used for generating a single token and the number of threads used for prompt and batch processing (multiple tokens). The motivation for this is that we want to be able to get the number of threads that the a context is using. The main use case is for a testing/verification that the number of threads is set correctly. Signed-off-by: Daniel Bevenius * squash! llama : add getters for n_threads/n_threads_batch Rename the getters to llama_n_threads and llama_n_threads_batch. Signed-off-by: Daniel Bevenius --------- Signed-off-by: Daniel Bevenius --- llama.cpp | 8 ++++++++ llama.h | 6 ++++++ 2 files changed, 14 insertions(+) diff --git a/llama.cpp b/llama.cpp index 1f9e10eed..e540c1b39 100644 --- a/llama.cpp +++ b/llama.cpp @@ -17410,6 +17410,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_ ctx->cparams.n_threads_batch = n_threads_batch; } +uint32_t llama_n_threads(struct llama_context * ctx) { + return ctx->cparams.n_threads; +} + +uint32_t llama_n_threads_batch(struct llama_context * ctx) { + return ctx->cparams.n_threads_batch; +} + void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { ctx->abort_callback = abort_callback; ctx->abort_callback_data = abort_callback_data; diff --git a/llama.h b/llama.h index b7bf2afcb..16cece5db 100644 --- a/llama.h +++ b/llama.h @@ -759,6 +759,12 @@ extern "C" { // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch); + // Get the number of threads used for generation of a single token. + LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx); + + // Get the number of threads used for prompt and batch processing (multiple token). + LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx); + // Set whether to use causal attention or not // If set to true, the model will only attend to the past tokens LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn); From 8b94e799dfa482adf63419df4905dc79b37e179f Mon Sep 17 00:00:00 2001 From: Raj Hammeer Singh Hada Date: Thu, 23 May 2024 18:00:13 +0530 Subject: [PATCH 30/34] readme : add Bunny in supported models [no ci] (#7469) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ccd12e211..461259afb 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,7 @@ Typically finetunes of the base models below are supported as well. - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL) - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM) - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2) +- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny) **HTTP server** From 007489e895bad02e4e54758bf0bdf2d6a4cdb7c1 Mon Sep 17 00:00:00 2001 From: Tristan Druyen Date: Thu, 23 May 2024 16:15:15 +0200 Subject: [PATCH 31/34] Fix phi3 chat template confusion with zephyr (#7449) * Fix phi3 template matching vs zephyr * Add regression test for new phi3 chat template * Implement review suggestions * Fix phi3 jinja test templates & match by <|end|> * Apply suggestion Co-authored-by: Xuan Son Nguyen * Add all phi3 template variants in tests * Remove unneeded message trimming Co-authored-by: Xuan Son Nguyen * Fix tests to not expect trimmed messages --------- Co-authored-by: Xuan Son Nguyen --- llama.cpp | 18 +++++++++--------- tests/test-chat-template.cpp | 20 ++++++++++++++++---- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/llama.cpp b/llama.cpp index e540c1b39..15c660775 100644 --- a/llama.cpp +++ b/llama.cpp @@ -17852,6 +17852,15 @@ static int32_t llama_chat_apply_template_internal( } } // llama2 templates seem to not care about "add_generation_prompt" + } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) { + // Phi 3 + for (auto message : chat) { + std::string role(message->role); + ss << "<|" << role << "|>\n" << message->content << "<|end|>\n"; + } + if (add_ass) { + ss << "<|assistant|>\n"; + } } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) { // zephyr template for (auto message : chat) { @@ -17984,15 +17993,6 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; } - } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) { - // Phi 3 - for (auto message : chat) { - std::string role(message->role); - ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n"; - } - if (add_ass) { - ss << "<|assistant|>\n"; - } } else { // template not supported return -1; diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index 4fe9183b9..cef9a650b 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -49,8 +49,14 @@ int main(void) { "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}", // Llama-3 "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}", - // Phi-3 - "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + ' ' + message['content'] + '<|end|> ' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|> ' }}{% else %}{{ eos_token }}{% endif %}" + //Phi-3-mini + "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}", + //Phi-3-small + "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + //Phi-3-medium + "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}", + //Phi-3-vision + "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}" }; std::vector expected_output = { // teknium/OpenHermes-2.5-Mistral-7B @@ -79,8 +85,14 @@ int main(void) { "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", // Llama 3 "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - // Phi 3 - "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\nI am an assistant<|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", + //Phi-3-mini + "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", + //Phi-3-small + "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", + //Phi-3-medium + "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", + //Phi-3-vision + "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", }; std::vector formatted_chat(1024); int32_t res; From 1debe72737ea131cb52975da3d53ed3a835df3a6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 May 2024 17:17:43 +0300 Subject: [PATCH 32/34] ggml : silence UB sanitizer error during iq2_xxs quantization (#0) --- ggml-quants.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-quants.c b/ggml-quants.c index 88f58a339..bb01ce93c 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -12144,7 +12144,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict printf("\n"); GGML_ASSERT(false); } - q2[2*ib+0] |= (grid_index << 8*k); + q2[2*ib+0] |= ((uint32_t) grid_index << 8*k); q2[2*ib+1] |= (block_signs[k] << 7*k); } GGML_ASSERT(scale >= 0); From 74f33adf5f8b20b08fc5a6aa17ce081abe86ef2f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 May 2024 17:43:18 +0300 Subject: [PATCH 33/34] readme : remove trailing space (#7469) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 461259afb..2ee267fdf 100644 --- a/README.md +++ b/README.md @@ -141,7 +141,7 @@ Typically finetunes of the base models below are supported as well. - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL) - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM) - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2) -- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny) +- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny) **HTTP server** From 0df0aa8e43c3378975269a51f9b876c8692e70da Mon Sep 17 00:00:00 2001 From: Neo Zhang <14088817+arthw@users.noreply.github.com> Date: Fri, 24 May 2024 10:06:56 +0800 Subject: [PATCH 34/34] add build shared lib in win release package (#7438) --- examples/sycl/win-build-sycl.bat | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sycl/win-build-sycl.bat b/examples/sycl/win-build-sycl.bat index 1b0dc41ba..b8037aae8 100644 --- a/examples/sycl/win-build-sycl.bat +++ b/examples/sycl/win-build-sycl.bat @@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR :: for FP16 :: faster for long-prompt inference -:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON +:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON :: for FP32 -cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release +cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release if %errorlevel% neq 0 goto ERROR :: build example/main only :: make main