diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index c88134dbb..eb8c4b472 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -9,5 +9,3 @@ contact_links: - name: Want to contribute? url: https://github.com/ggerganov/llama.cpp/wiki/contribute about: Head to the contribution guide page of the wiki for areas you can help with - - diff --git a/Makefile b/Makefile index 8ae4f1dc4..2730d8b60 100644 --- a/Makefile +++ b/Makefile @@ -62,6 +62,11 @@ TEST_TARGETS = \ tests/test-tokenizer-1-bpe \ tests/test-tokenizer-1-spm +# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned +LEGACY_TARGETS = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ + simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ + retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm + # Deprecation aliases ifdef LLAMA_CUBLAS $(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.) @@ -1086,6 +1091,7 @@ clean: rm -vrf ggml/src/ggml-cuda/template-instances/*.o rm -rvf $(BUILD_TARGETS) rm -rvf $(TEST_TARGETS) + rm -rvf $(LEGACY_TARGETS) find examples pocs -type f -name "*.o" -delete # diff --git a/common/common.cpp b/common/common.cpp index 5a0d0ee03..2c05a4d4a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -757,7 +757,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.cache_type_v = argv[++i]; return true; } - if (arg == "--multiline-input") { + if (arg == "-mli" || arg == "--multiline-input") { params.multiline_input = true; return true; } diff --git a/common/common.h b/common/common.h index 627b7ed85..65c0ef81a 100644 --- a/common/common.h +++ b/common/common.h @@ -459,4 +459,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha void yaml_dump_non_result_info( FILE * stream, const gpt_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); - diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index 003edae78..9eb406cb8 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -88,6 +88,7 @@ models = [ {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B {"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", }, {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", }, + {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", }, ] diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 4c63bfc85..bae145589 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -490,6 +490,9 @@ class Model: if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": # ref: https://huggingface.co/LumiOpen/Viking-7B res = "viking" + if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": + # ref: https://huggingface.co/core42/jais-13b + res = "jais" if res is None: logger.warning("\n") @@ -1939,7 +1942,7 @@ class Phi3MiniModel(Model): if len(rope_scaling_type) == 0: raise KeyError('Missing the required key rope_scaling.type') - if rope_scaling_type 
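For context on the `jais` entries added above: convert-hf-to-gguf-update.py downloads each listed tokenizer, encodes a fixed check string, and hashes the resulting token ids, and convert-hf-to-gguf.py then matches that hash to pick a pre-tokenizer name. A minimal sketch of that matching step, assuming the helper behaves roughly like the script's `get_vocab_base_pre()`; the hash values are copied from the diff, everything else is illustrative:

```python
from hashlib import sha256
from transformers import AutoTokenizer  # same dependency the convert scripts use

# hashes copied from the diff above; the real table in convert-hf-to-gguf.py is longer
KNOWN_PRE_TOKENIZERS = {
    "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": "viking",
    "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": "jais",
}

def detect_pre_tokenizer(model_dir: str, chktxt: str) -> str:
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    # hash of the token ids produced for the fixed check text
    chkhsh = sha256(str(tokenizer.encode(chktxt)).encode()).hexdigest()
    try:
        return KNOWN_PRE_TOKENIZERS[chkhsh]
    except KeyError:
        raise ValueError(f"unknown pre-tokenizer, chkhsh = {chkhsh}") from None
```

When the hash is unknown the convert script warns and asks for the update script to be re-run, which is why a new model such as JAIS needs both the update-script entry and the hash check added in this patch.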
== 'su': + if rope_scaling_type == 'su' or rope_scaling_type == 'longrope': attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 elif rope_scaling_type == 'yarn': attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0 @@ -2968,6 +2971,96 @@ class T5Model(Model): return [(self.map_tensor_name(name), data_torch)] +@Model.register("JAISLMHeadModel") +class JaisModel(Model): + model_arch = gguf.MODEL_ARCH.JAIS + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # SwigLU activation + assert self.hparams["activation_function"] == "swiglu" + # ALiBi position embedding + assert self.hparams["position_embedding_type"] == "alibi" + + # Embeddings scale + self.embeddings_scale = 1.0 + # note: For some JAIS flavors, output is tied to (same as) wte in original model + self.output_is_wte = False + if 'mup_embeddings_scale' in self.hparams: + self.output_is_wte = True # Hack (?) + self.embeddings_scale = self.hparams['mup_embeddings_scale'] + elif 'embeddings_scale' in self.hparams: + self.embeddings_scale = self.hparams['embeddings_scale'] + else: + assert False + + self.width_scale = 1.0 + if 'mup_output_alpha' in self.hparams: + assert 'mup_width_scale' in self.hparams + self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale'] + elif 'width_scale' in self.hparams: + self.width_scale = self.hparams['width_scale'] + else: + assert False + + self.max_alibi_bias = 8.0 + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"]) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + tensors: list[tuple[str, Tensor]] = [] + + # we don't need these + if name.endswith((".attn.bias")): + return tensors + + if name.endswith(("relative_pe.slopes")): + # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation) + # Some other models has max_alibi_bias spelled out explicitly in the hyperparams, + # but Jais's PyTorch model simply precalculates the slope values and places them + # in relative_pes.slopes + n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"])) + first_val = float(data_torch._data[0]) + self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) + + return tensors + + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): + data_torch = data_torch.transpose(1, 0) + + new_name = self.map_tensor_name(name) + + if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): + tensors.append((new_name, data_torch * self.embeddings_scale)) + if self.output_is_wte: + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale)) + elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): + assert not self.output_is_wte + tensors.append((new_name, data_torch * self.width_scale)) + else: + tensors.append((new_name, data_torch)) + + return tensors + + def write_tensors(self): + 
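The `relative_pe.slopes` handling above recovers `max_alibi_bias` by inverting the standard ALiBi slope formula from the first slope value alone. A small self-contained check of that round trip, assuming the usual ALiBi definition for a power-of-two head count (32 heads is just an example):

```python
import math

def alibi_slopes(n_head: int, max_bias: float = 8.0) -> list[float]:
    # standard ALiBi slopes: slope_i = 2^(-max_bias * (i + 1) / n),
    # with n the closest power of two below n_head
    n = 2 ** math.floor(math.log2(n_head))
    return [2.0 ** (-max_bias * (i + 1) / n) for i in range(n)]

def recover_max_bias(first_slope: float, n_head: int) -> float:
    # inverse used in JaisModel.modify_tensors(): only the first slope is needed
    n_head_closest_log2 = 2 ** math.floor(math.log2(n_head))
    return -round(math.log2(first_slope) * n_head_closest_log2)

slopes = alibi_slopes(32)
assert recover_max_bias(slopes[0], 32) == 8
```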
super().write_tensors() + self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) + + ###### CONVERSION LOGIC ###### @@ -3123,7 +3216,8 @@ def main() -> None: "auto": gguf.LlamaFileType.GUESSED, } - if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"): + is_split = args.split_max_tensors > 0 or args.split_max_size != "0" + if args.use_temp_file and is_split: logger.error("Error: Cannot use temp file when splitting") sys.exit(1) @@ -3160,11 +3254,12 @@ def main() -> None: if args.vocab_only: logger.info("Exporting model vocab...") model_instance.write_vocab() - logger.info("Model vocab successfully exported.") + logger.info(f"Model vocab successfully exported to {model_instance.fname_out}") else: logger.info("Exporting model...") model_instance.write() - logger.info("Model successfully exported.") + out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out + logger.info(f"Model successfully exported to {out_path}") if __name__ == '__main__': diff --git a/examples/embedding/README.md b/examples/embedding/README.md index 86df18958..e3705b454 100644 --- a/examples/embedding/README.md +++ b/examples/embedding/README.md @@ -58,4 +58,3 @@ The above command will output space-separated float values. ```powershell embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null ``` - diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index ca71dd687..0e682154d 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -659,4 +659,3 @@ int main(int argc, char ** argv) { return 0; } - diff --git a/examples/lookup/README.md b/examples/lookup/README.md index 5bfb0de93..71c345c03 100644 --- a/examples/lookup/README.md +++ b/examples/lookup/README.md @@ -10,4 +10,3 @@ More info: https://github.com/ggerganov/llama.cpp/pull/4484 https://github.com/ggerganov/llama.cpp/issues/4226 - diff --git a/examples/main-cmake-pkg/.gitignore b/examples/main-cmake-pkg/.gitignore index e32c11c7f..67c01d64c 100644 --- a/examples/main-cmake-pkg/.gitignore +++ b/examples/main-cmake-pkg/.gitignore @@ -48,4 +48,3 @@ build*/ out/ tmp/ - diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt index a97ded365..3b38db292 100644 --- a/examples/main-cmake-pkg/CMakeLists.txt +++ b/examples/main-cmake-pkg/CMakeLists.txt @@ -30,4 +30,3 @@ target_include_directories(${TARGET} PRIVATE ${_common_path}) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) - diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index efde8dfdf..dbe445391 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1991,6 +1991,12 @@ int main(int argc, char ** argv) { params.n_batch = std::min(params.n_batch, n_kv); } else { params.n_batch = std::min(params.n_batch, params.n_ctx); + if (params.kl_divergence) { + params.n_parallel = 1; + } else { + // ensure there's at least enough seq_ids for HellaSwag + params.n_parallel = std::max(4, params.n_parallel); + } } if (params.ppl_stride > 0) { @@ -2015,9 +2021,6 @@ int main(int argc, char ** argv) { llama_model * model; llama_context * ctx; - // ensure there's at least enough seq_ids for HellaSwag - params.n_parallel = std::max(4, params.n_parallel); - // load 
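To make the `mup_*` handling in `JaisModel.__init__` above easier to follow, here is the same selection logic as a stand-alone function; the config values in the call are illustrative, not taken from a specific JAIS checkpoint:

```python
def jais_scales(hparams: dict) -> tuple[float, float, bool]:
    """Return (embeddings_scale, width_scale, output_is_wte) the way JaisModel.__init__ does."""
    if "mup_embeddings_scale" in hparams:
        # muP flavor: the diff also treats the output head as tied to wte in this case
        embeddings_scale, output_is_wte = hparams["mup_embeddings_scale"], True
    else:
        embeddings_scale, output_is_wte = hparams["embeddings_scale"], False

    if "mup_output_alpha" in hparams:
        width_scale = hparams["mup_output_alpha"] * hparams["mup_width_scale"]
    else:
        width_scale = hparams["width_scale"]

    return embeddings_scale, width_scale, output_is_wte

# modify_tensors() then exports token_embd as wte * embeddings_scale and the (tied)
# output tensor as wte * width_scale, so no extra scaling is needed at load time.
print(jais_scales({"mup_embeddings_scale": 14.6, "mup_output_alpha": 2.22, "mup_width_scale": 0.0357}))
```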
the model and apply lora adapter, if any std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == NULL) { diff --git a/examples/server-embd.py b/examples/server-embd.py index 118e04271..a9a36a44c 100644 --- a/examples/server-embd.py +++ b/examples/server-embd.py @@ -31,4 +31,3 @@ for i in range(n-1): embedding2 = np.array(result[j]) similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2)) print(f"Similarity between {i} and {j}: {similarity:.2f}") - diff --git a/examples/server/tests/features/passkey.feature b/examples/server/tests/features/passkey.feature index 1bde7aab8..6a5a84e6a 100644 --- a/examples/server/tests/features/passkey.feature +++ b/examples/server/tests/features/passkey.feature @@ -52,4 +52,3 @@ Feature: Passkey / Self-extend with context shift #| TheBloke/Llama-2-7B-GGUF | llama-2-7b.Q2_K.gguf | 4096 | 3 | 16384 | 512 | 4 | 512 | 500 | 300 | 1234 | 5 | 1234 | #| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768 | 2 | 16384 | 512 | 4 | 512 | 500 | 100 | 0987 | 5 | 0 # 987 | - diff --git a/examples/server/themes/buttons-top/index.html b/examples/server/themes/buttons-top/index.html index 6af30d307..8334bcde5 100644 --- a/examples/server/themes/buttons-top/index.html +++ b/examples/server/themes/buttons-top/index.html @@ -1054,4 +1054,3 @@ - diff --git a/examples/server/themes/wild/index.html b/examples/server/themes/wild/index.html index 772e716cd..8361c5774 100644 --- a/examples/server/themes/wild/index.html +++ b/examples/server/themes/wild/index.html @@ -1058,4 +1058,3 @@ - diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh index da0e4aaba..111366fb0 100755 --- a/examples/sycl/run-llama2.sh +++ b/examples/sycl/run-llama2.sh @@ -34,4 +34,3 @@ fi #use multiple GPUs with same max compute units #ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 - diff --git a/examples/sycl/win-build-sycl.bat b/examples/sycl/win-build-sycl.bat index cdae5a528..17dd1ff5c 100644 --- a/examples/sycl/win-build-sycl.bat +++ b/examples/sycl/win-build-sycl.bat @@ -31,4 +31,3 @@ exit /B 0 :ERROR echo comomand error: %errorlevel% exit /B %errorlevel% - diff --git a/examples/sycl/win-run-llama2.bat b/examples/sycl/win-run-llama2.bat index 1d4d7d2cd..f0385cdf0 100644 --- a/examples/sycl/win-run-llama2.bat +++ b/examples/sycl/win-run-llama2.bat @@ -7,5 +7,3 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:" .\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0 - - diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h index e7543ae79..6c3226c37 100644 --- a/ggml/include/ggml-metal.h +++ b/ggml/include/ggml-metal.h @@ -63,4 +63,3 @@ GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend); #ifdef __cplusplus } #endif - diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 649ef5a08..1c9ccc8a1 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -2711,27 +2711,40 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: { - struct ggml_tensor * a; - struct ggml_tensor * b; + struct ggml_tensor * a = op->src[0]; if (op->op == GGML_OP_MUL_MAT) { - a = op->src[0]; - b = op->src[1]; - } else { - a = op->src[2]; - b = op->src[1]; - } - if (a->ne[3] != b->ne[3]) { - return false; - } - ggml_type a_type = a->type; - if (a_type == GGML_TYPE_IQ2_XXS || a_type == 
GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || - a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S || - a_type == GGML_TYPE_IQ1_M || a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) { - if (b->ne[1] == 1 && ggml_nrows(b) > 1) { + struct ggml_tensor * b = op->src[1]; + if (a->ne[3] != b->ne[3]) { return false; } } - return true; + switch (a->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q8_K: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + return true; + default: + return false; + } } break; case GGML_OP_GET_ROWS: { diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 472f4ace1..4ff06b871 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -227,6 +227,10 @@ typedef float2 dfloat2; #define RDNA2 #endif +#if defined(__gfx1010__) || defined(__gfx1012__) +#define RDNA1 +#endif + #ifndef __has_builtin #define __has_builtin(x) 0 #endif diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index 12d741f01..3db57034b 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -487,4 +487,3 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { GGML_ASSERT(false); } } - diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 1396e7a75..deaed066f 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -60,12 +60,16 @@ static constexpr __device__ int get_mmq_x_max_device() { } static constexpr int get_mmq_y_host(const int cc) { - return int8_mma_available(cc) || cc >= CC_VOLTA ? 128 : 64; + return cc >= CC_OFFSET_AMD ? (cc == CC_RDNA1 ? 64 : 128) : (cc >= CC_VOLTA ? 
128 : 64); } static constexpr __device__ int get_mmq_y_device() { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA1) + return 64; +#else return 128; +#endif // defined RDNA1 #else #if __CUDA_ARCH__ >= CC_VOLTA return 128; @@ -2259,9 +2263,9 @@ static __device__ void mul_mat_q_process_tile( template #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) +#if defined(RDNA3) || defined(RDNA2) || defined(RDNA1) __launch_bounds__(WARP_SIZE*nwarps, 2) -#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(RDNA3) || defined(RDNA2) || defined(RDNA1) #else #if __CUDA_ARCH__ >= CC_VOLTA __launch_bounds__(WARP_SIZE*nwarps, 1) diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index e2796fd60..c3503479b 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -6537,4 +6537,3 @@ template [[host_name("kernel_mul_mv_id_iq3_s_f32")]] kernel kernel_mul_mv_id_t template [[host_name("kernel_mul_mv_id_iq2_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; - diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 4d436a8f0..30983b872 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -130,4 +130,3 @@ void iq3xs_free_impl(int grid_size); #ifdef __cplusplus } #endif - diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index dfd4a7c2c..476d847ca 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -351,4 +351,10 @@ static __dpct_inline__ float warp_reduce_max(float x, return x; } +// Helper for vec loading aligned data +template +inline sycl::vec vec_aligned_load(const Tp* aligned_ptr) { + return *reinterpret_cast*>(aligned_ptr); +} + #endif // GGML_SYCL_COMMON_HPP diff --git a/ggml/src/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp index ce9de2b42..a15271b51 100644 --- a/ggml/src/ggml-sycl/convert.cpp +++ b/ggml/src/ggml-sycl/convert.cpp @@ -152,12 +152,15 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor scale_local_acc(sycl::range<1>(12), cgh); + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q4_K(vx, y, item_ct1); + dequantize_block_q4_K(vx, y, scale_local_acc.get_pointer(), item_ct1); }); + }); } } diff --git a/ggml/src/ggml-sycl/dequantize.hpp b/ggml/src/ggml-sycl/dequantize.hpp index b6080d83a..ed8ad098b 100644 --- a/ggml/src/ggml-sycl/dequantize.hpp +++ b/ggml/src/ggml-sycl/dequantize.hpp @@ -293,7 +293,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri #if QK_K == 256 static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { if (j < 4) { - d = q[j] & 63; m = q[j + 4] & 63; + d = q[j] & 63; + m = q[j + 4] & 63; } else { d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); @@ -303,7 +304,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8 template static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * 
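Restating the new host-side tile-height selection from `get_mmq_y_host()` above in Python, since the nested ternary is easy to misread; `CC_OFFSET_AMD`, `CC_RDNA1` and `CC_VOLTA` are compile-time constants from the CUDA/HIP headers and are passed in symbolically here:

```python
def get_mmq_y_host(cc: int, CC_OFFSET_AMD: int, CC_RDNA1: int, CC_VOLTA: int) -> int:
    # AMD path: RDNA1 now gets the smaller 64-row tile, other AMD archs keep 128
    if cc >= CC_OFFSET_AMD:
        return 64 if cc == CC_RDNA1 else 128
    # NVIDIA path: unchanged, Volta and newer use 128, older archs use 64
    return 128 if cc >= CC_VOLTA else 64
```

The device-side `get_mmq_y_device()` mirrors this through the new `RDNA1` define, and the `__launch_bounds__(WARP_SIZE*nwarps, 2)` annotation is extended to RDNA1 as well.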
__restrict__ yy, - const sycl::nd_item<3> &item_ct1) { + uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) { const block_q4_K * x = (const block_q4_K *) vx; const int i = item_ct1.get_group(2); @@ -318,19 +319,26 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri dst_t * y = yy + i*QK_K + 64*il + n*ir; - const float dall = x[i].dm[0]; - const float dmin = x[i].dm[1]; + const sycl::half2 dm = x[i].dm; + const float dall = dm[0]; + const float dmin = dm[1]; - const uint8_t * q = x[i].qs + 32*il + n*ir; + if (tid < 12) + scales_local[tid] = x[i].scales[tid]; + item_ct1.barrier(sycl::access::fence_space::local_space); uint8_t sc, m; - get_scale_min_k4(is + 0, x[i].scales, sc, m); - const float d1 = dall * sc; const float m1 = dmin * m; - get_scale_min_k4(is + 1, x[i].scales, sc, m); - const float d2 = dall * sc; const float m2 = dmin * m; + get_scale_min_k4(is + 0, scales_local, sc, m); + const float d1 = dall * sc; + const float m1 = dmin * m; + get_scale_min_k4(is + 1, scales_local, sc, m); + const float d2 = dall * sc; + const float m2 = dmin * m; + + sycl::vec q_vec = vec_aligned_load(x[i].qs + 32*il + n*ir); for (int l = 0; l < n; ++l) { - y[l + 0] = d1 * (q[l] & 0xF) - m1; - y[l +32] = d2 * (q[l] >> 4) - m2; + y[l + 0] = d1 * (q_vec[l] & 0xF) - m1; + y[l +32] = d2 * (q_vec[l] >> 4) - m2; } #else const int tid = item_ct1.get_local_id(2); diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index 1ff297218..5e98660dc 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -255,7 +255,7 @@ namespace dpct void set_pitch(size_t pitch) { _pitch = pitch; } size_t get_x() { return _x; } - void set_x(size_t x) { _x = x; }; + void set_x(size_t x) { _x = x; } size_t get_y() { return _y; } void set_y(size_t y) { _y = y; } @@ -1056,7 +1056,7 @@ namespace dpct #error "Only support Windows and Linux." #endif next_free = mapped_address_space; - }; + } public: using buffer_id_t = int; @@ -1077,7 +1077,7 @@ namespace dpct #else #error "Only support Windows and Linux." 
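The SYCL change above stages the 12 packed scale bytes of each `block_q4_K` in group-local memory (`scales_local`) and reads the quants through an aligned vector load, instead of having every work-item re-read them from global memory. For reference, the 6-bit scale/min unpacking done by `get_scale_min_k4()` looks like this when transcribed to Python (a direct sketch of the routine shown above):

```python
def get_scale_min_k4(j: int, q: bytes) -> tuple[int, int]:
    # q holds the 12 packed scale bytes of one q4_K block; j selects one of 8 sub-blocks
    if j < 4:
        d = q[j] & 63
        m = q[j + 4] & 63
    else:
        d = (q[j + 4] & 0x0F) | ((q[j - 4] >> 6) << 4)
        m = (q[j + 4] >> 4) | ((q[j] >> 6) << 4)
    return d, m  # 6-bit scale and 6-bit min for that sub-block
```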
#endif - }; + } mem_mgr(const mem_mgr &) = delete; mem_mgr &operator=(const mem_mgr &) = delete; diff --git a/ggml/src/ggml-vulkan-shaders.hpp b/ggml/src/ggml-vulkan-shaders.hpp index 01ff66f71..f0c4c6baf 100644 --- a/ggml/src/ggml-vulkan-shaders.hpp +++ b/ggml/src/ggml-vulkan-shaders.hpp @@ -144954,4 +144954,3 @@ unsigned char sum_rows_f32_data[] = { }; const uint64_t sum_rows_f32_len = 2112; - diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f5502afbe..bc91ac3a7 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -5312,7 +5312,7 @@ void ggml_mul_mat_set_prec( as -> [cols, rows, n_expert] ids -> [n_experts_used, n_tokens] (i32) b -> [cols, n_expert_used, n_tokens] - c -> [cols, n_expert_used, n_tokens] + c -> [rows, n_expert_used, n_tokens] in b, n_experts_used can be broadcasted to match the n_expert_used of ids diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e87c58266..419f10cee 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -164,6 +164,7 @@ class MODEL_ARCH(IntEnum): DEEPSEEK2 = auto() BITNET = auto() T5 = auto() + JAIS = auto() class MODEL_TENSOR(IntEnum): @@ -288,6 +289,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.DEEPSEEK2: "deepseek2", MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.T5: "t5", + MODEL_ARCH.JAIS: "jais", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -954,6 +956,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ENC_FFN_UP, MODEL_TENSOR.ENC_OUTPUT_NORM, ], + MODEL_ARCH.JAIS: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_UP, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 0bed43939..20e28423b 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -10,7 +10,7 @@ class TensorNameMap: # Token embeddings MODEL_TENSOR.TOKEN_EMBD: ( "gpt_neox.embed_in", # gptneox - "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx + "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais "transformer.word_embeddings", # falcon "word_embeddings", # bloom "model.embed_tokens", # llama-hf @@ -49,7 +49,7 @@ class TensorNameMap: # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 @@ -58,7 +58,7 @@ class TensorNameMap: # Output norm MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox - "transformer.ln_f", # gpt2 gpt-j falcon + "transformer.ln_f", # gpt2 gpt-j falcon jais "model.norm", # llama-hf baichuan internlm2 "norm", # llama-pth "transformer.norm_f", # mpt dbrx @@ -81,7 +81,7 @@ class TensorNameMap: # Attention norm MODEL_TENSOR.ATTN_NORM: ( "gpt_neox.layers.{bid}.input_layernorm", # gptneox - "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen + "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais "transformer.blocks.{bid}.norm_1", # mpt "transformer.h.{bid}.input_layernorm", # falcon7b "h.{bid}.input_layernorm", # bloom @@ -109,7 +109,7 @@ class TensorNameMap: # Attention query-key-value MODEL_TENSOR.ATTN_QKV: ( "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox - "transformer.h.{bid}.attn.c_attn", # gpt2 qwen + "transformer.h.{bid}.attn.c_attn", # gpt2 
qwen jais "transformer.blocks.{bid}.attn.Wqkv", # mpt "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", # dbrx "transformer.h.{bid}.self_attention.query_key_value", # falcon @@ -160,7 +160,7 @@ class TensorNameMap: # Attention output MODEL_TENSOR.ATTN_OUT: ( "gpt_neox.layers.{bid}.attention.dense", # gptneox - "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen + "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen jais "transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.h.{bid}.self_attention.dense", # falcon "h.{bid}.self_attention.dense", # bloom @@ -202,7 +202,7 @@ class TensorNameMap: # Feed-forward norm MODEL_TENSOR.FFN_NORM: ( "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox - "transformer.h.{bid}.ln_2", # gpt2 refact qwen + "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais "h.{bid}.post_attention_layernorm", # bloom "transformer.blocks.{bid}.norm_2", # mpt "model.layers.{bid}.post_attention_layernorm", # llama-hf @@ -239,7 +239,7 @@ class TensorNameMap: # Feed-forward up MODEL_TENSOR.FFN_UP: ( "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox - "transformer.h.{bid}.mlp.c_fc", # gpt2 + "transformer.h.{bid}.mlp.c_fc", # gpt2 jais "transformer.blocks.{bid}.ffn.up_proj", # mpt "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon "h.{bid}.mlp.dense_h_to_4h", # bloom @@ -285,6 +285,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.gate_proj", # llama-hf refact "layers.{bid}.feed_forward.w1", # llama-pth "transformer.h.{bid}.mlp.w2", # qwen + "transformer.h.{bid}.mlp.c_fc2", # jais "model.layers.layers.{bid}.mlp.gate_proj", # plamo "model.layers.{bid}.feed_forward.w1", # internlm2 "encoder.layers.{bid}.mlp.fc12", # nomic-bert @@ -308,7 +309,7 @@ class TensorNameMap: # Feed-forward down MODEL_TENSOR.FFN_DOWN: ( "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox - "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen + "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen jais "transformer.blocks.{bid}.ffn.down_proj", # mpt "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon "h.{bid}.mlp.dense_4h_to_h", # bloom diff --git a/include/llama.h b/include/llama.h index cafeafb85..c5b618292 100644 --- a/include/llama.h +++ b/include/llama.h @@ -89,6 +89,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_SMAUG = 14, LLAMA_VOCAB_PRE_TYPE_PORO = 15, LLAMA_VOCAB_PRE_TYPE_VIKING = 16, + LLAMA_VOCAB_PRE_TYPE_JAIS = 17, }; // note: these values should be synchronized with ggml_rope diff --git a/scripts/pod-llama.sh b/scripts/pod-llama.sh index 586d6ea18..0d6d4032d 100644 --- a/scripts/pod-llama.sh +++ b/scripts/pod-llama.sh @@ -210,4 +210,3 @@ fi # more benches #GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 #GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 - diff --git a/src/llama.cpp b/src/llama.cpp index eea532f6a..73f52435a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -228,6 +228,7 @@ enum llm_arch { LLM_ARCH_DEEPSEEK2, LLM_ARCH_BITNET, LLM_ARCH_T5, + LLM_ARCH_JAIS, LLM_ARCH_UNKNOWN, }; @@ -269,6 +270,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_DEEPSEEK2, "deepseek2" }, { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, + { LLM_ARCH_JAIS, "jais" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1236,6 +1238,21 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_JAIS, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { 
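Pulling the `jais` additions to `tensor_mapping.py` together, the GPT-2-style checkpoint names map onto GGUF tensors as below (per-layer entries use `{bid}` for the block index; this is a summary, not the actual `TensorNameMap` code):

```python
JAIS_TENSOR_MAP = {
    "transformer.wte":                 "token_embd",
    "lm_head":                         "output",
    "transformer.ln_f":                "output_norm",
    "transformer.h.{bid}.ln_1":        "blk.{bid}.attn_norm",
    "transformer.h.{bid}.attn.c_attn": "blk.{bid}.attn_qkv",
    "transformer.h.{bid}.attn.c_proj": "blk.{bid}.attn_output",
    "transformer.h.{bid}.ln_2":        "blk.{bid}.ffn_norm",
    "transformer.h.{bid}.mlp.c_fc":    "blk.{bid}.ffn_up",
    "transformer.h.{bid}.mlp.c_fc2":   "blk.{bid}.ffn_gate",   # SwiGLU gate, new mapping
    "transformer.h.{bid}.mlp.c_proj":  "blk.{bid}.ffn_down",
}
```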
LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2035,6 +2052,7 @@ enum e_model { MODEL_410M, MODEL_0_5B, MODEL_1B, + MODEL_1_3B, MODEL_1_4B, MODEL_2B, MODEL_2_8B, @@ -4276,6 +4294,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_410M: return "410M"; case MODEL_0_5B: return "0.5B"; case MODEL_1B: return "1B"; + case MODEL_1_3B: return "1.3B"; case MODEL_1_4B: return "1.4B"; case MODEL_2B: return "2B"; case MODEL_2_8B: return "2.8B"; @@ -4898,6 +4917,18 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_JAIS: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias); + + switch (hparams.n_layer) { + case 24: model.type = e_model::MODEL_1_3B; break; + case 40: model.type = e_model::MODEL_13B; break; + /* TODO: add variants */ + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -5129,6 +5160,9 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "viking") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING; + } else if ( + tokenizer_pre == "jais") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -6962,6 +6996,44 @@ static bool llm_load_tensors( layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1}); } } break; + case LLM_ARCH_JAIS: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // Output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); + + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + 
layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -12354,6 +12426,97 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_jais() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = ggml_mul_mat(ctx0, model.output, cur); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -12585,6 +12748,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_bitnet(); } break; + case LLM_ARCH_JAIS: + { + result = llm.build_jais(); + } break; default: 
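In `build_jais()` above, the fused QKV projection is split with three `ggml_view_2d` calls at byte offsets `0`, `n_embd` and `n_embd + n_embd_gqa`. A toy NumPy version of that slicing, just to make the layout explicit (sizes are made up; JAIS uses plain multi-head attention, so `n_embd_gqa == n_embd` here):

```python
import numpy as np

n_embd, n_embd_gqa, n_tokens = 8, 8, 4
qkv = np.random.rand(n_tokens, n_embd + 2 * n_embd_gqa)   # output of the wqkv matmul + bias

q = qkv[:, :n_embd]                                        # offset 0
k = qkv[:, n_embd:n_embd + n_embd_gqa]                     # offset n_embd
v = qkv[:, n_embd + n_embd_gqa:]                           # offset n_embd + n_embd_gqa

assert q.shape == (n_tokens, n_embd) and k.shape == v.shape == (n_tokens, n_embd_gqa)
# Qcur is then reshaped to (n_embd_head, n_head, n_tokens) before llm_build_kv()
```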
GGML_ASSERT(false); } @@ -13947,6 +14114,7 @@ struct llm_tokenizer_bpe { break; case LLAMA_VOCAB_PRE_TYPE_GPT2: case LLAMA_VOCAB_PRE_TYPE_OLMO: + case LLAMA_VOCAB_PRE_TYPE_JAIS: regex_exprs = { "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", }; @@ -17826,6 +17994,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_MAMBA: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_T5: + case LLM_ARCH_JAIS: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head values diff --git a/src/unicode-data.cpp b/src/unicode-data.cpp index 4a939898b..02bdf7823 100644 --- a/src/unicode-data.cpp +++ b/src/unicode-data.cpp @@ -7030,4 +7030,3 @@ const std::vector unicode_ranges_nfd = { // start, last, nfd {0x02FA1C, 0x02FA1C, 0x009F3B}, {0x02FA1D, 0x02FA1D, 0x02A600}, }; - diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f74c0db47..2bb71ac03 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2052,6 +2052,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, + GGML_TYPE_BF16, }; // unary ops diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp index f0895ffaa..8159e276a 100644 --- a/tests/test-rope.cpp +++ b/tests/test-rope.cpp @@ -218,4 +218,3 @@ int main(int /*argc*/, const char ** /*argv*/) { return 0; } -
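Finally, note that `LLAMA_VOCAB_PRE_TYPE_JAIS` reuses the GPT-2/OLMo split regex shown above rather than introducing a new one. A quick way to see what that pattern does, using the third-party `regex` module since Python's built-in `re` lacks `\p{...}` classes (the pattern string is copied from the diff):

```python
import regex  # pip install regex

PAT = regex.compile(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)")

print(PAT.findall("Hello world, it's 2024!"))
# ['Hello', ' world', ',', ' it', "'s", ' 2024', '!']
```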