diff --git a/ggml-cuda/fattn-tile-f32.cu b/ggml-cuda/fattn-tile-f32.cu
index 54db765e2..b8b2f69e1 100644
--- a/ggml-cuda/fattn-tile-f32.cu
+++ b/ggml-cuda/fattn-tile-f32.cu
@@ -283,8 +283,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
 }
 
 void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * KQV = dst;
-    const ggml_tensor * Q   = dst->src[0];
+    const ggml_tensor * Q = dst->src[0];
 
     if (Q->ne[1] <= 16) {
         constexpr int cols_per_block = 16;
diff --git a/llama.cpp b/llama.cpp
index 0a587d4cb..62964cfb7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3793,12 +3793,14 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_17M:   return "17M";
         case MODEL_22M:   return "22M";
         case MODEL_33M:   return "33M";
         case MODEL_70M:   return "70M";
         case MODEL_109M:  return "109M";
         case MODEL_137M:  return "137M";
         case MODEL_160M:  return "160M";
+        case MODEL_335M:  return "335M";
         case MODEL_410M:  return "410M";
         case MODEL_0_5B:  return "0.5B";
         case MODEL_1B:    return "1B";
@@ -3806,6 +3808,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_2B:    return "2B";
         case MODEL_2_8B:  return "2.8B";
         case MODEL_3B:    return "3B";
+        case MODEL_4B:    return "4B";
         case MODEL_6_9B:  return "6.9B";
         case MODEL_7B:    return "7B";
         case MODEL_8B:    return "8B";