diff --git a/examples/server/public/index.html.gz b/examples/server/public/index.html.gz
index 582ccc0d3..e876776e8 100644
Binary files a/examples/server/public/index.html.gz and b/examples/server/public/index.html.gz differ
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index fefdce55b..5f97df5fd 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -5,10 +5,6 @@
 #include "llama.h"
 #include "common/base64.hpp"
 
-#ifndef NDEBUG
-// crash the server in debug mode, otherwise send an http 500 error
-#define CPPHTTPLIB_NO_EXCEPTIONS 1
-#endif
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
 #include "httplib.h"
diff --git a/examples/server/webui/index.html b/examples/server/webui/index.html
index d3893ea4e..1ad85e739 100644
--- a/examples/server/webui/index.html
+++ b/examples/server/webui/index.html
@@ -154,8 +154,6 @@
                 placeholder="Type a message (Shift+Enter to add a new line)"
                 v-model="inputMsg"
                 @keydown.enter.exact.prevent="sendMessage"
-                @keydown.enter.shift.exact.prevent="inputMsg += '\n'"
-                :disabled="isGenerating"
                 id="msg-input"
                 dir="auto"
               >
diff --git a/examples/server/webui/src/main.js b/examples/server/webui/src/main.js
index 90f4ca368..50197a355 100644
--- a/examples/server/webui/src/main.js
+++ b/examples/server/webui/src/main.js
@@ -468,7 +468,10 @@ const mainApp = createApp({
       URL.revokeObjectURL(url);
     },
     async sendMessage() {
-      if (!this.inputMsg) return;
+      // prevent sending empty message
+      // also allow typing the message while generating, but does not allow sending it (to match UX/UI behavior of other chat apps)
+      if (!this.inputMsg || this.isGenerating) return;
+
       const currConvId = this.viewingConvId;
 
       StorageUtils.appendMsg(currConvId, {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 232163c1c..174916bc9 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -176,6 +176,14 @@ static constexpr bool new_mma_available(const int cc) {
     return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
 }
 
+static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+    return __AMDGCN_WAVEFRONT_SIZE;
+#else
+    return 32;
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+}
+
 [[noreturn]]
 static __device__ void no_device_code(
     const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index cfd7c0f44..d40ee2da4 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -516,6 +516,12 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
         nullptr;
 }
 
+// The HIP compiler for some reason complains that it can't unroll a loop because of the jt*ncols + j >= ne01 conditional.
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpass-failed"
+#endif // __clang__
+
 template // D == head size
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
@@ -614,6 +620,10 @@ static __global__ void flash_attn_stream_k_fixup(
     }
 }
 
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif // __clang__
+
 template // D == head size
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
index 1054ff95d..45702ad65 100644
--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
@@ -561,7 +561,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
             ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
             break;
         // case 256:
-        //     ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
+        //     ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
         //     break;
         default:
             GGML_ABORT("fatal error");
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index b1e66d470..b0cf152f5 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -235,7 +235,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
         return;
     }
 
-    if (!new_mma_available(cc)) {
+    if (!fp16_mma_available(cc)) {
         if (prec == GGML_PREC_DEFAULT) {
             if (Q->ne[1] <= 8) {
                 ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
@@ -265,6 +265,7 @@
     // The MMA implementation needs Turing or newer, use the old WMMA code for Volta:
     if (cc == GGML_CUDA_CC_VOLTA) {
         ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
+        return;
     }
 
     ggml_cuda_flash_attn_ext_mma_f16(ctx, dst);
diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu
index ac45f2d17..5a9ddd958 100644
--- a/ggml/src/ggml-cuda/mmv.cu
+++ b/ggml/src/ggml-cuda/mmv.cu
@@ -5,9 +5,10 @@ template
 static __global__ void mul_mat_vec(
         const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
         const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) {
-    const int64_t row     = blockIdx.x;
-    const int64_t channel = blockIdx.z;
-    const int     tid     = threadIdx.x;
+    const     int64_t row       = blockIdx.x;
+    const     int64_t channel   = blockIdx.z;
+    const     int     tid       = threadIdx.x;
+    constexpr int     warp_size = ggml_cuda_get_physical_warp_size();
 
     x += (channel/channel_ratio)*stride_channel_x + row*stride_row;
     y += channel               *stride_channel_y;
@@ -18,8 +19,8 @@
     extern __shared__ char data_mmv[];
     float * buf_iw = (float *) data_mmv;
 
-    if (block_size > WARP_SIZE) {
-        if (tid < WARP_SIZE) {
+    if (block_size > warp_size) {
+        if (tid < warp_size) {
             buf_iw[tid] = 0.0f;
         }
         __syncthreads();
@@ -67,16 +68,16 @@
         static_assert(std::is_same<T, void>::value, "unsupported type");
     }
 
-    sumf = warp_reduce_sum(sumf);
+    sumf = warp_reduce_sum<warp_size>(sumf);
 
-    if (block_size > WARP_SIZE) {
-        buf_iw[tid/WARP_SIZE] = sumf;
+    if (block_size > warp_size) {
+        buf_iw[tid/warp_size] = sumf;
         __syncthreads();
-        if (tid >= WARP_SIZE) {
+        if (tid >= warp_size) {
             return;
         }
         sumf = buf_iw[tid];
-        sumf = warp_reduce_sum(sumf);
+        sumf = warp_reduce_sum<warp_size>(sumf);
     }
 
     if (tid != 0) {
@@ -96,10 +97,19 @@
     GGML_ASSERT(stride_row % 2 == 0);
    GGML_ASSERT(nchannels_y % nchannels_x == 0);
     const int64_t channel_ratio = nchannels_y / nchannels_x;
+    int device;
+    int warp_size;
 
-    int64_t block_size_best = WARP_SIZE;
-    int64_t niter_best = (ncols + 2*WARP_SIZE - 1) / (2*WARP_SIZE);
-    for (int64_t block_size = 2*WARP_SIZE; block_size <= 256; block_size += WARP_SIZE) {
+    CUDA_CHECK(cudaGetDevice(&device));
+    warp_size = ggml_cuda_info().devices[device].warp_size;
+
+    int64_t block_size_best = warp_size;
+    int64_t niter_best = (ncols + 2*warp_size - 1) / (2*warp_size);
+    int64_t max_block_size = 256;
+    if(ggml_cuda_info().devices[device].cc > GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_info().devices[device].cc < GGML_CUDA_CC_RDNA1) {
+        max_block_size = 128;
+    }
+    for (int64_t block_size = 2*warp_size; block_size <= max_block_size; block_size += warp_size) {
         const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
         if (niter < niter_best) {
             niter_best = niter;
@@ -107,7 +117,7 @@
         }
     }
 
-    const int smem = WARP_SIZE*sizeof(float);
+    const int smem = warp_size*sizeof(float);
     const dim3 block_nums(nrows, 1, nchannels_y);
     const dim3 block_dims(block_size_best, 1, 1);
     switch (block_size_best) {
diff --git a/ggml/src/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu
index da377200e..aac6e0999 100644
--- a/ggml/src/ggml-cuda/softmax.cu
+++ b/ggml/src/ggml-cuda/softmax.cu
@@ -18,7 +18,7 @@ __device__ float __forceinline__ t2f32(half val) {
 #ifdef __clang__
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wpass-failed"
-#endif
+#endif // __clang__
 template
 static __global__ void soft_max_f32(
         const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
@@ -126,7 +126,7 @@ static __global__ void soft_max_f32(
 }
 #ifdef __clang__
 #pragma clang diagnostic pop
-#endif
+#endif // __clang__
 
 static __global__ void soft_max_back_f32(
         const float * grad, const float * dstf, float * dst, const int ncols, const float scale) {
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 129478ed7..81964611c 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#define HIP_ENABLE_WARP_SYNC_BUILTINS 1
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
@@ -8,6 +9,7 @@
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
 #endif // __HIP_PLATFORM_AMD__
+
 #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index ddb9d817e..34f1cbf69 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-32f0b85987396945afea2291d5f4c5862434292b
+498e0ecd2c4f9379439fd413805af10e8e9ff349
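For reference, below is a minimal standalone sketch of the warp-size pattern the CUDA/HIP hunks above rely on: a compile-time physical warp size (as in ggml_cuda_get_physical_warp_size) selecting the width of a shuffle-based warp reduction (as in the warp_size-templated reduction used by mmv.cu). It is not part of the patch; the names physical_warp_size, warp_reduce_sum_width, and sum_one_warp are illustrative only.

// Illustrative sketch -- NOT part of the patch above.
// Compile with nvcc; the AMD branch is shown only to mirror the patch's #ifdef structure.
#include <cstdio>
#include <cuda_runtime.h>

static constexpr __device__ int physical_warp_size() {
#if defined(__HIP_PLATFORM_AMD__)
    return __AMDGCN_WAVEFRONT_SIZE; // 32 or 64 depending on the AMD architecture
#else
    return 32;                      // NVIDIA warps are always 32 threads wide
#endif
}

template <int width>
static __device__ float warp_reduce_sum_width(float x) {
    // Butterfly reduction: after log2(width) shuffle steps every lane holds the sum.
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, width);
    }
    return x;
}

__global__ void sum_one_warp(const float * in, float * out) {
    constexpr int warp_size = physical_warp_size();
    const float v = warp_reduce_sum_width<warp_size>(in[threadIdx.x]);
    if (threadIdx.x == 0) {
        *out = v; // lane 0 writes the single-warp sum
    }
}

int main() {
    constexpr int n = 32; // one full warp on NVIDIA hardware (use 64 on a 64-wide wavefront)
    float h_in[n];
    for (int i = 0; i < n; ++i) {
        h_in[i] = 1.0f;
    }

    float *d_in, *d_out, h_out = 0.0f;
    cudaMalloc(&d_in,  n*sizeof(float));
    cudaMalloc(&d_out,   sizeof(float));
    cudaMemcpy(d_in, h_in, n*sizeof(float), cudaMemcpyHostToDevice);

    sum_one_warp<<<1, n>>>(d_in, d_out);
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("warp sum = %.1f\n", h_out); // expected: 32.0

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}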