diff --git a/examples/server/public/index.html.gz b/examples/server/public/index.html.gz
index 582ccc0d3..e876776e8 100644
Binary files a/examples/server/public/index.html.gz and b/examples/server/public/index.html.gz differ
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index fefdce55b..5f97df5fd 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -5,10 +5,6 @@
#include "llama.h"
#include "common/base64.hpp"
-#ifndef NDEBUG
-// crash the server in debug mode, otherwise send an http 500 error
-#define CPPHTTPLIB_NO_EXCEPTIONS 1
-#endif
// increase max payload length to allow use of larger context size
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
#include "httplib.h"
diff --git a/examples/server/webui/index.html b/examples/server/webui/index.html
index d3893ea4e..1ad85e739 100644
--- a/examples/server/webui/index.html
+++ b/examples/server/webui/index.html
@@ -154,8 +154,6 @@
placeholder="Type a message (Shift+Enter to add a new line)"
v-model="inputMsg"
@keydown.enter.exact.prevent="sendMessage"
- @keydown.enter.shift.exact.prevent="inputMsg += '\n'"
- :disabled="isGenerating"
id="msg-input"
dir="auto"
>
diff --git a/examples/server/webui/src/main.js b/examples/server/webui/src/main.js
index 90f4ca368..50197a355 100644
--- a/examples/server/webui/src/main.js
+++ b/examples/server/webui/src/main.js
@@ -468,7 +468,10 @@ const mainApp = createApp({
URL.revokeObjectURL(url);
},
async sendMessage() {
- if (!this.inputMsg) return;
+ // prevent sending empty message
+        // also allow typing the message while generating, but do not allow sending it (to match UX/UI behavior of other chat apps)
+ if (!this.inputMsg || this.isGenerating) return;
+
const currConvId = this.viewingConvId;
StorageUtils.appendMsg(currConvId, {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 232163c1c..174916bc9 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -176,6 +176,14 @@ static constexpr bool new_mma_available(const int cc) {
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
}
+static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+ return __AMDGCN_WAVEFRONT_SIZE;
+#else
+ return 32;
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+}
+
[[noreturn]]
static __device__ void no_device_code(
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index cfd7c0f44..d40ee2da4 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -516,6 +516,12 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
nullptr;
}
+// The HIP compiler for some reason complains that it can't unroll a loop because of the jt*ncols + j >= ne01 conditional.
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpass-failed"
+#endif // __clang__
+
template<int D, int ncols> // D == head size
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
@@ -614,6 +620,10 @@ static __global__ void flash_attn_stream_k_fixup(
}
}
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif // __clang__
+
template<int D, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
index 1054ff95d..45702ad65 100644
--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
@@ -561,7 +561,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
break;
// case 256:
- // ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
+ // ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
// break;
default:
GGML_ABORT("fatal error");
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index b1e66d470..b0cf152f5 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -235,7 +235,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
return;
}
- if (!new_mma_available(cc)) {
+ if (!fp16_mma_available(cc)) {
if (prec == GGML_PREC_DEFAULT) {
if (Q->ne[1] <= 8) {
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
@@ -265,6 +265,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
// The MMA implementation needs Turing or newer, use the old WMMA code for Volta:
if (cc == GGML_CUDA_CC_VOLTA) {
ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
+ return;
}
ggml_cuda_flash_attn_ext_mma_f16(ctx, dst);
diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu
index ac45f2d17..5a9ddd958 100644
--- a/ggml/src/ggml-cuda/mmv.cu
+++ b/ggml/src/ggml-cuda/mmv.cu
@@ -5,9 +5,10 @@ template <typename T, typename type_acc, int block_size>
static __global__ void mul_mat_vec(
const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) {
- const int64_t row = blockIdx.x;
- const int64_t channel = blockIdx.z;
- const int tid = threadIdx.x;
+ const int64_t row = blockIdx.x;
+ const int64_t channel = blockIdx.z;
+ const int tid = threadIdx.x;
+ constexpr int warp_size = ggml_cuda_get_physical_warp_size();
x += (channel/channel_ratio)*stride_channel_x + row*stride_row;
y += channel *stride_channel_y;
@@ -18,8 +19,8 @@ static __global__ void mul_mat_vec(
extern __shared__ char data_mmv[];
float * buf_iw = (float *) data_mmv;
- if (block_size > WARP_SIZE) {
- if (tid < WARP_SIZE) {
+ if (block_size > warp_size) {
+ if (tid < warp_size) {
buf_iw[tid] = 0.0f;
}
__syncthreads();
@@ -67,16 +68,16 @@ static __global__ void mul_mat_vec(
        static_assert(std::is_same<T, void>::value, "unsupported type");
}
-    sumf = warp_reduce_sum(sumf);
+    sumf = warp_reduce_sum<warp_size>(sumf);
- if (block_size > WARP_SIZE) {
- buf_iw[tid/WARP_SIZE] = sumf;
+ if (block_size > warp_size) {
+ buf_iw[tid/warp_size] = sumf;
__syncthreads();
- if (tid >= WARP_SIZE) {
+ if (tid >= warp_size) {
return;
}
sumf = buf_iw[tid];
-        sumf = warp_reduce_sum(sumf);
+        sumf = warp_reduce_sum<warp_size>(sumf);
}
if (tid != 0) {
@@ -96,10 +97,19 @@ static void launch_mul_mat_vec_cuda(
GGML_ASSERT(stride_row % 2 == 0);
GGML_ASSERT(nchannels_y % nchannels_x == 0);
const int64_t channel_ratio = nchannels_y / nchannels_x;
+ int device;
+ int warp_size;
- int64_t block_size_best = WARP_SIZE;
- int64_t niter_best = (ncols + 2*WARP_SIZE - 1) / (2*WARP_SIZE);
- for (int64_t block_size = 2*WARP_SIZE; block_size <= 256; block_size += WARP_SIZE) {
+ CUDA_CHECK(cudaGetDevice(&device));
+ warp_size = ggml_cuda_info().devices[device].warp_size;
+
+ int64_t block_size_best = warp_size;
+ int64_t niter_best = (ncols + 2*warp_size - 1) / (2*warp_size);
+ int64_t max_block_size = 256;
+    if (ggml_cuda_info().devices[device].cc > GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_info().devices[device].cc < GGML_CUDA_CC_RDNA1) {
+ max_block_size = 128;
+ }
+ for (int64_t block_size = 2*warp_size; block_size <= max_block_size; block_size += warp_size) {
const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
if (niter < niter_best) {
niter_best = niter;
@@ -107,7 +117,7 @@ static void launch_mul_mat_vec_cuda(
}
}
- const int smem = WARP_SIZE*sizeof(float);
+ const int smem = warp_size*sizeof(float);
const dim3 block_nums(nrows, 1, nchannels_y);
const dim3 block_dims(block_size_best, 1, 1);
switch (block_size_best) {
diff --git a/ggml/src/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu
index da377200e..aac6e0999 100644
--- a/ggml/src/ggml-cuda/softmax.cu
+++ b/ggml/src/ggml-cuda/softmax.cu
@@ -18,7 +18,7 @@ __device__ float __forceinline__ t2f32(half val) {
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
-#endif
+#endif // __clang__
template
static __global__ void soft_max_f32(
const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
@@ -126,7 +126,7 @@ static __global__ void soft_max_f32(
}
#ifdef __clang__
#pragma clang diagnostic pop
-#endif
+#endif // __clang__
static __global__ void soft_max_back_f32(
const float * grad, const float * dstf, float * dst, const int ncols, const float scale) {
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 129478ed7..81964611c 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -1,5 +1,6 @@
#pragma once
+#define HIP_ENABLE_WARP_SYNC_BUILTINS 1
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
@@ -8,6 +9,7 @@
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__
+
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index ddb9d817e..34f1cbf69 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-32f0b85987396945afea2291d5f4c5862434292b
+498e0ecd2c4f9379439fd413805af10e8e9ff349