Merge branch 'master' into r1-toolcall
This commit is contained in:
commit
bc6d910f6d
12 changed files with 54 additions and 26 deletions
Binary file not shown.
|
@ -5,10 +5,6 @@
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "common/base64.hpp"
|
#include "common/base64.hpp"
|
||||||
|
|
||||||
#ifndef NDEBUG
|
|
||||||
// crash the server in debug mode, otherwise send an http 500 error
|
|
||||||
#define CPPHTTPLIB_NO_EXCEPTIONS 1
|
|
||||||
#endif
|
|
||||||
// increase max payload length to allow use of larger context size
|
// increase max payload length to allow use of larger context size
|
||||||
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
|
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
|
||||||
#include "httplib.h"
|
#include "httplib.h"
|
||||||
|
|
|
@ -154,8 +154,6 @@
|
||||||
placeholder="Type a message (Shift+Enter to add a new line)"
|
placeholder="Type a message (Shift+Enter to add a new line)"
|
||||||
v-model="inputMsg"
|
v-model="inputMsg"
|
||||||
@keydown.enter.exact.prevent="sendMessage"
|
@keydown.enter.exact.prevent="sendMessage"
|
||||||
@keydown.enter.shift.exact.prevent="inputMsg += '\n'"
|
|
||||||
:disabled="isGenerating"
|
|
||||||
id="msg-input"
|
id="msg-input"
|
||||||
dir="auto"
|
dir="auto"
|
||||||
></textarea>
|
></textarea>
|
||||||
|
|
|
@ -468,7 +468,10 @@ const mainApp = createApp({
|
||||||
URL.revokeObjectURL(url);
|
URL.revokeObjectURL(url);
|
||||||
},
|
},
|
||||||
async sendMessage() {
|
async sendMessage() {
|
||||||
if (!this.inputMsg) return;
|
// prevent sending empty message
|
||||||
|
// also allow typing the message while generating, but does not allow sending it (to match UX/UI behavior of other chat apps)
|
||||||
|
if (!this.inputMsg || this.isGenerating) return;
|
||||||
|
|
||||||
const currConvId = this.viewingConvId;
|
const currConvId = this.viewingConvId;
|
||||||
|
|
||||||
StorageUtils.appendMsg(currConvId, {
|
StorageUtils.appendMsg(currConvId, {
|
||||||
|
|
|
@ -176,6 +176,14 @@ static constexpr bool new_mma_available(const int cc) {
|
||||||
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
|
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
|
||||||
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
|
return __AMDGCN_WAVEFRONT_SIZE;
|
||||||
|
#else
|
||||||
|
return 32;
|
||||||
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
|
}
|
||||||
|
|
||||||
[[noreturn]]
|
[[noreturn]]
|
||||||
static __device__ void no_device_code(
|
static __device__ void no_device_code(
|
||||||
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
|
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
|
||||||
|
|
|
@ -516,6 +516,12 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
|
||||||
nullptr;
|
nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// The HIP compiler for some reason complains that it can't unroll a loop because of the jt*ncols + j >= ne01 conditional.
|
||||||
|
#ifdef __clang__
|
||||||
|
#pragma clang diagnostic push
|
||||||
|
#pragma clang diagnostic ignored "-Wpass-failed"
|
||||||
|
#endif // __clang__
|
||||||
|
|
||||||
template<int D, int ncols, int KQ_stride> // D == head size
|
template<int D, int ncols, int KQ_stride> // D == head size
|
||||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(D, 1)
|
__launch_bounds__(D, 1)
|
||||||
|
@ -614,6 +620,10 @@ static __global__ void flash_attn_stream_k_fixup(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef __clang__
|
||||||
|
#pragma clang diagnostic pop
|
||||||
|
#endif // __clang__
|
||||||
|
|
||||||
template<int D, int parallel_blocks> // D == head size
|
template<int D, int parallel_blocks> // D == head size
|
||||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(D, 1)
|
__launch_bounds__(D, 1)
|
||||||
|
|
|
@ -561,7 +561,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
|
||||||
ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
|
ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
|
||||||
break;
|
break;
|
||||||
// case 256:
|
// case 256:
|
||||||
// ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
|
// ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
|
||||||
// break;
|
// break;
|
||||||
default:
|
default:
|
||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
|
|
|
@ -235,7 +235,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!new_mma_available(cc)) {
|
if (!fp16_mma_available(cc)) {
|
||||||
if (prec == GGML_PREC_DEFAULT) {
|
if (prec == GGML_PREC_DEFAULT) {
|
||||||
if (Q->ne[1] <= 8) {
|
if (Q->ne[1] <= 8) {
|
||||||
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||||
|
@ -265,6 +265,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||||
// The MMA implementation needs Turing or newer, use the old WMMA code for Volta:
|
// The MMA implementation needs Turing or newer, use the old WMMA code for Volta:
|
||||||
if (cc == GGML_CUDA_CC_VOLTA) {
|
if (cc == GGML_CUDA_CC_VOLTA) {
|
||||||
ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
|
ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_cuda_flash_attn_ext_mma_f16(ctx, dst);
|
ggml_cuda_flash_attn_ext_mma_f16(ctx, dst);
|
||||||
|
|
|
@ -8,6 +8,7 @@ static __global__ void mul_mat_vec(
|
||||||
const int64_t row = blockIdx.x;
|
const int64_t row = blockIdx.x;
|
||||||
const int64_t channel = blockIdx.z;
|
const int64_t channel = blockIdx.z;
|
||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
|
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||||
|
|
||||||
x += (channel/channel_ratio)*stride_channel_x + row*stride_row;
|
x += (channel/channel_ratio)*stride_channel_x + row*stride_row;
|
||||||
y += channel *stride_channel_y;
|
y += channel *stride_channel_y;
|
||||||
|
@ -18,8 +19,8 @@ static __global__ void mul_mat_vec(
|
||||||
extern __shared__ char data_mmv[];
|
extern __shared__ char data_mmv[];
|
||||||
float * buf_iw = (float *) data_mmv;
|
float * buf_iw = (float *) data_mmv;
|
||||||
|
|
||||||
if (block_size > WARP_SIZE) {
|
if (block_size > warp_size) {
|
||||||
if (tid < WARP_SIZE) {
|
if (tid < warp_size) {
|
||||||
buf_iw[tid] = 0.0f;
|
buf_iw[tid] = 0.0f;
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
@ -67,16 +68,16 @@ static __global__ void mul_mat_vec(
|
||||||
static_assert(std::is_same<T, void>::value, "unsupported type");
|
static_assert(std::is_same<T, void>::value, "unsupported type");
|
||||||
}
|
}
|
||||||
|
|
||||||
sumf = warp_reduce_sum(sumf);
|
sumf = warp_reduce_sum<warp_size>(sumf);
|
||||||
|
|
||||||
if (block_size > WARP_SIZE) {
|
if (block_size > warp_size) {
|
||||||
buf_iw[tid/WARP_SIZE] = sumf;
|
buf_iw[tid/warp_size] = sumf;
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
if (tid >= WARP_SIZE) {
|
if (tid >= warp_size) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
sumf = buf_iw[tid];
|
sumf = buf_iw[tid];
|
||||||
sumf = warp_reduce_sum(sumf);
|
sumf = warp_reduce_sum<warp_size>(sumf);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tid != 0) {
|
if (tid != 0) {
|
||||||
|
@ -96,10 +97,19 @@ static void launch_mul_mat_vec_cuda(
|
||||||
GGML_ASSERT(stride_row % 2 == 0);
|
GGML_ASSERT(stride_row % 2 == 0);
|
||||||
GGML_ASSERT(nchannels_y % nchannels_x == 0);
|
GGML_ASSERT(nchannels_y % nchannels_x == 0);
|
||||||
const int64_t channel_ratio = nchannels_y / nchannels_x;
|
const int64_t channel_ratio = nchannels_y / nchannels_x;
|
||||||
|
int device;
|
||||||
|
int warp_size;
|
||||||
|
|
||||||
int64_t block_size_best = WARP_SIZE;
|
CUDA_CHECK(cudaGetDevice(&device));
|
||||||
int64_t niter_best = (ncols + 2*WARP_SIZE - 1) / (2*WARP_SIZE);
|
warp_size = ggml_cuda_info().devices[device].warp_size;
|
||||||
for (int64_t block_size = 2*WARP_SIZE; block_size <= 256; block_size += WARP_SIZE) {
|
|
||||||
|
int64_t block_size_best = warp_size;
|
||||||
|
int64_t niter_best = (ncols + 2*warp_size - 1) / (2*warp_size);
|
||||||
|
int64_t max_block_size = 256;
|
||||||
|
if(ggml_cuda_info().devices[device].cc > GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_info().devices[device].cc < GGML_CUDA_CC_RDNA1) {
|
||||||
|
max_block_size = 128;
|
||||||
|
}
|
||||||
|
for (int64_t block_size = 2*warp_size; block_size <= max_block_size; block_size += warp_size) {
|
||||||
const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
|
const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
|
||||||
if (niter < niter_best) {
|
if (niter < niter_best) {
|
||||||
niter_best = niter;
|
niter_best = niter;
|
||||||
|
@ -107,7 +117,7 @@ static void launch_mul_mat_vec_cuda(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const int smem = WARP_SIZE*sizeof(float);
|
const int smem = warp_size*sizeof(float);
|
||||||
const dim3 block_nums(nrows, 1, nchannels_y);
|
const dim3 block_nums(nrows, 1, nchannels_y);
|
||||||
const dim3 block_dims(block_size_best, 1, 1);
|
const dim3 block_dims(block_size_best, 1, 1);
|
||||||
switch (block_size_best) {
|
switch (block_size_best) {
|
||||||
|
|
|
@ -18,7 +18,7 @@ __device__ float __forceinline__ t2f32<half>(half val) {
|
||||||
#ifdef __clang__
|
#ifdef __clang__
|
||||||
#pragma clang diagnostic push
|
#pragma clang diagnostic push
|
||||||
#pragma clang diagnostic ignored "-Wpass-failed"
|
#pragma clang diagnostic ignored "-Wpass-failed"
|
||||||
#endif
|
#endif // __clang__
|
||||||
template <bool use_shared, int ncols_template, int block_size_template, typename T>
|
template <bool use_shared, int ncols_template, int block_size_template, typename T>
|
||||||
static __global__ void soft_max_f32(
|
static __global__ void soft_max_f32(
|
||||||
const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
|
const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
|
||||||
|
@ -126,7 +126,7 @@ static __global__ void soft_max_f32(
|
||||||
}
|
}
|
||||||
#ifdef __clang__
|
#ifdef __clang__
|
||||||
#pragma clang diagnostic pop
|
#pragma clang diagnostic pop
|
||||||
#endif
|
#endif // __clang__
|
||||||
|
|
||||||
static __global__ void soft_max_back_f32(
|
static __global__ void soft_max_back_f32(
|
||||||
const float * grad, const float * dstf, float * dst, const int ncols, const float scale) {
|
const float * grad, const float * dstf, float * dst, const int ncols, const float scale) {
|
||||||
|
|
2
ggml/src/ggml-cuda/vendors/hip.h
vendored
2
ggml/src/ggml-cuda/vendors/hip.h
vendored
|
@ -1,5 +1,6 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#define HIP_ENABLE_WARP_SYNC_BUILTINS 1
|
||||||
#include <hip/hip_runtime.h>
|
#include <hip/hip_runtime.h>
|
||||||
#include <hipblas/hipblas.h>
|
#include <hipblas/hipblas.h>
|
||||||
#include <hip/hip_fp16.h>
|
#include <hip/hip_fp16.h>
|
||||||
|
@ -8,6 +9,7 @@
|
||||||
// for rocblas_initialize()
|
// for rocblas_initialize()
|
||||||
#include "rocblas/rocblas.h"
|
#include "rocblas/rocblas.h"
|
||||||
#endif // __HIP_PLATFORM_AMD__
|
#endif // __HIP_PLATFORM_AMD__
|
||||||
|
|
||||||
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
|
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
|
||||||
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
|
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
|
||||||
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
32f0b85987396945afea2291d5f4c5862434292b
|
498e0ecd2c4f9379439fd413805af10e8e9ff349
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue