Merge branch 'master' into concedo_experimental
# Conflicts: # Makefile # README.md
This commit is contained in:
commit
b814bb217d
3 changed files with 15 additions and 8 deletions
|
@ -72,7 +72,7 @@ static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * t
|
|||
|
||||
// check if a tensor is allocated by this buffer
|
||||
static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
|
||||
return tensor->buffer == alloc->buffer;
|
||||
return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
|
||||
}
|
||||
|
||||
static bool ggml_is_view(struct ggml_tensor * t) {
|
||||
|
|
17
ggml-cuda.cu
17
ggml-cuda.cu
|
@ -90,6 +90,13 @@
|
|||
#include <cuda_runtime.h>
|
||||
#include <cublas_v2.h>
|
||||
#include <cuda_fp16.h>
|
||||
// CUDA 10.2 does not have these macro definitions.
|
||||
#ifndef CUBLAS_TF32_TENSOR_OP_MATH
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
|
||||
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
||||
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
||||
#define cublasComputeType_t cudaDataType_t
|
||||
#endif
|
||||
#endif // defined(GGML_USE_HIPBLAS)
|
||||
|
||||
#include "ggml-cuda.h"
|
||||
|
@ -5267,17 +5274,17 @@ static __global__ void im2col_f32_f16(
|
|||
const int ky = (i - kd) / OW;
|
||||
const int ix = i % OW;
|
||||
|
||||
const int iiw = ix * s0 + kx * d0 - p0;
|
||||
const int iih = blockIdx.y * s1 + ky * d1 - p1;
|
||||
const int64_t iiw = ix * s0 + kx * d0 - p0;
|
||||
const int64_t iih = blockIdx.y * s1 + ky * d1 - p1;
|
||||
|
||||
const int offset_dst =
|
||||
const int64_t offset_dst =
|
||||
(blockIdx.y * OW + ix) * CHW +
|
||||
(blockIdx.z * (KW * KH) + ky * KW + kx);
|
||||
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||
dst[offset_dst] = __float2half(0.0f);
|
||||
} else {
|
||||
const int offset_src = blockIdx.z * offset_delta;
|
||||
const int64_t offset_src = blockIdx.z * offset_delta;
|
||||
dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
|
||||
}
|
||||
}
|
||||
|
@ -8835,7 +8842,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
|||
const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
|
||||
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
|
||||
const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
|
||||
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
|
||||
cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
|
||||
|
||||
for (int32_t row_id = 0; row_id < n_as; ++row_id) {
|
||||
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
|
||||
|
|
|
@ -3679,7 +3679,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
|
||||
const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
|
||||
const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
|
||||
const ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
|
||||
const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}};
|
||||
const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
|
||||
vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
|
||||
const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
|
||||
|
@ -6628,7 +6628,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
|
||||
const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
|
||||
const int8x16_t scales = vld1q_s8(scale);
|
||||
const ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
|
||||
const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}};
|
||||
|
||||
const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
|
||||
vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue