diff --git a/ggml-alloc.c b/ggml-alloc.c index a97436b17..a27dd54b0 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -72,7 +72,7 @@ static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * t // check if a tensor is allocated by this buffer static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) { - return tensor->buffer == alloc->buffer; + return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer); } static bool ggml_is_view(struct ggml_tensor * t) { diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0dc07eb13..c26c52d72 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -90,6 +90,13 @@ #include #include #include +// CUDA 10.2 does not have these macro definitions. +#ifndef CUBLAS_TF32_TENSOR_OP_MATH +#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH +#define CUBLAS_COMPUTE_16F CUDA_R_16F +#define CUBLAS_COMPUTE_32F CUDA_R_32F +#define cublasComputeType_t cudaDataType_t +#endif #endif // defined(GGML_USE_HIPBLAS) #include "ggml-cuda.h" @@ -5267,17 +5274,17 @@ static __global__ void im2col_f32_f16( const int ky = (i - kd) / OW; const int ix = i % OW; - const int iiw = ix * s0 + kx * d0 - p0; - const int iih = blockIdx.y * s1 + ky * d1 - p1; + const int64_t iiw = ix * s0 + kx * d0 - p0; + const int64_t iih = blockIdx.y * s1 + ky * d1 - p1; - const int offset_dst = + const int64_t offset_dst = (blockIdx.y * OW + ix) * CHW + (blockIdx.z * (KW * KH) + ky * KW + kx); if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { dst[offset_dst] = __float2half(0.0f); } else { - const int offset_src = blockIdx.z * offset_delta; + const int64_t offset_src = blockIdx.z * offset_delta; dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]); } } @@ -8835,7 +8842,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice; const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ? - cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice; + cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice; for (int32_t row_id = 0; row_id < n_as; ++row_id) { const struct ggml_tensor * src0_row = dst->src[row_id + 2]; diff --git a/ggml-quants.c b/ggml-quants.c index ca3a4980a..4e64a6a9f 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -3679,7 +3679,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4); const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); - const ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}; + const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}}; const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])), vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0]))); const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])), @@ -6628,7 +6628,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); const int8x16_t scales = vld1q_s8(scale); - const ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}; + const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}}; const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])), vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),