diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0511923c8..ca088d952 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2110,7 +2110,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream)); { - dim3 block_dims(std::min((unsigned int)ne10, 512u)); + dim3 block_dims(std::min((unsigned int)ne10, 768u)); dim3 grid_dims(ids->ne[1], n_ids); k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>( src1_original, src1_contiguous.get(), @@ -2139,7 +2139,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); { - dim3 block_dims(std::min((unsigned int)ne0, 512u)); + dim3 block_dims(std::min((unsigned int)ne0, 768u)); dim3 grid_dims(num_src1_rows); k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>( dst_original, dst_contiguous.get(), diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index effeb39f7..2d5271ea6 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -101,7 +101,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) { } else if (t->type == GGML_TYPE_I8) { tv.push_back((float)*(int8_t *) &buf[i]); } else if (quantized) { - tt.to_float(&buf[i], vq.data(), bs); + tt.to_float(&buf[i], vq.data(), bs); tv.insert(tv.end(), vq.begin(), vq.end()); } else { GGML_ASSERT(false);