CUDA: faster q2_K, q3_K MMQ + int8 tensor cores (#7921)
* CUDA: faster q2_K, q3_K MMQ + int8 tensor cores * try CI fix * try CI fix * try CI fix * fix data race * rever q2_K precision related changes
This commit is contained in:
parent
66ef1ceedf
commit
76d66ee0be
6 changed files with 468 additions and 330 deletions
|
@ -73,6 +73,7 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
|
|||
const dim3 block_nums(1, nrows, 1);
|
||||
const size_t shared_mem = ncols_pad * sizeof(int);
|
||||
|
||||
// FIXME: this limit could be raised by ~2-4x on Ampere or newer
|
||||
GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
|
||||
|
||||
if (order == GGML_SORT_ORDER_ASC) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue