diff --git a/ggml/src/ggml-cuda/cross-entropy-loss.cu b/ggml/src/ggml-cuda/cross-entropy-loss.cu
index c9685a67b..27599a2b0 100644
--- a/ggml/src/ggml-cuda/cross-entropy-loss.cu
+++ b/ggml/src/ggml-cuda/cross-entropy-loss.cu
@@ -10,8 +10,8 @@ static __global__ void cross_entropy_loss_f32(
         const float * __restrict__ logits, const float * __restrict__ labels, float * __restrict__ dst, const int nclasses, const int k) {
     extern __shared__ float tmp[];

-    logits += blockIdx.x*nclasses;
-    labels += blockIdx.x*nclasses;
+    logits += int64_t(blockIdx.x)*nclasses;
+    labels += int64_t(blockIdx.x)*nclasses;

     // Find maximum for softmax:
     float max_logit = -INFINITY;
@@ -55,9 +55,9 @@ static __global__ void cross_entropy_loss_back_f32(
         float * __restrict__ dst, const int nclasses) {
     extern __shared__ float tmp[];

-    logits += blockIdx.x*nclasses;
-    labels += blockIdx.x*nclasses;
-    dst    += blockIdx.x*nclasses;
+    logits += int64_t(blockIdx.x)*nclasses;
+    labels += int64_t(blockIdx.x)*nclasses;
+    dst    += int64_t(blockIdx.x)*nclasses;

     float maxval = -INFINITY;
     for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
@@ -115,10 +115,10 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *

     const dim3 blocks_dim(WARP_SIZE, 1, 1);
     const dim3 blocks_num(nrows, 1, 1);
-    const int nbytes_shared = ne00*sizeof(float);
+    const size_t nbytes_shared = ne00*sizeof(float);

-    const int id    = ggml_cuda_get_device();
-    const int smpbo = ggml_cuda_info().devices[id].smpbo;
+    const int    id    = ggml_cuda_get_device();
+    const size_t smpbo = ggml_cuda_info().devices[id].smpbo;

     ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);

@@ -169,10 +169,10 @@ void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_ten

     const dim3 blocks_dim(WARP_SIZE, 1, 1);
     const dim3 blocks_num(nrows, 1, 1);
-    const int nbytes_shared = ne00*sizeof(float);
+    const size_t nbytes_shared = ne00*sizeof(float);

-    const int id    = ggml_cuda_get_device();
-    const int smpbo = ggml_cuda_info().devices[id].smpbo;
+    const int    id    = ggml_cuda_get_device();
+    const size_t smpbo = ggml_cuda_info().devices[id].smpbo;

     if (nbytes_shared <= smpbo) {
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu
index aad63a1a0..d991ec972 100644
--- a/ggml/src/ggml-cuda/norm.cu
+++ b/ggml/src/ggml-cuda/norm.cu
@@ -5,8 +5,8 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;

-    x   += row*ncols;
-    dst += row*ncols;
+    x   += int64_t(row)*ncols;
+    dst += int64_t(row)*ncols;

     float2 mean_var = make_float2(0.0f, 0.0f);

@@ -101,8 +101,8 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;

-    x   += row*ncols;
-    dst += row*ncols;
+    x   += int64_t(row)*ncols;
+    dst += int64_t(row)*ncols;

     float tmp = 0.0f; // partial sum for thread in warp

@@ -140,9 +140,9 @@ static __global__ void rms_norm_back_f32(
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;

-    grad += row*ncols;
-    xf   += row*ncols;
-    dst  += row*ncols;
+    grad += int64_t(row)*ncols;
+    xf   += int64_t(row)*ncols;
+    dst  += int64_t(row)*ncols;

     float sum_xx = 0.0f; // sum for squares of x, equivalent to forward pass
     float sum_xg = 0.0f; // sum for x * gradient, needed because RMS norm mixes inputs
diff --git a/ggml/src/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu
index 9e6cb2637..9aa4b8489 100644
--- a/ggml/src/ggml-cuda/softmax.cu
+++ b/ggml/src/ggml-cuda/softmax.cu
@@ -23,9 +23,9 @@ static __global__ void soft_max_f32(
     const int rowx = blockIdx.x;
     const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension

-    x    += rowx*ncols;
-    mask += rowy*ncols * (mask != nullptr);
-    dst  += rowx*ncols;
+    x    += int64_t(rowx)*ncols;
+    mask += int64_t(rowy)*ncols * (mask != nullptr);
+    dst  += int64_t(rowx)*ncols;

     const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;

@@ -124,9 +124,9 @@ static __global__ void soft_max_back_f32(
     const int tid  = threadIdx.x;
     const int rowx = blockIdx.x;

-    grad += rowx*ncols;
-    dstf += rowx*ncols;
-    dst  += rowx*ncols;
+    grad += int64_t(rowx)*ncols;
+    dstf += int64_t(rowx)*ncols;
+    dst  += int64_t(rowx)*ncols;

     float dgf_dot = 0.0f; // dot product of dst from forward pass and gradients
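
Note (not part of the patch above): blockIdx.x, row, and ncols are all 32-bit ints, so a product such as row*ncols is evaluated in 32-bit arithmetic and wraps once it exceeds INT_MAX, even though the resulting pointer is 64-bit; writing int64_t(row)*ncols widens one operand first so the whole multiplication happens in 64 bits. The size_t changes do the analogous thing for the shared-memory byte count compared against smpbo. The sketch below is only an illustration of that failure mode, with hypothetical row/ncols values chosen so the product exceeds INT_MAX; it is not code from the repository.

// Host-only demo; compiles with nvcc or any C++ compiler.
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical sizes: 70000 rows of 40000 floats, product = 2.8e9 > INT_MAX.
    const int row   = 70000;
    const int ncols = 40000;

    // int * int is computed in 32-bit arithmetic; the product wraps (formally UB,
    // a bogus negative offset in practice) before being widened for the pointer add.
    const int64_t wrapped = int64_t(row*ncols);
    // Widening one operand first keeps the entire multiplication in 64 bits.
    const int64_t widened = int64_t(row)*ncols;

    printf("row*ncols          = %lld\n", (long long) wrapped);  // negative value
    printf("int64_t(row)*ncols = %lld\n", (long long) widened);  // 2800000000
    return 0;
}

The first line prints a negative offset while the second prints 2800000000, which is exactly the out-of-bounds addressing the int64_t(...) casts in the kernels above avoid.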