diff --git a/ggml/src/ggml-opencl2/ggml-opencl2.cpp b/ggml/src/ggml-opencl2/ggml-opencl2.cpp
index 68b48eab8..ce5c09087 100644
--- a/ggml/src/ggml-opencl2/ggml-opencl2.cpp
+++ b/ggml/src/ggml-opencl2/ggml-opencl2.cpp
@@ -805,21 +805,46 @@ struct ggml_tensor_extra_cl {
 // `offset`, which indicate their locations in the scratch buffer.
 struct ggml_tensor_extra_cl_q4_0 {
     // Quantized values.
-    cl_mem q;
+    cl_mem q = nullptr;
     // Quantized values in image1d_buffer_t.
-    cl_mem q_img;
+    cl_mem q_img = nullptr;
     // Scales.
-    cl_mem d;
+    cl_mem d = nullptr;
     // Scales in image1d_buffer_t.
-    cl_mem d_img;
+    cl_mem d_img = nullptr;
     // Size of quantized values.
-    size_t size_q;
+    size_t size_q = 0;
     // Size of scales.
-    size_t size_d;
+    size_t size_d = 0;
+
+    ~ggml_tensor_extra_cl_q4_0() {
+        reset();
+    }
 
     void reset() {
-        q = nullptr;
-        d = nullptr;
+        // When SMALL_ALLOC is not enabled, q and d are subbuffers into
+        // the bigger buffer allocated in ggml_backend_buffer.
+        // They must be released here so that the parent buffer can itself be
+        // freed when it is released; otherwise its memory would leak.
+        // When SMALL_ALLOC is enabled, q and d point to the buffers in
+        // ggml_backend_opencl2_buffer_context. These buffers get released when
+        // the context is deleted, so there is no need to release them here.
+        if (q != nullptr) {
+#ifndef GGML_OPENCL_SMALL_ALLOC
+            CL_CHECK(clReleaseMemObject(q));
+#endif
+            q = nullptr;
+        }
+        if (d != nullptr) {
+#ifndef GGML_OPENCL_SMALL_ALLOC
+            CL_CHECK(clReleaseMemObject(d));
+#endif
+            d = nullptr;
+        }
+        // Currently, q_img and d_img are only initialized when SMALL_ALLOC is
+        // enabled. They point to the images in ggml_backend_opencl2_buffer_context.
+        // So, there is no need to release them here.
+        // TODO: initialize them for the non-SMALL_ALLOC path, or remove them.
         q_img = nullptr;
         d_img = nullptr;
         size_q = 0;
@@ -1428,7 +1453,8 @@ static void ggml_backend_opencl2_buffer_set_tensor(ggml_backend_buffer_t buffer,
         GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
 
         // Allocate the new extra and create aliases from the original.
-        ggml_tensor_extra_cl_q4_0 * extra = new ggml_tensor_extra_cl_q4_0();
+        ggml_backend_opencl2_buffer_context * ctx = (ggml_backend_opencl2_buffer_context *) buffer->context;
+        ggml_tensor_extra_cl_q4_0 * extra = ctx->ggml_opencl2_alloc_temp_tensor_extra_q4_0();
 
         size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
         size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;