From 8ad0bb30df71f69a1eb8f1dee642d59e04d1c4ea Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <quic_maxk@quicinc.com>
Date: Wed, 27 Nov 2024 18:09:05 -0800
Subject: [PATCH] opencl: integrate backend dyn.load interface and fix compiler
 and format warnings

---
 ggml/include/ggml-opencl2.h                   |  2 --
 ggml/src/ggml-opencl2/CMakeLists.txt          |  2 +-
 ggml/src/ggml-opencl2/ggml-opencl2.cpp        | 30 +++++++------------
 ggml/src/ggml-opencl2/kernels/embed_kernel.py | 30 ++++++++++++-------
 ggml/src/ggml-opencl2/kernels/ggml-opencl2.cl |  2 +-
 .../ggml-opencl2/kernels/ggml-opencl2_cvt.cl  |  2 +-
 .../kernels/ggml-opencl2_gemv_noshuffle.cl    |  4 +--
 .../ggml-opencl2_gemv_noshuffle_general.cl    |  4 +--
 .../ggml-opencl2/kernels/ggml-opencl2_mm.cl   |  2 +-
 .../kernels/ggml-opencl2_transpose_16.cl      |  6 ++--
 10 files changed, 41 insertions(+), 43 deletions(-)

diff --git a/ggml/include/ggml-opencl2.h b/ggml/include/ggml-opencl2.h
index 5db311968..837c1cc02 100644
--- a/ggml/include/ggml-opencl2.h
+++ b/ggml/include/ggml-opencl2.h
@@ -30,8 +30,6 @@ GGML_BACKEND_API bool ggml_backend_is_opencl2(ggml_backend_t backend);
 GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl2_buffer_type(void);
 GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl2_host_buffer_type(void);
 
-GGML_BACKEND_API ggml_backend_t ggml_backend_reg_opencl2_init(const char * params, void * user_data);
-
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl2_reg(void);
 
 #ifdef  __cplusplus
diff --git a/ggml/src/ggml-opencl2/CMakeLists.txt b/ggml/src/ggml-opencl2/CMakeLists.txt
index 474374084..d5cc816a4 100644
--- a/ggml/src/ggml-opencl2/CMakeLists.txt
+++ b/ggml/src/ggml-opencl2/CMakeLists.txt
@@ -6,7 +6,7 @@ if (OpenCL_FOUND)
 
     set(TARGET_NAME ggml-opencl2)
 
-    add_library(${TARGET_NAME}
+    ggml_add_backend_library(${TARGET_NAME}
                 ggml-opencl2.cpp
                 ../../include/ggml-opencl2.h)
     target_link_libraries(${TARGET_NAME} PRIVATE ggml-base ${OpenCL_LIBRARIES})
diff --git a/ggml/src/ggml-opencl2/ggml-opencl2.cpp b/ggml/src/ggml-opencl2/ggml-opencl2.cpp
index ce5c09087..64aa99cff 100644
--- a/ggml/src/ggml-opencl2/ggml-opencl2.cpp
+++ b/ggml/src/ggml-opencl2/ggml-opencl2.cpp
@@ -1,6 +1,12 @@
 // SPDX-FileCopyrightText: Copyright (c) Qualcomm Innovation Center, Inc. All rights reserved
 // SPDX-License-Identifier: MIT
 
+#define CL_TARGET_OPENCL_VERSION 220
+
+// suppress warnings in CL headers for GCC and Clang
+#pragma GCC diagnostic ignored "-Wgnu-anonymous-struct"
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+
 #include "ggml-opencl2.h"
 #include "ggml-backend.h"
 #include "ggml-impl.h"
@@ -1237,10 +1243,6 @@ static void ggml_backend_opencl2_buffer_init_tensor(ggml_backend_buffer_t buffer
             tensor->extra = extra;
         }
     }
-
-    // This should be removed. Keep it to make it easier to identify the backend
-    // when debugging until backend is removed from tensor.
-    tensor->backend = GGML_BACKEND_TYPE_GPU;
 }
 
 // The optimized gemm and gemv kernels are used for large matrices without batch.
@@ -1938,18 +1940,7 @@ static struct ggml_backend_device_i ggml_backend_opencl2_device_i = {
     /* .event_synchronize    = */ NULL,
 };
 
-//
-// Backend registration
-//
-
-GGML_API ggml_backend_t ggml_backend_reg_opencl2_init(const char * params, void * user_data) {
-    return ggml_backend_opencl2_init();
-
-    GGML_UNUSED(params);
-    GGML_UNUSED(user_data);
-}
-
-// new API
+// Backend registry
 
 static const char * ggml_backend_opencl2_reg_get_name(ggml_backend_reg_t reg) {
     return "OpenCL2";
@@ -1986,6 +1977,7 @@ ggml_backend_reg_t ggml_backend_opencl2_reg(void) {
 
     if (!initialized) {
         reg = ggml_backend_reg {
+            /* .api_version = */ GGML_BACKEND_API_VERSION,
             /* .iface   = */ ggml_backend_opencl2_reg_i,
             /* .context = */ NULL,
         };
@@ -2004,6 +1996,8 @@ ggml_backend_reg_t ggml_backend_opencl2_reg(void) {
     return &reg;
 }
 
+GGML_BACKEND_DL_IMPL(ggml_backend_opencl2_reg)
+
 //------------------------------------------------------------------------------
 // Debugging utils
 //------------------------------------------------------------------------------
@@ -2921,13 +2915,11 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
     // init CL objects
     // <--------------------------------------------> //
     cl_int              status;
-    cl_event            evt;
     cl_image_format     img_fmt_1d;
     cl_image_desc       img_desc_1d;
     cl_buffer_region    region;
     cl_mem              A_image1d;
     cl_mem              B_image1d;
-    cl_mem              A_sub_buffer;
     cl_mem              B_sub_buffer;
     cl_mem              C_d;
     // for B transpose
@@ -3623,7 +3615,7 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
     GGML_ASSERT(src1->extra);
 
     // GGML_OP_CPY happens between src0 and src1.
-    // GGML_OP_DUP and GGML_OP_CONT happen between src0 and dst. 
+    // GGML_OP_DUP and GGML_OP_CONT happen between src0 and dst.
     UNUSED(dst);
 
     const int ne00 = src0 ? src0->ne[0] : 0;
diff --git a/ggml/src/ggml-opencl2/kernels/embed_kernel.py b/ggml/src/ggml-opencl2/kernels/embed_kernel.py
index 6117014b3..a4b864f2a 100644
--- a/ggml/src/ggml-opencl2/kernels/embed_kernel.py
+++ b/ggml/src/ggml-opencl2/kernels/embed_kernel.py
@@ -1,19 +1,27 @@
+#
+
 import sys
+import logging
+logger = logging.getLogger("opencl-embed-kernerl")
+
 
 def main():
-  if len(sys.argv) != 3:
-    print("Usage: python embed_kernel.py <input_file> <output_file>")
-    exit(1)
+    logging.basicConfig(level=logging.INFO)
 
-  ifile = open(sys.argv[1], "r")
-  ofile = open(sys.argv[2], "w")
+    if len(sys.argv) != 3:
+        logger.info("Usage: python embed_kernel.py <input_file> <output_file>")
+        sys.exit(1)
 
-  ofile.write("R\"(\n\n")
-  ofile.write(ifile.read())
-  ofile.write("\n)\"")
+    ifile = open(sys.argv[1], "r")
+    ofile = open(sys.argv[2], "w")
+
+    ofile.write("R\"(\n\n")
+    ofile.write(ifile.read())
+    ofile.write("\n)\"")
+
+    ifile.close()
+    ofile.close()
 
-  ifile.close()
-  ofile.close()
 
 if __name__ == "__main__":
-  main()
+    main()
diff --git a/ggml/src/ggml-opencl2/kernels/ggml-opencl2.cl b/ggml/src/ggml-opencl2/kernels/ggml-opencl2.cl
index a29f41be7..e6c25b380 100644
--- a/ggml/src/ggml-opencl2/kernels/ggml-opencl2.cl
+++ b/ggml/src/ggml-opencl2/kernels/ggml-opencl2.cl
@@ -6,7 +6,7 @@
 #elif defined(cl_amd_fp16)
 #pragma OPENCL EXTENSION cl_amd_fp16 : enable
 #else
-#error "Half precision floating point not supportedby OpenCL implementation on your device." 
+#error "Half precision floating point not supportedby OpenCL implementation on your device."
 #endif
 
 #ifdef cl_khr_subgroups
diff --git a/ggml/src/ggml-opencl2/kernels/ggml-opencl2_cvt.cl b/ggml/src/ggml-opencl2/kernels/ggml-opencl2_cvt.cl
index d0a0e7724..bf9ac7d1a 100644
--- a/ggml/src/ggml-opencl2/kernels/ggml-opencl2_cvt.cl
+++ b/ggml/src/ggml-opencl2/kernels/ggml-opencl2_cvt.cl
@@ -11,7 +11,7 @@
 #elif defined(cl_amd_fp16)
 #pragma OPENCL EXTENSION cl_amd_fp16 : enable
 #else
-#error "Half precision floating point not supportedby OpenCL implementation on your device." 
+#error "Half precision floating point not supportedby OpenCL implementation on your device."
 #endif
 
 #ifdef cl_khr_subgroups
diff --git a/ggml/src/ggml-opencl2/kernels/ggml-opencl2_gemv_noshuffle.cl b/ggml/src/ggml-opencl2/kernels/ggml-opencl2_gemv_noshuffle.cl
index aef74235b..2f5c41f55 100644
--- a/ggml/src/ggml-opencl2/kernels/ggml-opencl2_gemv_noshuffle.cl
+++ b/ggml/src/ggml-opencl2/kernels/ggml-opencl2_gemv_noshuffle.cl
@@ -8,7 +8,7 @@
 #pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
 #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
 
-// assume 
+// assume
 #define QK4_0 32
 #define N_SIMDGROUP 4
 
@@ -204,7 +204,7 @@ __kernel void kernel_gemv_noshuffle(
         int ne10,               // K
         int ne12,               // 1
         int ne0,                // M
-        int ne1,                // N 
+        int ne1,                // N
         int r2,                 // 1
         int r3)
 {
diff --git a/ggml/src/ggml-opencl2/kernels/ggml-opencl2_gemv_noshuffle_general.cl b/ggml/src/ggml-opencl2/kernels/ggml-opencl2_gemv_noshuffle_general.cl
index cb0f7853a..5326af3fe 100644
--- a/ggml/src/ggml-opencl2/kernels/ggml-opencl2_gemv_noshuffle_general.cl
+++ b/ggml/src/ggml-opencl2/kernels/ggml-opencl2_gemv_noshuffle_general.cl
@@ -8,7 +8,7 @@
 #pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
 #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
 
-// assume 
+// assume
 #define QK4_0 32
 #define N_SIMDGROUP 4
 
@@ -204,7 +204,7 @@ __kernel void kernel_gemv_noshuffle(
         int ne10,               // K
         int ne12,               // 1
         int ne0,                // M
-        int ne1,                // N 
+        int ne1,                // N
         int r2,                 // 1
         int r3)
 {
diff --git a/ggml/src/ggml-opencl2/kernels/ggml-opencl2_mm.cl b/ggml/src/ggml-opencl2/kernels/ggml-opencl2_mm.cl
index 37541e51f..91311ee73 100644
--- a/ggml/src/ggml-opencl2/kernels/ggml-opencl2_mm.cl
+++ b/ggml/src/ggml-opencl2/kernels/ggml-opencl2_mm.cl
@@ -10,7 +10,7 @@
 #elif defined(cl_amd_fp16)
 #pragma OPENCL EXTENSION cl_amd_fp16 : enable
 #else
-#error "Half precision floating point not supportedby OpenCL implementation on your device." 
+#error "Half precision floating point not supportedby OpenCL implementation on your device."
 #endif
 
 #ifdef cl_khr_subgroups
diff --git a/ggml/src/ggml-opencl2/kernels/ggml-opencl2_transpose_16.cl b/ggml/src/ggml-opencl2/kernels/ggml-opencl2_transpose_16.cl
index 550b4c71a..a2a9ab558 100644
--- a/ggml/src/ggml-opencl2/kernels/ggml-opencl2_transpose_16.cl
+++ b/ggml/src/ggml-opencl2/kernels/ggml-opencl2_transpose_16.cl
@@ -4,9 +4,9 @@
 // 16-bit transpose, loading/storing an 8x8 tile of elements
 
 kernel void kernel_transpose_16(
-    __read_only image1d_buffer_t input, 
-    __write_only image1d_buffer_t output, 
-    const uint rows, 
+    __read_only image1d_buffer_t input,
+    __write_only image1d_buffer_t output,
+    const uint rows,
     const uint cols
 ) {