diff --git a/README-sycl.md b/README-sycl.md
index cfa248a95..37f0306dc 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -54,10 +54,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 
 ## OS
 
-| OS      | Status  | Verified                           |
-|---------|---------|------------------------------------|
-| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39 |
-| Windows | Support | Windows 11                         |
+| OS      | Status  | Verified                                       |
+|---------|---------|------------------------------------------------|
+| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux |
+| Windows | Support | Windows 11                                     |
 
 ## Hardware
 
@@ -70,7 +70,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 |-------------------------------|---------|---------------------------------------|
 | Intel Data Center Max Series  | Support | Max 1550, 1100                        |
 | Intel Data Center Flex Series | Support | Flex 170                              |
-| Intel Arc Series              | Support | Arc 770, 730M                         |
+| Intel Arc Series              | Support | Arc 770, 730M, Arc A750               |
 | Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake       |
 | Intel iGPU                    | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |
 
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 2afdb3abd..c00890447 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -178,6 +178,7 @@ struct cmd_params {
     std::vector<ggml_type> type_v;
     std::vector<int> n_threads;
     std::vector<int> n_gpu_layers;
+    std::vector<std::string> rpc_servers;
     std::vector<llama_split_mode> split_mode;
     std::vector<int> main_gpu;
     std::vector<bool> no_kv_offload;
@@ -202,6 +203,7 @@ static const cmd_params cmd_params_defaults = {
     /* type_v        */ {GGML_TYPE_F16},
     /* n_threads     */ {cpu_get_num_math()},
     /* n_gpu_layers  */ {99},
+    /* rpc_servers   */ {""},
     /* split_mode    */ {LLAMA_SPLIT_MODE_LAYER},
     /* main_gpu      */ {0},
     /* no_kv_offload */ {false},
@@ -230,6 +232,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -ctv, --cache-type-v <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
     printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     printf("  -ngl, --n-gpu-layers <n>            (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    printf("  -rpc, --rpc <rpc_servers>           (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
     printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@@ -384,6 +387,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+        } else if (arg == "-rpc" || arg == "--rpc") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rpc_servers.push_back(argv[i]);
         } else if (arg == "-sm" || arg == "--split-mode") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
@@ -519,6 +528,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
     if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
     if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
+    if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; }
     if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
     if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
     if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
@@ -541,6 +551,7 @@ struct cmd_params_instance {
     ggml_type type_v;
     int n_threads;
     int n_gpu_layers;
+    std::string rpc_servers;
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;
@@ -553,6 +564,9 @@ struct cmd_params_instance {
         llama_model_params mparams = llama_model_default_params();
 
         mparams.n_gpu_layers = n_gpu_layers;
+        if (!rpc_servers.empty()) {
+            mparams.rpc_servers = rpc_servers.c_str();
+        }
         mparams.split_mode = split_mode;
         mparams.main_gpu = main_gpu;
         mparams.tensor_split = tensor_split.data();
@@ -564,6 +578,7 @@
     bool equal_mparams(const cmd_params_instance & other) const {
         return model == other.model &&
                n_gpu_layers == other.n_gpu_layers &&
+               rpc_servers == other.rpc_servers &&
                split_mode == other.split_mode &&
                main_gpu == other.main_gpu &&
                use_mmap == other.use_mmap &&
@@ -592,6 +607,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     // this ordering minimizes the number of times that each model needs to be reloaded
     for (const auto & m : params.model)
     for (const auto & nl : params.n_gpu_layers)
+    for (const auto & rpc : params.rpc_servers)
     for (const auto & sm : params.split_mode)
     for (const auto & mg : params.main_gpu)
     for (const auto & ts : params.tensor_split)
@@ -618,6 +634,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
            /* .type_v       = */ tv,
            /* .n_threads    = */ nt,
            /* .n_gpu_layers = */ nl,
+           /* .rpc_servers  = */ rpc,
            /* .split_mode   = */ sm,
            /* .main_gpu     = */ mg,
            /* .no_kv_offload= */ nkvo,
@@ -643,6 +660,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
            /* .type_v       = */ tv,
            /* .n_threads    = */ nt,
            /* .n_gpu_layers = */ nl,
+           /* .rpc_servers  = */ rpc,
            /* .split_mode   = */ sm,
            /* .main_gpu     = */ mg,
            /* .no_kv_offload= */ nkvo,
@@ -668,6 +686,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
            /* .type_v       = */ tv,
            /* .n_threads    = */ nt,
            /* .n_gpu_layers = */ nl,
+           /* .rpc_servers  = */ rpc,
            /* .split_mode   = */ sm,
            /* .main_gpu     = */ mg,
            /* .no_kv_offload= */ nkvo,
@@ -692,6 +711,7 @@ struct test {
     static const bool kompute;
     static const bool metal;
     static const bool sycl;
+    static const bool rpc;
     static const bool gpu_blas;
     static const bool blas;
     static const std::string cpu_info;
@@ -790,6 +810,9 @@ struct test {
        if (sycl) {
            return GGML_SYCL_NAME;
        }
+       if (rpc) {
+           return "RPC";
+       }
        if (gpu_blas) {
            return "GPU BLAS";
        }
@@ -803,7 +826,7 @@
    static const std::vector<std::string> & get_fields() {
        static const std::vector<std::string> fields = {
            "build_commit", "build_number",
-           "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
+           "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
            "cpu_info", "gpu_info",
            "model_filename", "model_type", "model_size", "model_n_params",
            "n_batch", "n_ubatch",
@@ -859,7 +882,7 @@ struct test {
        std::vector<std::string> values = {
            build_commit, std::to_string(build_number),
            std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
-           std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
+           std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
            std::to_string(n_batch), std::to_string(n_ubatch),
@@ -894,6 +917,7 @@
 const bool test::metal    = !!ggml_cpu_has_metal();
 const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
 const bool test::blas     = !!ggml_cpu_has_blas();
 const bool test::sycl     = !!ggml_cpu_has_sycl();
+const bool test::rpc      = !!ggml_cpu_has_rpc();
 const std::string test::cpu_info = get_cpu_info();
 const std::string test::gpu_info = get_gpu_info();
 
diff --git a/ggml-cuda/concat.cu b/ggml-cuda/concat.cu
index fb9dee8f8..dac10ec36 100644
--- a/ggml-cuda/concat.cu
+++ b/ggml-cuda/concat.cu
@@ -1,5 +1,6 @@
 #include "concat.cuh"
 
+// contiguous kernels
 static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
     int nidx = threadIdx.x + blockIdx.x * blockDim.x;
     if (nidx >= ne0) {
@@ -92,39 +93,104 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, int n
     concat_f32_dim2<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
 }
 
+// non-contiguous kernel (slow)
+static __global__ void concat_f32_non_cont(
+        const char * src0,
+        const char * src1,
+              char * dst,
+           int64_t   ne00,
+           int64_t   ne01,
+           int64_t   ne02,
+           int64_t   ne03,
+          uint64_t   nb00,
+          uint64_t   nb01,
+          uint64_t   nb02,
+          uint64_t   nb03,
+           int64_t /*ne10*/,
+           int64_t /*ne11*/,
+           int64_t /*ne12*/,
+           int64_t /*ne13*/,
+          uint64_t   nb10,
+          uint64_t   nb11,
+          uint64_t   nb12,
+          uint64_t   nb13,
+           int64_t   ne0,
+           int64_t /*ne1*/,
+           int64_t /*ne2*/,
+           int64_t /*ne3*/,
+          uint64_t   nb0,
+          uint64_t   nb1,
+          uint64_t   nb2,
+          uint64_t   nb3,
+          int32_t    dim) {
+    const int64_t i3 = blockIdx.z;
+    const int64_t i2 = blockIdx.y;
+    const int64_t i1 = blockIdx.x;
+
+    int64_t o[4] = {0, 0, 0, 0};
+    o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
+
+    const float * x;
+
+    for (int i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
+        if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+            x = (const float *)(src0 + (i3       )*nb03 + (i2       )*nb02 + (i1       )*nb01 + (i0       )*nb00);
+        } else {
+            x = (const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
+        }
+
+        float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+        *y = *x;
+    }
+}
+
+
 void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
 
-    const float * src0_d = (const float *)src0->data;
-    const float * src1_d = (const float *)src1->data;
-
-    float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
     const int32_t dim = ((int32_t *) dst->op_params)[0];
 
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(src1));
-
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
 
-    if (dim != 3) {
-        for (int i3 = 0; i3 < dst->ne[3]; i3++) {
-            concat_f32_cuda(
-                src0_d + i3 * (src0->nb[3] / 4),
-                src1_d + i3 * (src1->nb[3] / 4),
-                dst_d + i3 * ( dst->nb[3] / 4),
-                src0->ne[0], src0->ne[1], src0->ne[2],
-                dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+        const float * src0_d = (const float *)src0->data;
+        const float * src1_d = (const float *)src1->data;
+
+        float * dst_d = (float *)dst->data;
+
+        if (dim != 3) {
+            for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+                concat_f32_cuda(
+                    src0_d + i3 * (src0->nb[3] / 4),
+                    src1_d + i3 * (src1->nb[3] / 4),
+                    dst_d + i3 * ( dst->nb[3] / 4),
+                    src0->ne[0], src0->ne[1], src0->ne[2],
+                    dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
+            }
+        } else {
+            const size_t size0 = ggml_nbytes(src0);
+            const size_t size1 = ggml_nbytes(src1);
+
+            CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
+            CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
         }
     } else {
-        const size_t size0 = ggml_nbytes(src0);
-        const size_t size1 = ggml_nbytes(src1);
-
-        CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
-        CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
+        dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
+        concat_f32_non_cont<<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
+            (const char *)src0->data,
+            (const char *)src1->data,
+            (      char *)dst->data,
+            src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+            src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+            src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
+            src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
+            dst->ne[0],  dst->ne[1],  dst->ne[2],  dst->ne[3],
+            dst->nb[0],  dst->nb[1],  dst->nb[2],  dst->nb[3], dim);
     }
 }
diff --git a/ggml.c b/ggml.c
index 023077ca6..e6e2397b7 100644
--- a/ggml.c
+++ b/ggml.c
@@ -60,6 +60,9 @@
 
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
+typedef atomic_int atomic_flag;
+
+#define ATOMIC_FLAG_INIT 0
 
 static void atomic_store(atomic_int * ptr, LONG val) {
     InterlockedExchange(ptr, val);
@@ -73,6 +76,12 @@ static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
 static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
     return atomic_fetch_add(ptr, -(dec));
 }
+static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
+    return InterlockedExchange(ptr, 1);
+}
+static void atomic_flag_clear(atomic_flag * ptr) {
+    InterlockedExchange(ptr, 0);
+}
 
 typedef HANDLE pthread_t;
 
@@ -2883,24 +2892,20 @@ struct ggml_state {
 
 // global state
 static struct ggml_state g_state;
-static atomic_int g_state_barrier = 0;
+static atomic_flag g_state_critical = ATOMIC_FLAG_INIT;
 
 // barrier via spin lock
 inline static void ggml_critical_section_start(void) {
-    int processing = atomic_fetch_add(&g_state_barrier, 1);
-
-    while (processing > 0) {
-        // wait for other threads to finish
-        atomic_fetch_sub(&g_state_barrier, 1);
-        sched_yield(); // TODO: reconsider this
-        processing = atomic_fetch_add(&g_state_barrier, 1);
+    while (atomic_flag_test_and_set(&g_state_critical)) {
+        // spin
+        sched_yield();
     }
 }
 
 // TODO: make this somehow automatically executed
 //       some sort of "sentry" mechanism
 inline static void ggml_critical_section_end(void) {
-    atomic_fetch_sub(&g_state_barrier, 1);
+    atomic_flag_clear(&g_state_critical);
 }
 
 #if defined(__gnu_linux__)
@@ -6392,6 +6397,16 @@ struct ggml_tensor * ggml_rope_custom_inplace(
     );
 }
 
+struct ggml_tensor * ggml_rope_xpos_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   n_dims,
+        float                 base,
+        bool                  down) {
+    return ggml_rope_impl(ctx, a, b, NULL, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
+}
+
 // ggml_rope_back
 
 struct ggml_tensor * ggml_rope_back(
@@ -11012,7 +11027,7 @@ static void ggml_compute_forward_concat_f32(
 
 static void ggml_compute_forward_concat(
     const struct ggml_compute_params * params,
-    struct ggml_tensor* dst) {
+    struct ggml_tensor * dst) {
 
     const struct ggml_tensor * src0 = dst->src[0];
 
@@ -22857,6 +22872,14 @@ int ggml_cpu_has_sycl(void) {
 #endif
 }
 
+int ggml_cpu_has_rpc(void) {
+#if defined(GGML_USE_RPC)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_gpublas(void) {
     return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
            ggml_cpu_has_sycl();
diff --git a/ggml.h b/ggml.h
index 4e6bcb30f..f9deac7e8 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1548,6 +1548,14 @@ extern "C" {
                 float                 beta_slow),
         "use ggml_rope_ext_inplace instead");
 
+    struct ggml_tensor * ggml_rope_xpos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   n_dims,
+            float                 base,
+            bool                  down);
+
     // compute correction dims for YaRN RoPE scaling
     GGML_CALL void ggml_rope_yarn_corr_dims(
         int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
@@ -2420,6 +2428,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_sse3       (void);
     GGML_API int ggml_cpu_has_ssse3      (void);
     GGML_API int ggml_cpu_has_sycl       (void);
+    GGML_API int ggml_cpu_has_rpc        (void);
     GGML_API int ggml_cpu_has_vsx        (void);
     GGML_API int ggml_cpu_has_matmul_int8(void);
diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh
index cf22afc41..3f8ddf37b 100755
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -106,8 +106,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # src/ggml-kompute.h  -> ggml-kompute.h
     # src/ggml-metal.h    -> ggml-metal.h
     # src/ggml-metal.m    -> ggml-metal.m
-    # src/ggml-mpi.h      -> ggml-mpi.h
-    # src/ggml-mpi.c      -> ggml-mpi.c
     # src/ggml-opencl.cpp -> ggml-opencl.cpp
     # src/ggml-opencl.h   -> ggml-opencl.h
     # src/ggml-quants.c   -> ggml-quants.c
@@ -145,8 +143,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \
         -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
         -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
-        -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
-        -e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \
         -e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
         -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
         -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 57bede67b..5042f82ae 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-126d34985705a5a2222723c145cb4e125ac689f3
+2aae01fd9b8f9399f343cf18f46f38996ef52e2c
diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh
index ec47fb27c..fbae6b7f8 100755
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -14,8 +14,6 @@ cp -rpv ../ggml/src/ggml-kompute.h       ./ggml-kompute.h
 cp -rpv ../ggml/src/ggml-metal.h         ./ggml-metal.h
 cp -rpv ../ggml/src/ggml-metal.m         ./ggml-metal.m
 cp -rpv ../ggml/src/ggml-metal.metal     ./ggml-metal.metal
-cp -rpv ../ggml/src/ggml-mpi.h           ./ggml-mpi.h
-cp -rpv ../ggml/src/ggml-mpi.c           ./ggml-mpi.c
 cp -rpv ../ggml/src/ggml-opencl.cpp      ./ggml-opencl.cpp
 cp -rpv ../ggml/src/ggml-opencl.h        ./ggml-opencl.h
 cp -rpv ../ggml/src/ggml-quants.c        ./ggml-quants.c
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index b200ccccd..5cde21c66 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1262,22 +1262,37 @@ struct test_concat : public test_case {
     const std::array<int64_t, 4> ne_a;
     const int64_t ne_b_d;
     const int dim;
+    const int v; // view (1 << 0: non-cont a, 1 << 1: non-cont b)
 
     std::string vars() override {
-        return VARS_TO_STR4(type, ne_a, ne_b_d, dim);
+        return VARS_TO_STR5(type, ne_a, ne_b_d, dim, v);
     }
 
     test_concat(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
             int64_t ne_b_d = 10,
-            int dim = 2)
-        : type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim) {}
+            int dim = 2, int v = 0)
+        : type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim), v(v) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         auto ne_b = ne_a;
         ne_b[dim] = ne_b_d;
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
-        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
+        ggml_tensor * a;
+        if (v & 1) {
+            auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
+            a = ggml_new_tensor(ctx, type, 4, ne.data());
+            a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
+        } else {
+            a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+        }
+        ggml_tensor * b;
+        if (v & 2) {
+            auto ne = ne_b; ne[0] *= 3; ne[1] *= 2; ne[2] *= 4;
+            b = ggml_new_tensor(ctx, type, 4, ne.data());
+            b = ggml_view_4d(ctx, b, ne_b[0], ne_b[1], ne_b[2], ne_b[3], b->nb[1], b->nb[2], b->nb[3], 0);
+        } else {
+            b = ggml_new_tensor(ctx, type, 4, ne_b.data());
+        }
         ggml_tensor * out = ggml_concat(ctx, a, b, dim);
         return out;
     }
@@ -2215,9 +2230,11 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         }
     }
 
-    for (int dim : { 0, 1, 2, 3, }) {
-        test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim));
-        test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim));
+    for (int v : { 0, 1, 2, 3 }) {
+        for (int dim : { 0, 1, 2, 3, }) {
+            test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v));
+            test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim, v));
+        }
     }
 
     for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
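
The ggml.c hunks above replace the fetch-add barrier with a test-and-set spin lock, adding a Windows shim built on InterlockedExchange for toolchains that lack atomic_flag. The following is a minimal standalone C11 sketch of the same pattern using <stdatomic.h> directly; the function and variable names are illustrative only and are not part of the ggml API.

    // C11 sketch of the spin-lock pattern introduced in ggml.c above (illustrative names).
    #include <sched.h>
    #include <stdatomic.h>

    static atomic_flag g_critical = ATOMIC_FLAG_INIT;

    static void critical_section_start(void) {
        // atomic_flag_test_and_set returns the previous value:
        // true means another thread already holds the lock, so keep spinning.
        while (atomic_flag_test_and_set(&g_critical)) {
            sched_yield(); // yield the CPU instead of busy-burning it
        }
    }

    static void critical_section_end(void) {
        atomic_flag_clear(&g_critical); // release: the next test_and_set sees false
    }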
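
For reference, the indexing rule implemented by the new concat_f32_non_cont CUDA kernel can be written as a plain C loop: any destination element whose coordinate along dim lies beyond src0's extent is read from src1 after shifting that coordinate back by src0's size. The helper below is a hypothetical host-side sketch, not ggml API; strides (nb) are in bytes, as in ggml.

    // Hypothetical CPU reference for the non-contiguous F32 concat shown above.
    #include <stdint.h>
    #include <string.h>

    static void concat_f32_ref(const char * src0, const int64_t ne0[4], const size_t nb0[4],
                               const char * src1, const size_t nb1[4],
                               char * dst, const int64_t ne[4], const size_t nb[4], int dim) {
        int64_t o[4] = {0, 0, 0, 0};
        o[dim] = ne0[dim]; // shift applied to coordinates that fall into src1

        for (int64_t i3 = 0; i3 < ne[3]; i3++)
        for (int64_t i2 = 0; i2 < ne[2]; i2++)
        for (int64_t i1 = 0; i1 < ne[1]; i1++)
        for (int64_t i0 = 0; i0 < ne[0]; i0++) {
            const char * src;
            if (i0 < ne0[0] && i1 < ne0[1] && i2 < ne0[2] && i3 < ne0[3]) {
                src = src0 + i3*nb0[3] + i2*nb0[2] + i1*nb0[1] + i0*nb0[0];
            } else {
                src = src1 + (i3 - o[3])*nb1[3] + (i2 - o[2])*nb1[2]
                           + (i1 - o[1])*nb1[1] + (i0 - o[0])*nb1[0];
            }
            // copy one float per destination element, using dst's own byte strides
            memcpy(dst + i3*nb[3] + i2*nb[2] + i1*nb[1] + i0*nb[0], src, sizeof(float));
        }
    }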