From a836c8f534ab789b02da149fbdaf7735500bff74 Mon Sep 17 00:00:00 2001 From: David Pflug Date: Sun, 14 Jan 2024 10:46:00 -0500 Subject: [PATCH 01/11] llama : fix missing quotes (#4937) --- llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 63f37ecdb..7af38718c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7099,7 +7099,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & } #ifdef PRETOKENIZERDEBUG - LLAMA_LOG_WARN(TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); + LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif llm_tokenizer_spm tokenizer(vocab); llama_escape_whitespace(raw_text); @@ -7120,7 +7120,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); #ifdef PRETOKENIZERDEBUG - LLAMA_LOG_WARN(TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); + LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif llm_tokenizer_bpe tokenizer(vocab); tokenizer.tokenize(raw_text, output); From 4a3156de2fac9a8ee4279de7804d4e352dcfe121 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Mon, 15 Jan 2024 07:48:06 +0200 Subject: [PATCH 02/11] CUDA: faster dequantize kernels for Q4_0 and Q4_1 (#4938) Co-authored-by: Iwan Kawrakow --- ggml-cuda.cu | 77 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 4 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index bd3814c72..a870718a7 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1105,6 +1105,61 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in #endif // GGML_CUDA_F16 } +template<typename dst_t> +static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) { + + const int i = blockIdx.x; + + // assume 32 threads + const int tid = threadIdx.x; + const int il = tid/8; + const int ir = tid%8; + const int ib = 8*i + ir; + if (ib >= nb32) { + return; + } + + dst_t * y = yy + 256*i + 32*ir + 4*il; + + const block_q4_0 * x = (const block_q4_0 *)vx + ib; + const float d = __half2float(x->d); + const float dm = -8*d; + + const uint8_t * q = x->qs + 4*il; + + for (int l = 0; l < 4; ++l) { + y[l+ 0] = d * (q[l] & 0xF) + dm; + y[l+16] = d * (q[l] >> 4) + dm; + } +} + +template<typename dst_t> +static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) { + + const int i = blockIdx.x; + + // assume 32 threads + const int tid = threadIdx.x; + const int il = tid/8; + const int ir = tid%8; + const int ib = 8*i + ir; + if (ib >= nb32) { + return; + } + + dst_t * y = yy + 256*i + 32*ir + 4*il; + + const block_q4_1 * x = (const block_q4_1 *)vx + ib; + const float2 d = __half22float2(x->dm); + + const uint8_t * q = x->qs + 4*il; + + for (int l = 0; l < 4; ++l) { + y[l+ 0] = d.x * (q[l] & 0xF) + d.y; + y[l+16] = d.x * (q[l] >> 4) + d.y; + } +} + //================================== k-quants template<typename dst_t> @@ -6253,6 +6308,20 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu #endif } +template<typename dst_t> +static void dequantize_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb32 = k / 32; + const int nb = (k + 255) / 256; + dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32); +}
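The two kernels above decode the standard ggml 4-bit layouts: a block_q4_0 packs 32 weights into 16 bytes of nibbles plus one fp16 scale d, and each weight is reconstructed as d * q + dm with dm = -8*d, which recenters the unsigned 0..15 nibble range around zero; the q4_1 variant carries a scale/min pair instead, hence the d.x * q + d.y form. As a reading aid, here is a minimal host-side sketch of the same q4_0 math; the struct is a simplified stand-in with a float scale, not ggml's actual block_q4_0 definition:

#include <cstdint>

struct block_q4_0_ref {
    float   d;      // scale (ggml stores this as fp16)
    uint8_t qs[16]; // 32 quants, 4 bits each, two per byte
};

// dequantize one 32-value block: y[j] = d * q[j] - 8*d
static void dequantize_block_q4_0_ref(const block_q4_0_ref * x, float * y) {
    const float d  = x->d;
    const float dm = -8.0f * d;
    for (int l = 0; l < 16; ++l) {
        y[l +  0] = d * (x->qs[l] & 0xF) + dm; // low nibbles -> values 0..15
        y[l + 16] = d * (x->qs[l] >>  4) + dm; // high nibbles -> values 16..31
    }
}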
+ +template<typename dst_t> +static void dequantize_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb32 = k / 32; + const int nb = (k + 255) / 256; + dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32); +} + template<typename dst_t> static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; @@ -6301,9 +6370,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { int id; switch (type) { case GGML_TYPE_Q4_0: - return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>; + return dequantize_q4_0_cuda; case GGML_TYPE_Q4_1: - return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>; + return dequantize_q4_1_cuda; case GGML_TYPE_Q5_0: return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>; case GGML_TYPE_Q5_1: @@ -6338,9 +6407,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: - return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>; + return dequantize_q4_0_cuda; case GGML_TYPE_Q4_1: - return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>; + return dequantize_q4_1_cuda; case GGML_TYPE_Q5_0: return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>; case GGML_TYPE_Q5_1: From 2faaef39799c97a53bec3898141478700da25757 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Mon, 15 Jan 2024 10:09:38 +0200 Subject: [PATCH 03/11] llama : check for 256 divisibility for IQ2_XS, IQ2_XXS (#4950) Co-authored-by: Iwan Kawrakow --- llama.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 7af38718c..f9718060d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8559,7 +8559,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty //} bool convert_incompatible_tensor = false; if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || - new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) { + new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || + new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K != 0) { @@ -8571,6 +8572,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty } if (convert_incompatible_tensor) { switch (new_type) { + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break; case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break; case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; From ddb008d845cd50bb090bf051f570130524042936 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 15 Jan 2024 13:27:00 +0200 Subject: [PATCH 04/11] cuda : fix dequantize kernel names (#4938) --- ggml-cuda.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index a870718a7..c3e14bc96 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -6309,14 +6309,14 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu } template<typename dst_t> -static void dequantize_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { +static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb32 = k / 32; const int nb = (k + 255) / 256; dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32); } template<typename dst_t> -static void dequantize_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { +static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb32 = k / 32; const int nb = (k + 255) / 256; dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
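A note on the grid math shared by these launchers: one CUDA block of 32 threads writes 256 output values, that is 8 quant blocks of 32 values, so the grid size is rounded up with (k + 255) / 256. A standalone sketch of the arithmetic with illustrative numbers (not code from the patch):

#include <cstdio>

int main() {
    const int k    = 4096;            // values to dequantize
    const int nb32 = k / 32;          // 128 quant blocks of 32 values each
    const int nb   = (k + 255) / 256; // 16 CUDA blocks of 32 threads
    // within a block, thread tid covers quant block 8*blockIdx.x + tid%8
    // and byte slice tid/8 of its 16 quant bytes (4 slices of 4 bytes)
    printf("nb32 = %d, grid = %d\n", nb32, nb);
    return 0;
}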
@@ -6370,9 +6370,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { int id; switch (type) { case GGML_TYPE_Q4_0: - return dequantize_q4_0_cuda; + return dequantize_row_q4_0_cuda; case GGML_TYPE_Q4_1: - return dequantize_q4_1_cuda; + return dequantize_row_q4_1_cuda; case GGML_TYPE_Q5_0: return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>; case GGML_TYPE_Q5_1: @@ -6407,9 +6407,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: - return dequantize_q4_0_cuda; + return dequantize_row_q4_0_cuda; case GGML_TYPE_Q4_1: - return dequantize_q4_1_cuda; + return dequantize_row_q4_1_cuda; case GGML_TYPE_Q5_0: return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>; case GGML_TYPE_Q5_1: From d9aa4ffa6e0296d42f1f676dd85de97c8491eb73 Mon Sep 17 00:00:00 2001 From: "Victor Z. Peng" Date: Mon, 15 Jan 2024 04:41:46 -0800 Subject: [PATCH 05/11] awq-py : fix typo in awq-py/README.md (#4947) --- awq-py/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awq-py/README.md b/awq-py/README.md index 59354f4e3..16e68d027 100644 --- a/awq-py/README.md +++ b/awq-py/README.md @@ -43,7 +43,7 @@ Example for llama model # For llama7b and llama2 models python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf # For mistral and mpt models -python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf +python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf ``` ## Quantize From 4483396751c79dea540808b9cb9238245d06da2b Mon Sep 17 00:00:00 2001 From: David Friehs Date: Mon, 15 Jan 2024 14:06:52 +0100 Subject: [PATCH 06/11] llama : apply classifier-free guidance to logits directly (#4951) --- common/sampling.cpp | 9 ++++--- llama.cpp | 60 ++++++++++++++++++++++++++++++--------------- llama.h | 17 +++++++++---- 3 files changed, 57 insertions(+), 29 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 8e45909f1..dd1ffeb1b 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -190,6 +190,11 @@ static llama_token llama_sampling_sample_impl( logits[it->first] += it->second; } + if (ctx_cfg) { + float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx); + llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale); + } + cur.clear(); for (llama_token token_id = 0; token_id < n_vocab; token_id++) { @@ -198,10 +203,6 @@ static llama_token llama_sampling_sample_impl( llama_token_data_array cur_p = { cur.data(), cur.size(), false }; - if (ctx_cfg) { - llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale); - } - // apply penalties const auto& penalty_tokens = params.use_penalty_prompt_tokens ?
params.penalty_prompt_tokens : prev; const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n); diff --git a/llama.cpp b/llama.cpp index f9718060d..46c4d11c8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7898,39 +7898,59 @@ static void llama_log_softmax(float * array, size_t size) { } } +void llama_sample_apply_guidance( + struct llama_context * ctx, + float * logits, + float * logits_guidance, + float scale) { + GGML_ASSERT(ctx); + + const auto t_start_sample_us = ggml_time_us(); + const auto n_vocab = llama_n_vocab(llama_get_model(ctx)); + + llama_log_softmax(logits, n_vocab); + llama_log_softmax(logits_guidance, n_vocab); + + for (int i = 0; i < n_vocab; ++i) { + auto & l = logits[i]; + const auto & g = logits_guidance[i]; + + l = scale * (l - g) + g; + } + + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; +} + void llama_sample_classifier_free_guidance( struct llama_context * ctx, llama_token_data_array * candidates, struct llama_context * guidance_ctx, float scale) { - int64_t t_start_sample_us = ggml_time_us(); - GGML_ASSERT(ctx); + int64_t t_start_sample_us; - auto n_vocab = llama_n_vocab(llama_get_model(ctx)); + t_start_sample_us = ggml_time_us(); + const size_t n_vocab = llama_n_vocab(llama_get_model(ctx)); - GGML_ASSERT(n_vocab == (int)candidates->size); + GGML_ASSERT(n_vocab == candidates->size); GGML_ASSERT(!candidates->sorted); - std::vector<float> logits_base; - logits_base.reserve(candidates->size); - for (size_t i = 0; i < candidates->size; ++i) { - logits_base.push_back(candidates->data[i].logit); - } - llama_log_softmax(logits_base.data(), candidates->size); - - float* logits_guidance = llama_get_logits(guidance_ctx); - llama_log_softmax(logits_guidance, n_vocab); - - for (int i = 0; i < n_vocab; ++i) { - float logit_guidance = logits_guidance[i]; - float logit_base = logits_base[i]; - candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance; + std::vector<float> logits_base(n_vocab); + for (size_t i = 0; i < n_vocab; ++i) { + logits_base[i] = candidates->data[i].logit; } - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + float * logits_guidance = llama_get_logits(guidance_ctx); + + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale); + t_start_sample_us = ggml_time_us(); + + for (size_t i = 0; i < n_vocab; ++i) { + candidates->data[i].logit = logits_base[i]; } + + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; } llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) { diff --git a/llama.h b/llama.h index 79c8335b6..a570b0d69 100644 --- a/llama.h +++ b/llama.h @@ -714,14 +714,21 @@ extern "C" { float penalty_present); /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 - /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. - /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. - /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
- LLAMA_API void llama_sample_classifier_free_guidance( + /// @param logits Logits extracted from the original generation context. + /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. + /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. + LLAMA_API void llama_sample_apply_guidance( + struct llama_context * ctx, + float * logits, + float * logits_guidance, + float scale); + + LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance( struct llama_context * ctx, llama_token_data_array * candidates, struct llama_context * guidance_ctx, - float scale); + float scale), + "use llama_sample_apply_guidance() instead"); /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. LLAMA_API void llama_sample_softmax( From 3e5ca7931c68152e4ec18d126e9c832dd84914c8 Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Mon, 15 Jan 2024 20:40:48 +0200 Subject: [PATCH 07/11] pass cpu-architecture arguments only to host code (C;C++) (#4943) --- CMakeLists.txt | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2741568ed..7bd640966 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -594,6 +594,13 @@ if (NOT MSVC) endif() endif() +function(add_compile_option_cpp ARG) + # Adds a compile option to C/C++ only, but not for Cuda. + # Use, e.g., for CPU-architecture flags. + add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${ARG}>) + add_compile_options($<$<COMPILE_LANGUAGE:C>:${ARG}>) +endfunction() + if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64")) message(STATUS "ARM detected") if (MSVC) @@ -628,8 +635,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE include(cmake/FindSIMD.cmake) endif () if (LLAMA_AVX512) - add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>) - add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>) + add_compile_option_cpp(/arch:AVX512) # MSVC has no compile-time flags enabling specific # AVX512 extensions, neither it defines the # macros corresponding to the extensions.
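Returning to the guidance change in PATCH 06 above: the new entry point works on raw logit arrays, computing l = scale * (l - g) + g over log-softmaxed values, so scale = 1.0f reproduces the main context's distribution unchanged and larger values push it away from the guidance context. A sketch of the call pattern, mirroring the common/sampling.cpp hunk (ctx_main, ctx_cfg and idx are assumed to come from the caller's sampling loop):

float * logits          = llama_get_logits_ith(ctx_main, idx);
float * logits_guidance = llama_get_logits_ith(ctx_cfg,  idx);

// modifies logits in place; must run before candidates are built or sorted
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, 1.5f);

The deprecated llama_sample_classifier_free_guidance() remains as a thin wrapper over the same routine, as the llama.cpp hunk shows.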
@@ -643,37 +649,35 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>) endif() elseif (LLAMA_AVX2) - add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>) - add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>) + add_compile_option_cpp(/arch:AVX2) elseif (LLAMA_AVX) - add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>) - add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>) + add_compile_option_cpp(/arch:AVX) endif() else() if (LLAMA_NATIVE) - add_compile_options(-march=native) + add_compile_option_cpp(-march=native) endif() if (LLAMA_F16C) - add_compile_options(-mf16c) + add_compile_option_cpp(-mf16c) endif() if (LLAMA_FMA) - add_compile_options(-mfma) + add_compile_option_cpp(-mfma) endif() if (LLAMA_AVX) - add_compile_options(-mavx) + add_compile_option_cpp(-mavx) endif() if (LLAMA_AVX2) - add_compile_options(-mavx2) + add_compile_option_cpp(-mavx2) endif() if (LLAMA_AVX512) - add_compile_options(-mavx512f) - add_compile_options(-mavx512bw) + add_compile_option_cpp(-mavx512f) + add_compile_option_cpp(-mavx512bw) endif() if (LLAMA_AVX512_VBMI) - add_compile_options(-mavx512vbmi) + add_compile_option_cpp(-mavx512vbmi) endif() if (LLAMA_AVX512_VNNI) - add_compile_options(-mavx512vnni) + add_compile_option_cpp(-mavx512vnni) endif() endif() elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") From e0324285a569d0583cf2f4a07a2402221ee25f58 Mon Sep 17 00:00:00 2001 From: stduhpf Date: Tue, 16 Jan 2024 12:04:32 +0100 Subject: [PATCH 08/11] speculative : threading options (#4959) * speculative: expose draft threading * fix usage format * accept -td and -tbd args * speculative: revert default behavior when -td is unspecified * fix trailing whitespace --- common/common.cpp | 22 ++++++++++++++++++++++ common/common.h | 2 ++ examples/speculative/speculative.cpp | 4 ++++ 3 files changed, 28 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index c11006bcb..2b0865fff 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -167,6 +167,24 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads_batch <= 0) { params.n_threads_batch = std::thread::hardware_concurrency(); } + } else if (arg == "-td" || arg == "--threads-draft") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads_draft = std::stoi(argv[i]); + if (params.n_threads_draft <= 0) { + params.n_threads_draft = std::thread::hardware_concurrency(); + } + } else if (arg == "-tbd" || arg == "--threads-batch-draft") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads_batch_draft = std::stoi(argv[i]); + if (params.n_threads_batch_draft <= 0) { + params.n_threads_batch_draft = std::thread::hardware_concurrency(); + } } else if (arg == "-p" || arg == "--prompt") { if (++i >= argc) { invalid_param = true; break; } @@ -845,6 +863,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads); printf(" -tb N, --threads-batch N\n"); printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); printf(" -td N, --threads-draft N"); printf(" number of threads to use during generation (default: same as --threads)"); printf(" -tbd N, --threads-batch-draft N\n"); printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n"); printf(" -p PROMPT, --prompt PROMPT\n"); printf(" prompt to start generation with (default: empty)\n"); printf(" -e,
--escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); diff --git a/common/common.h b/common/common.h index 096468243..1f43e6282 100644 --- a/common/common.h +++ b/common/common.h @@ -46,7 +46,9 @@ struct gpt_params { uint32_t seed = -1; // RNG seed int32_t n_threads = get_num_physical_cores(); + int32_t n_threads_draft = -1; int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) + int32_t n_threads_batch_draft = -1; int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 20f1fb5bf..7b3af01f3 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -65,6 +65,10 @@ int main(int argc, char ** argv) { // load the draft model params.model = params.model_draft; params.n_gpu_layers = params.n_gpu_layers_draft; + if (params.n_threads_draft > 0) { + params.n_threads = params.n_threads_draft; + } + params.n_threads_batch = params.n_threads_batch_draft; std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); { From d75c232e1da56f19ac4d2530dadbe0ab3a11fde5 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 16 Jan 2024 12:14:19 +0100 Subject: [PATCH 09/11] finetune : use LLAMA_FILE_MAGIC_GGLA (#4961) This commit replaces the magic number LLAMA_FILE_MAGIC_LORA used in finetune.cpp with LLAMA_FILE_MAGIC_GGLA defined in llama.h. Signed-off-by: Daniel Bevenius --- examples/finetune/finetune.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index eaca42fc1..a6620fd73 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1138,9 +1138,8 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor return tn_buf.data(); }; - uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla' // write_magic - file.write_u32(LLAMA_FILE_MAGIC_LORA); // magic + file.write_u32(LLAMA_FILE_MAGIC_GGLA); // magic file.write_u32(1); // version // write_hparams file.write_u32(lora->hparams.lora_r); From a0b3ac8c48b66206b9c5921ce57bd5c0ea6557c3 Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Tue, 16 Jan 2024 03:16:33 -0800 Subject: [PATCH 10/11] ggml : introduce GGML_CALL function annotation (#4850) This change makes it possible to build ggml-cuda.cu and ggml-metal.m as independent dynamic shared objects, that may be conditionally linked at runtime in a multiplatform binary. It introduces a GGML_CALL annotation that documents which functions have a cyclic call relationship, between the application code and GPU modules. 
This change does nothing, unless the build defines -DGGML_MULTIPLATFORM which causes back-references and function pointers to conform to MS ABI which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms --- ggml-backend-impl.h | 60 +++++++++++----------- ggml-backend.c | 80 ++++++++++++++--------------- ggml-backend.h | 50 +++++++++--------- ggml-cuda.cu | 121 ++++++++++++++++++++++---------------------- ggml-cuda.h | 32 ++++++------ ggml-metal.h | 4 +- ggml-metal.m | 42 +++++++-------- ggml.c | 32 ++++++------ ggml.h | 58 ++++++++++++--------- 9 files changed, 244 insertions(+), 235 deletions(-) diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index 1db32901f..1397828d9 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -16,14 +16,14 @@ extern "C" { typedef void * ggml_backend_buffer_type_context_t; struct ggml_backend_buffer_type_i { - const char * (*get_name) (ggml_backend_buffer_type_t buft); - ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); - size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment - size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding - bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend + const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft); + ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); + size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment + size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding + bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend // check if tensor data is in host memory // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) - bool (*is_host) (ggml_backend_buffer_type_t buft); + bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft); }; struct ggml_backend_buffer_type { @@ -35,15 +35,15 @@ extern "C" { typedef void * ggml_backend_buffer_context_t; struct ggml_backend_buffer_i { - const char * (*get_name) (ggml_backend_buffer_t buffer); - void (*free_buffer)(ggml_backend_buffer_t buffer); - void * (*get_base) (ggml_backend_buffer_t buffer); - void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer - void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); - void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras + const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer); + void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer); + void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer); + void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + void (*GGML_CALL set_tensor) 
(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer + void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value); + void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras }; struct ggml_backend_buffer { @@ -54,7 +54,7 @@ extern "C" { enum ggml_backend_buffer_usage usage; }; - ggml_backend_buffer_t ggml_backend_buffer_init( + GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init( ggml_backend_buffer_type_t buft, struct ggml_backend_buffer_i iface, ggml_backend_buffer_context_t context, @@ -70,31 +70,31 @@ extern "C" { typedef void * ggml_backend_context_t; struct ggml_backend_i { - const char * (*get_name)(ggml_backend_t backend); + const char * (*GGML_CALL get_name)(ggml_backend_t backend); - void (*free)(ggml_backend_t backend); + void (*GGML_CALL free)(ggml_backend_t backend); // buffer allocation - ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend); + ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend); // (optional) asynchronous tensor data access - void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - bool (*cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst); + void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst); // (optional) complete all pending operations - void (*synchronize)(ggml_backend_t backend); + void (*GGML_CALL synchronize)(ggml_backend_t backend); // compute graph with a plan - ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); - void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); + void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph without a plan (async) - bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); + bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); // check if the backend supports an operation - bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); }; struct ggml_backend { @@ -107,9 +107,9 @@ extern "C" { // Backend registry // - typedef ggml_backend_t 
(*ggml_backend_init_fn)(const char * params, void * user_data); + typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data); - void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data); + GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data); #ifdef __cplusplus } diff --git a/ggml-backend.c b/ggml-backend.c index 505dbba47..f5424fb90 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -19,7 +19,7 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { return buft->iface.get_name(buft); } -ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { return buft->iface.alloc_buffer(buft, size); } @@ -27,7 +27,7 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) { return buft->iface.get_alignment(buft); } -size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { +GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { // get_alloc_size is optional, defaults to ggml_nbytes if (buft->iface.get_alloc_size) { return buft->iface.get_alloc_size(buft, tensor); @@ -48,7 +48,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) { // backend buffer -ggml_backend_buffer_t ggml_backend_buffer_init( +GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init( ggml_backend_buffer_type_t buft, struct ggml_backend_buffer_i iface, ggml_backend_buffer_context_t context, @@ -95,7 +95,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { return base; } -void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { +GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { // init_tensor is optional if (buffer->iface.init_tensor) { buffer->iface.init_tensor(buffer, tensor); @@ -191,7 +191,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten } } -void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); @@ -201,7 +201,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size); } -void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); @@ -318,9 +318,9 @@ struct ggml_backend_reg { static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG]; static size_t ggml_backend_registry_count = 0; -static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data); +GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data); -static void ggml_backend_registry_init(void) { +GGML_CALL static void ggml_backend_registry_init(void) { static bool initialized = false; if (initialized) { @@ -333,18 +333,18 @@ static void ggml_backend_registry_init(void) { // add forward decls here to avoid including the backend headers #ifdef GGML_USE_CUBLAS - extern void ggml_backend_cuda_reg_devices(void); + extern GGML_CALL void ggml_backend_cuda_reg_devices(void); ggml_backend_cuda_reg_devices(); #endif #ifdef GGML_USE_METAL - extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); - extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); + extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); + extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL); #endif } -void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { +GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG); size_t id = ggml_backend_registry_count; @@ -439,33 +439,33 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) { // backend CPU -static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) { +GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) { return "CPU"; GGML_UNUSED(buffer); } -static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { +GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { return (void *)buffer->context; } -static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { +GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { free(buffer->context); } -static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); GGML_UNUSED(buffer); } -static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { memcpy(data, (const char *)tensor->data + offset, size); GGML_UNUSED(buffer); } -static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { +GGML_CALL static bool 
ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); return true; @@ -475,7 +475,7 @@ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con GGML_UNUSED(buffer); } -static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { memset(buffer->context, value, buffer->size); } @@ -506,13 +506,13 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 -static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { +GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "CPU"; GGML_UNUSED(buft); } -static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC? @@ -521,25 +521,25 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size); } -static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return TENSOR_ALIGNMENT; GGML_UNUSED(buft); } -static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { +GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { return ggml_backend_is_cpu(backend); GGML_UNUSED(buft); } -static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { +GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; GGML_UNUSED(buft); } -ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { +GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { /* .iface = */ { /* .get_name = */ ggml_backend_cpu_buffer_type_get_name, @@ -561,23 +561,23 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { #include <hbwmalloc.h> -static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { +GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "CPU_HBM"; GGML_UNUSED(buft); } -static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) { +GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) { return "CPU_HBM"; GGML_UNUSED(buf); } -static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { +GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { hbw_free(buffer->context); } -static ggml_backend_buffer_t
ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { //void * ptr = hbw_malloc(size); void * ptr; int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); @@ -617,20 +617,20 @@ struct ggml_backend_cpu_context { size_t work_size; }; -static const char * ggml_backend_cpu_name(ggml_backend_t backend) { +GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) { return "CPU"; GGML_UNUSED(backend); } -static void ggml_backend_cpu_free(ggml_backend_t backend) { +GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; free(cpu_ctx->work_data); free(cpu_ctx); free(backend); } -static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) { +GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) { return ggml_backend_cpu_buffer_type(); GGML_UNUSED(backend); @@ -641,7 +641,7 @@ struct ggml_backend_plan_cpu { struct ggml_cgraph cgraph; }; -static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { +GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); @@ -656,7 +656,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend return cpu_plan; } -static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; free(cpu_plan->cplan.work_data); @@ -665,7 +665,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen GGML_UNUSED(backend); } -static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); @@ -673,7 +673,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac GGML_UNUSED(backend); } -static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); @@ -690,7 +690,7 @@ static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c return true; } -static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { +GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { switch (op->op) { case GGML_OP_MUL_MAT: return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; @@ -732,7 +732,7 @@ 
ggml_backend_t ggml_backend_cpu_init(void) { return cpu_backend; } -bool ggml_backend_is_cpu(ggml_backend_t backend) { +GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) { return backend && backend->iface.get_name == ggml_backend_cpu_name; } @@ -743,11 +743,11 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { ctx->n_threads = n_threads; } -ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { +GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size); } -static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) { +GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) { return ggml_backend_cpu_init(); GGML_UNUSED(params); diff --git a/ggml-backend.h b/ggml-backend.h index 4eb244af1..12b4b4ab7 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -17,12 +17,12 @@ extern "C" { // // buffer type - GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); - GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); - GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); - GGML_API size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); - GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); - GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); + GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); + GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); + GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); + GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); + GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); + GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); // buffer enum ggml_backend_buffer_usage { @@ -30,18 +30,18 @@ extern "C" { GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, }; - GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); - GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); - GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); - GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); + GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); + GGML_API void 
ggml_backend_buffer_free (ggml_backend_buffer_t buffer); + GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); + GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); + GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); + GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); // // Backend @@ -58,8 +58,8 @@ extern "C" { GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); GGML_API void ggml_backend_synchronize(ggml_backend_t backend); @@ -80,13 +80,13 @@ extern "C" { GGML_API ggml_backend_t ggml_backend_cpu_init(void); - GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend); - GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); + GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); + GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); // Create a backend buffer from an existing pointer - GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); + GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); - GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); + GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); #ifdef GGML_USE_CPU_HBM GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); @@ -183,7 +183,7 @@ extern "C" { GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); - typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); + typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); // Compare the output of two backends GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 
c3e14bc96..568c411af 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -7615,11 +7615,11 @@ struct cuda_pool_alloc { static bool g_cublas_loaded = false; -bool ggml_cublas_loaded(void) { +GGML_CALL bool ggml_cublas_loaded(void) { return g_cublas_loaded; } -void ggml_init_cublas() { +GGML_CALL void ggml_init_cublas() { static bool initialized = false; if (!initialized) { @@ -7707,7 +7707,7 @@ void ggml_init_cublas() { } } -void * ggml_cuda_host_malloc(size_t size) { +GGML_CALL void * ggml_cuda_host_malloc(size_t size) { if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { return nullptr; } @@ -7725,7 +7725,7 @@ void * ggml_cuda_host_malloc(size_t size) { return ptr; } -void ggml_cuda_host_free(void * ptr) { +GGML_CALL void ggml_cuda_host_free(void * ptr) { CUDA_CHECK(cudaFreeHost(ptr)); } @@ -9242,7 +9242,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); } -bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { +GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { if (!g_cublas_loaded) return false; const int64_t ne10 = src1->ne[0]; @@ -10013,7 +10013,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); } -static void ggml_cuda_set_main_device(const int main_device) { +GGML_CALL static void ggml_cuda_set_main_device(const int main_device) { if (main_device >= g_device_count) { fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", main_device, g_device_count, g_main_device); @@ -10028,7 +10028,7 @@ static void ggml_cuda_set_main_device(const int main_device) { } } -bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { if (!g_cublas_loaded) return false; ggml_cuda_func_t func; @@ -10186,7 +10186,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ return true; } -int ggml_cuda_get_device_count() { +GGML_CALL int ggml_cuda_get_device_count() { int device_count; if (cudaGetDeviceCount(&device_count) != cudaSuccess) { return 0; @@ -10194,7 +10194,7 @@ int ggml_cuda_get_device_count() { return device_count; } -void ggml_cuda_get_device_description(int device, char * description, size_t description_size) { +GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); snprintf(description, description_size, "%s", prop.name); @@ -10244,27 +10244,27 @@ struct ggml_backend_cuda_buffer_context { } }; -static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) { +GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; return ctx->name.c_str(); } -static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { +GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name; } -static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { +GGML_CALL 
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; CUDA_CHECK(cudaFree(ctx->dev_ptr)); delete ctx; } -static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { +GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; return ctx->dev_ptr; } -static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; if (tensor->view_src != NULL && tensor->view_offs == 0) { @@ -10296,7 +10296,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g } } -static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; @@ -10307,7 +10307,7 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg CUDA_CHECK(cudaDeviceSynchronize()); } -static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; @@ -10318,7 +10318,7 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co CUDA_CHECK(cudaDeviceSynchronize()); } -static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { +GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { if (ggml_backend_buffer_is_cuda(src->buffer)) { ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context; ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context; @@ -10335,7 +10335,7 @@ static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, co return false; } -static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); @@ -10357,19 +10357,18 @@ static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { }; // cuda buffer type - struct ggml_backend_cuda_buffer_type_context { int device; std::string name; }; -static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) { +GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) { 
ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; return ctx->name.c_str(); } -static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; ggml_cuda_set_device(buft_ctx->device); @@ -10388,13 +10387,13 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size); } -static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 128; UNUSED(buft); } -static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { +GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { int64_t row_low = 0; int64_t row_high = ggml_nrows(tensor); int64_t nrows_split = row_high - row_low; @@ -10414,7 +10413,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t UNUSED(buft); } -static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { +GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { if (!ggml_backend_is_cuda(backend)) { return false; } @@ -10434,7 +10433,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { /* .is_host = */ NULL, }; -ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { +GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { // FIXME: this is not thread safe if (device >= ggml_backend_cuda_get_device_count()) { return nullptr; @@ -10479,7 +10478,7 @@ struct ggml_backend_cuda_split_buffer_context { std::vector tensor_extras; }; -static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) { +GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) { return GGML_CUDA_NAME "_Split"; UNUSED(buffer); @@ -10490,19 +10489,19 @@ static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_ // return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name; //} -static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { +GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; delete ctx; } -static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { +GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced return (void *)0x1000; UNUSED(buffer); } -static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { GGML_ASSERT(tensor->view_src == nullptr); 
// views of split tensors are not supported ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; @@ -10552,7 +10551,7 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf tensor->extra = extra; } -static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -10586,7 +10585,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff } } -static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -10620,7 +10619,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff } } -static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { UNUSED(buffer); UNUSED(value); } @@ -10639,13 +10638,13 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { // cuda split buffer type -static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) { +GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) { return GGML_CUDA_NAME "_Split"; UNUSED(buft); } -static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point // instead, we allocate them for each tensor separately in init_tensor // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, @@ -10655,13 +10654,13 @@ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(gg return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size); } -static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 128; UNUSED(buft); } -static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { +GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; size_t total_size = 0; @@ -10688,13 +10687,13 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_bu return 
total_size; } -static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { +GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { return ggml_backend_is_cuda(backend); UNUSED(buft); } -static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { +GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return false; UNUSED(buft); @@ -10709,7 +10708,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, }; -ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { +GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { // FIXME: this is not thread safe static std::map, struct ggml_backend_buffer_type> buft_map; @@ -10745,23 +10744,23 @@ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * ten // host buffer type -static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) { +GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) { return GGML_CUDA_NAME "_Host"; UNUSED(buft); } -static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) { +GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) { return GGML_CUDA_NAME "_Host"; UNUSED(buffer); } -static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { +GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_cuda_host_free(buffer->context); } -static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { void * ptr = ggml_cuda_host_malloc(size); if (ptr == nullptr) { @@ -10777,7 +10776,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm return buffer; } -ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { +GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = { /* .iface = */ { /* .get_name = */ ggml_backend_cuda_host_buffer_type_name, @@ -10795,26 +10794,26 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { // backend -static const char * ggml_backend_cuda_name(ggml_backend_t backend) { +GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; return cuda_ctx->name.c_str(); } -static void ggml_backend_cuda_free(ggml_backend_t backend) { +GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; delete cuda_ctx; delete backend; } -static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) { +GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; return 
ggml_backend_cuda_buffer_type(cuda_ctx->device); } -static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); @@ -10823,7 +10822,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0])); } -static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); @@ -10832,7 +10831,7 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0])); } -static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { +GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) { @@ -10843,7 +10842,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggm return false; } -static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { +GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0])); @@ -10851,7 +10850,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { UNUSED(backend); } -static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_cuda_set_main_device(cuda_ctx->device); @@ -10890,7 +10889,7 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph return true; } -static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { +GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -11016,7 +11015,7 @@ static ggml_backend_i ggml_backend_cuda_interface = { /* .supports_op = */ ggml_backend_cuda_supports_op, }; -ggml_backend_t ggml_backend_cuda_init(int device) { +GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) { ggml_init_cublas(); // TODO: remove from ggml.c if (device < 0 || device 
>= ggml_cuda_get_device_count()) { @@ -11040,35 +11039,35 @@ ggml_backend_t ggml_backend_cuda_init(int device) { return cuda_backend; } -bool ggml_backend_is_cuda(ggml_backend_t backend) { +GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) { return backend && backend->iface.get_name == ggml_backend_cuda_name; } -int ggml_backend_cuda_get_device_count() { +GGML_CALL int ggml_backend_cuda_get_device_count() { return ggml_cuda_get_device_count(); } -void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) { +GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) { ggml_cuda_get_device_description(device, description, description_size); } -void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) { +GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) { ggml_cuda_set_device(device); CUDA_CHECK(cudaMemGetInfo(free, total)); } // backend registry -static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) { +GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) { ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data); return cuda_backend; UNUSED(params); } -extern "C" int ggml_backend_cuda_reg_devices(); +extern "C" GGML_CALL int ggml_backend_cuda_reg_devices(); -int ggml_backend_cuda_reg_devices() { +GGML_CALL int ggml_backend_cuda_reg_devices() { int device_count = ggml_cuda_get_device_count(); //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization for (int i = 0; i < device_count; i++) { diff --git a/ggml-cuda.h b/ggml-cuda.h index d19cbf3fd..b1ebd61d7 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -18,34 +18,34 @@ extern "C" { #define GGML_CUDA_MAX_DEVICES 16 // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`. -GGML_API void ggml_init_cublas(void); +GGML_API GGML_CALL void ggml_init_cublas(void); // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`. 
-GGML_API bool ggml_cublas_loaded(void); +GGML_API GGML_CALL bool ggml_cublas_loaded(void); -GGML_API void * ggml_cuda_host_malloc(size_t size); -GGML_API void ggml_cuda_host_free(void * ptr); +GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size); +GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr); -GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); +GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); -GGML_API int ggml_cuda_get_device_count(void); -GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size); +GGML_API GGML_CALL int ggml_cuda_get_device_count(void); +GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size); // backend API -GGML_API ggml_backend_t ggml_backend_cuda_init(int device); +GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device); -GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend); +GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend); -GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); // split tensor buffer that splits matrices by rows across multiple devices -GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU -GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); -GGML_API int ggml_backend_cuda_get_device_count(void); -GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); -GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); +GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void); +GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); +GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); #ifdef __cplusplus } diff --git a/ggml-metal.h b/ggml-metal.h index cd5e2995f..8b0bfc5f1 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -47,11 +47,11 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void); GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); -GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size); +GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size); GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb); -GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); // helper to check if the device supports a specific family // ideally, the user code should be doing these checks diff 
--git a/ggml-metal.m b/ggml-metal.m index 2ca726055..867f2fd48 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -2294,13 +2294,13 @@ static void ggml_backend_metal_free_device(void) { } } -static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) { +GGML_CALL static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) { return "Metal"; UNUSED(buffer); } -static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { +GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; for (int i = 0; i < ctx->n_buffers; i++) { @@ -2315,25 +2315,25 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) free(ctx); } -static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { +GGML_CALL static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; return ctx->all_data; } -static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); UNUSED(buffer); } -static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { memcpy(data, (const char *)tensor->data + offset, size); UNUSED(buffer); } -static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { +GGML_CALL static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); return true; @@ -2343,7 +2343,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c UNUSED(buffer); } -static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +GGML_CALL static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; memset(ctx->all_data, value, ctx->all_size); @@ -2363,13 +2363,13 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { // default buffer type -static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) { +GGML_CALL static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "Metal"; UNUSED(buft); } -static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); const size_t 
size_page = sysconf(_SC_PAGESIZE); @@ -2421,24 +2421,24 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size); } -static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 32; UNUSED(buft); } -static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { +GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend); UNUSED(buft); } -static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { +GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; UNUSED(buft); } -ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { +GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = { /* .iface = */ { /* .get_name = */ ggml_backend_metal_buffer_type_get_name, @@ -2456,7 +2456,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { // buffer from ptr -ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { +GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); ctx->all_data = data; @@ -2543,31 +2543,31 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz // backend -static const char * ggml_backend_metal_name(ggml_backend_t backend) { +GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) { return "Metal"; UNUSED(backend); } -static void ggml_backend_metal_free(ggml_backend_t backend) { +GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) { struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; ggml_metal_free(ctx); free(backend); } -static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) { +GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) { return ggml_backend_metal_buffer_type(); UNUSED(backend); } -static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; return ggml_metal_graph_compute(metal_ctx, cgraph); } -static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { +GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; return ggml_metal_supports_op(metal_ctx, op); @@ -2630,9 +2630,9 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) { return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)]; } -ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning 
+GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning -ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) { +GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) { return ggml_backend_metal_init(); GGML_UNUSED(params); diff --git a/ggml.c b/ggml.c index ef5888ab2..5779f32d2 100644 --- a/ggml.c +++ b/ggml.c @@ -1990,19 +1990,19 @@ void ggml_print_objects(const struct ggml_context * ctx) { GGML_PRINT("%s: --- end ---\n", __func__); } -int64_t ggml_nelements(const struct ggml_tensor * tensor) { +GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -int64_t ggml_nrows(const struct ggml_tensor * tensor) { +GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -size_t ggml_nbytes(const struct ggml_tensor * tensor) { +GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) { size_t nbytes; size_t blck_size = ggml_blck_size(tensor->type); if (blck_size == 1) { @@ -2025,15 +2025,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); } -int ggml_blck_size(enum ggml_type type) { +GGML_CALL int ggml_blck_size(enum ggml_type type) { return type_traits[type].blck_size; } -size_t ggml_type_size(enum ggml_type type) { +GGML_CALL size_t ggml_type_size(enum ggml_type type) { return type_traits[type].type_size; } -size_t ggml_row_size(enum ggml_type type, int64_t ne) { +GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) { assert(ne % ggml_blck_size(type) == 0); return ggml_type_size(type)*ne/ggml_blck_size(type); } @@ -2042,15 +2042,15 @@ double ggml_type_sizef(enum ggml_type type) { return ((double)(type_traits[type].type_size))/type_traits[type].blck_size; } -const char * ggml_type_name(enum ggml_type type) { +GGML_CALL const char * ggml_type_name(enum ggml_type type) { return type_traits[type].type_name; } -bool ggml_is_quantized(enum ggml_type type) { +GGML_CALL bool ggml_is_quantized(enum ggml_type type) { return type_traits[type].is_quantized; } -const char * ggml_op_name(enum ggml_op op) { +GGML_CALL const char * ggml_op_name(enum ggml_op op) { return GGML_OP_NAME[op]; } @@ -2062,7 +2062,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) { return GGML_UNARY_OP_NAME[op]; } -const char * ggml_op_desc(const struct ggml_tensor * t) { +GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) { if (t->op == GGML_OP_UNARY) { enum ggml_unary_op uop = ggml_get_unary_op(t); return ggml_unary_op_name(uop); @@ -2072,7 +2072,7 @@ const char * ggml_op_desc(const struct ggml_tensor * t) { } } -size_t ggml_element_size(const struct ggml_tensor * tensor) { +GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) { return ggml_type_size(tensor->type); } @@ -2154,11 +2154,11 @@ size_t ggml_tensor_overhead(void) { return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE; } -bool ggml_is_transposed(const struct ggml_tensor * tensor) { +GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } -bool ggml_is_contiguous(const struct ggml_tensor * tensor) { +GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) { 
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return @@ -2177,7 +2177,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -bool ggml_is_permuted(const struct ggml_tensor * tensor) { +GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3]; @@ -3079,7 +3079,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) { return (float *)(tensor->data); } -enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { +GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { GGML_ASSERT(tensor->op == GGML_OP_UNARY); return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0); } @@ -11653,7 +11653,7 @@ static void ggml_rope_cache_init( } } -void ggml_rope_yarn_corr_dims( +GGML_CALL void ggml_rope_yarn_corr_dims( int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] ) { // start and end correction dims diff --git a/ggml.h b/ggml.h index 1187074f7..837c52e68 100644 --- a/ggml.h +++ b/ggml.h @@ -187,6 +187,16 @@ # define GGML_API #endif +#ifdef GGML_MULTIPLATFORM +# if defined(_WIN32) +# define GGML_CALL +# else +# define GGML_CALL __attribute__((__ms_abi__)) +# endif +#else +# define GGML_CALL +#endif + // TODO: support for clang #ifdef __GNUC__ # define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) @@ -649,41 +659,41 @@ extern "C" { GGML_API void ggml_print_object (const struct ggml_object * obj); GGML_API void ggml_print_objects(const struct ggml_context * ctx); - GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); - GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); - GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); - GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN + GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor); + GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor); + GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN - GGML_API int ggml_blck_size(enum ggml_type type); - GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block - GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row + GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type); + GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block + GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row GGML_DEPRECATED( GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float "use ggml_row_size() instead"); - GGML_API const char * ggml_type_name(enum ggml_type type); - GGML_API const char * ggml_op_name (enum ggml_op op); - GGML_API const char * ggml_op_symbol(enum ggml_op op); + GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type); + GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op); + GGML_API const char * ggml_op_symbol(enum 
ggml_op op); - GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op); - GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name + GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op); + GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name - GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); + GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor); - GGML_API bool ggml_is_quantized(enum ggml_type type); + GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type); // TODO: temporary until model loading of ggml examples is refactored GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); - GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); - GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); - GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); - GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); - GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars + GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor); + GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor); + GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); + GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); @@ -770,7 +780,7 @@ extern "C" { GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); - GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); + GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name); @@ -1413,7 +1423,7 @@ extern "C" { float beta_slow); // compute correction dims for YaRN RoPE scaling - void ggml_rope_yarn_corr_dims( + GGML_CALL void ggml_rope_yarn_corr_dims( int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); // xPos RoPE, in-place, returns view(a) From 122ed4840cc6d209df6043e027f9f8a03aee01da Mon Sep 17 00:00:00 2001 From: Maximilian Winter Date: Tue, 16 Jan 2024 13:10:48 +0100 Subject: [PATCH 11/11] examples : fix and improv docs for the grammar generator (#4909) * Create pydantic-models-to-grammar.py * Added some comments for usage * Refactored Grammar Generator Added example and usage instruction. * Update pydantic_models_to_grammar.py * Update pydantic-models-to-grammar-examples.py * Renamed module and imported it. * Update pydantic-models-to-grammar.py * Renamed file and fixed grammar generator issue. * Fixed some issues and bugs of the grammar generator. 
Improved Documentation * Update pydantic_models_to_grammar.py --- examples/pydantic_models_to_grammar.py | 877 +++++++++++++++---------- 1 file changed, 519 insertions(+), 358 deletions(-) diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py index 41b98fdc1..848c1c367 100644 --- a/examples/pydantic_models_to_grammar.py +++ b/examples/pydantic_models_to_grammar.py @@ -4,6 +4,7 @@ from copy import copy from inspect import isclass, getdoc from types import NoneType +from docstring_parser import parse from pydantic import BaseModel, create_model, Field from typing import Any, Type, List, get_args, get_origin, Tuple, Union, Optional, _GenericAlias from enum import Enum @@ -25,9 +26,10 @@ class PydanticDataType(Enum): ENUM (str): Represents an enum data type. CUSTOM_CLASS (str): Represents a custom class data type. """ + STRING = "string" TRIPLE_QUOTED_STRING = "triple_quoted_string" - MARKDOWN_STRING = "markdown_string" + MARKDOWN_CODE_BLOCK = "markdown_code_block" BOOLEAN = "boolean" INTEGER = "integer" FLOAT = "float" @@ -78,10 +80,10 @@ def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str: def format_model_and_field_name(model_name: str) -> str: - parts = re.findall('[A-Z][^A-Z]*', model_name) + parts = re.findall("[A-Z][^A-Z]*", model_name) if not parts: # Check if the list is empty return model_name.lower().replace("_", "-") - return '-'.join(part.lower().replace("_", "-") for part in parts) + return "-".join(part.lower().replace("_", "-") for part in parts) def generate_list_rule(element_type): @@ -93,29 +95,31 @@ def generate_list_rule(element_type): """ rule_name = f"{map_pydantic_type_to_gbnf(element_type)}-list" element_rule = map_pydantic_type_to_gbnf(element_type) - list_rule = fr'{rule_name} ::= "[" {element_rule} ("," {element_rule})* "]"' + list_rule = rf'{rule_name} ::= "[" {element_rule} ("," {element_rule})* "]"' return list_rule def get_members_structure(cls, rule_name): if issubclass(cls, Enum): # Handle Enum types - members = [f'\"\\\"{member.value}\\\"\"' for name, member in cls.__members__.items()] + members = [f'"\\"{member.value}\\""' for name, member in cls.__members__.items()] return f"{cls.__name__.lower()} ::= " + " | ".join(members) if cls.__annotations__ and cls.__annotations__ != {}: result = f'{rule_name} ::= "{{"' type_list_rules = [] # Modify this comprehension - members = [f' \"\\\"{name}\\\"\" ":" {map_pydantic_type_to_gbnf(param_type)}' - for name, param_type in cls.__annotations__.items() - if name != 'self'] + members = [ + f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param_type)}' + for name, param_type in cls.__annotations__.items() + if name != "self" + ] result += '"," '.join(members) result += ' "}"' return result, type_list_rules elif rule_name == "custom-class-any": - result = f'{rule_name} ::= ' - result += 'value' + result = f"{rule_name} ::= " + result += "value" type_list_rules = [] return result, type_list_rules else: @@ -124,9 +128,11 @@ def get_members_structure(cls, rule_name): result = f'{rule_name} ::= "{{"' type_list_rules = [] # Modify this comprehension too - members = [f' \"\\\"{name}\\\"\" ":" {map_pydantic_type_to_gbnf(param.annotation)}' - for name, param in parameters.items() - if name != 'self' and param.annotation != inspect.Parameter.empty] + members = [ + f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param.annotation)}' + for name, param in parameters.items() + if name != "self" and param.annotation != inspect.Parameter.empty + ] result += '", "'.join(members) result +=
' "}"' @@ -141,8 +147,8 @@ def regex_to_gbnf(regex_pattern: str) -> str: gbnf_rule = regex_pattern # Translate common regex components to GBNF - gbnf_rule = gbnf_rule.replace('\\d', '[0-9]') - gbnf_rule = gbnf_rule.replace('\\s', '[ \t\n]') + gbnf_rule = gbnf_rule.replace("\\d", "[0-9]") + gbnf_rule = gbnf_rule.replace("\\s", "[ \t\n]") # Handle quantifiers and other regex syntax that is similar in GBNF # (e.g., '*', '+', '?', character classes) @@ -158,12 +164,12 @@ def generate_gbnf_integer_rules(max_digit=None, min_digit=None): Generates GBNF (Generalized Backus-Naur Form) rules for integers based on the given maximum and minimum digits. Parameters: - max_digit (int): The maximum number of digits for the integer. Default is None. - min_digit (int): The minimum number of digits for the integer. Default is None. + max_digit (int): The maximum number of digits for the integer. Default is None. + min_digit (int): The minimum number of digits for the integer. Default is None. Returns: - integer_rule (str): The identifier for the integer rule generated. - additional_rules (list): A list of additional rules generated based on the given maximum and minimum digits. + integer_rule (str): The identifier for the integer rule generated. + additional_rules (list): A list of additional rules generated based on the given maximum and minimum digits. """ additional_rules = [] @@ -178,21 +184,21 @@ def generate_gbnf_integer_rules(max_digit=None, min_digit=None): # Handling Integer Rules if max_digit is not None or min_digit is not None: # Start with an empty rule part - integer_rule_part = '' + integer_rule_part = "" # Add mandatory digits as per min_digit if min_digit is not None: - integer_rule_part += '[0-9] ' * min_digit + integer_rule_part += "[0-9] " * min_digit # Add optional digits up to max_digit if max_digit is not None: optional_digits = max_digit - (min_digit if min_digit is not None else 0) - integer_rule_part += ''.join(['[0-9]? ' for _ in range(optional_digits)]) + integer_rule_part += "".join(["[0-9]? 
" for _ in range(optional_digits)]) # Trim the rule part and append it to additional rules integer_rule_part = integer_rule_part.strip() if integer_rule_part: - additional_rules.append(f'{integer_rule} ::= {integer_rule_part}') + additional_rules.append(f"{integer_rule} ::= {integer_rule_part}") return integer_rule, additional_rules @@ -224,21 +230,26 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None additional_rules = [] # Define the integer part rule - integer_part_rule = "integer-part" + (f"-max{max_digit}" if max_digit is not None else "") + ( + integer_part_rule = ( + "integer-part" + (f"-max{max_digit}" if max_digit is not None else "") + ( f"-min{min_digit}" if min_digit is not None else "") + ) # Define the fractional part rule based on precision constraints fractional_part_rule = "fractional-part" - fractional_rule_part = '' + fractional_rule_part = "" if max_precision is not None or min_precision is not None: fractional_part_rule += (f"-max{max_precision}" if max_precision is not None else "") + ( - f"-min{min_precision}" if min_precision is not None else "") + f"-min{min_precision}" if min_precision is not None else "" + ) # Minimum number of digits - fractional_rule_part = '[0-9]' * (min_precision if min_precision is not None else 1) + fractional_rule_part = "[0-9]" * (min_precision if min_precision is not None else 1) # Optional additional digits - fractional_rule_part += ''.join([' [0-9]?'] * ( - (max_precision - (min_precision if min_precision is not None else 1)) if max_precision is not None else 0)) - additional_rules.append(f'{fractional_part_rule} ::= {fractional_rule_part}') + fractional_rule_part += "".join( + [" [0-9]?"] * ((max_precision - ( + min_precision if min_precision is not None else 1)) if max_precision is not None else 0) + ) + additional_rules.append(f"{fractional_part_rule} ::= {fractional_rule_part}") # Define the float rule float_rule = f"float-{max_digit if max_digit is not None else 'X'}-{min_digit if min_digit is not None else 'X'}-{max_precision if max_precision is not None else 'X'}-{min_precision if min_precision is not None else 'X'}" @@ -246,20 +257,19 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None # Generating the integer part rule definition, if necessary if max_digit is not None or min_digit is not None: - integer_rule_part = '[0-9]' + integer_rule_part = "[0-9]" if min_digit is not None and min_digit > 1: - integer_rule_part += ' [0-9]' * (min_digit - 1) + integer_rule_part += " [0-9]" * (min_digit - 1) if max_digit is not None: - integer_rule_part += ''.join([' [0-9]?'] * (max_digit - (min_digit if min_digit is not None else 1))) - additional_rules.append(f'{integer_part_rule} ::= {integer_rule_part.strip()}') + integer_rule_part += "".join([" [0-9]?"] * (max_digit - (min_digit if min_digit is not None else 1))) + additional_rules.append(f"{integer_part_rule} ::= {integer_rule_part.strip()}") return float_rule, additional_rules -def generate_gbnf_rule_for_type(model_name, field_name, - field_type, is_optional, processed_models, created_rules, - field_info=None) -> \ - Tuple[str, list]: +def generate_gbnf_rule_for_type( + model_name, field_name, field_type, is_optional, processed_models, created_rules, field_info=None +) -> Tuple[str, list]: """ Generate GBNF rule for a given field type. 
@@ -282,20 +292,19 @@ def generate_gbnf_rule_for_type(model_name, field_name, if isclass(field_type) and issubclass(field_type, BaseModel): nested_model_name = format_model_and_field_name(field_type.__name__) - nested_model_rules = generate_gbnf_grammar(field_type, processed_models, created_rules) + nested_model_rules, _ = generate_gbnf_grammar(field_type, processed_models, created_rules) rules.extend(nested_model_rules) gbnf_type, rules = nested_model_name, rules elif isclass(field_type) and issubclass(field_type, Enum): - enum_values = [f'\"\\\"{e.value}\\\"\"' for e in field_type] # Adding escaped quotes + enum_values = [f'"\\"{e.value}\\""' for e in field_type] # Adding escaped quotes enum_rule = f"{model_name}-{field_name} ::= {' | '.join(enum_values)}" rules.append(enum_rule) gbnf_type, rules = model_name + "-" + field_name, rules - elif get_origin(field_type) == list or field_type == list: # Array + elif get_origin(field_type) == list: # Array element_type = get_args(field_type)[0] - element_rule_name, additional_rules = generate_gbnf_rule_for_type(model_name, - f"{field_name}-element", - element_type, is_optional, processed_models, - created_rules) + element_rule_name, additional_rules = generate_gbnf_rule_for_type( + model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules + ) rules.extend(additional_rules) array_rule = f"""{model_name}-{field_name} ::= "[" ws {element_rule_name} ("," ws {element_rule_name})* "]" """ rules.append(array_rule) @@ -303,10 +312,9 @@ def generate_gbnf_rule_for_type(model_name, field_name, elif get_origin(field_type) == set or field_type == set: # Array element_type = get_args(field_type)[0] - element_rule_name, additional_rules = generate_gbnf_rule_for_type(model_name, - f"{field_name}-element", - element_type, is_optional, processed_models, - created_rules) + element_rule_name, additional_rules = generate_gbnf_rule_for_type( + model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules + ) rules.extend(additional_rules) array_rule = f"""{model_name}-{field_name} ::= "[" ws {element_rule_name} ("," ws {element_rule_name})* "]" """ rules.append(array_rule) @@ -318,15 +326,13 @@ def generate_gbnf_rule_for_type(model_name, field_name, elif gbnf_type.startswith("custom-dict-"): key_type, value_type = get_args(field_type) - additional_key_type, additional_key_rules = generate_gbnf_rule_for_type(model_name, - f"{field_name}-key-type", - key_type, is_optional, processed_models, - created_rules) - additional_value_type, additional_value_rules = generate_gbnf_rule_for_type(model_name, - f"{field_name}-value-type", - value_type, is_optional, - processed_models, created_rules) - gbnf_type = fr'{gbnf_type} ::= "{{" ( {additional_key_type} ":" {additional_value_type} ("," {additional_key_type} ":" {additional_value_type})* )? "}}" ' + additional_key_type, additional_key_rules = generate_gbnf_rule_for_type( + model_name, f"{field_name}-key-type", key_type, is_optional, processed_models, created_rules + ) + additional_value_type, additional_value_rules = generate_gbnf_rule_for_type( + model_name, f"{field_name}-value-type", value_type, is_optional, processed_models, created_rules + ) + gbnf_type = rf'{gbnf_type} ::= "{{" ( {additional_key_type} ": " {additional_value_type} ("," "\n" ws {additional_key_type} ":" {additional_value_type})* )? 
"}}" ' rules.extend(additional_key_rules) rules.extend(additional_value_rules) @@ -336,19 +342,16 @@ def generate_gbnf_rule_for_type(model_name, field_name, for union_type in union_types: if isinstance(union_type, _GenericAlias): - union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(model_name, - field_name, union_type, - False, - processed_models, created_rules) + union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type( + model_name, field_name, union_type, False, processed_models, created_rules + ) union_rules.append(union_gbnf_type) rules.extend(union_rules_list) - elif not issubclass(union_type, NoneType): - union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(model_name, - field_name, union_type, - False, - processed_models, created_rules) + union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type( + model_name, field_name, union_type, False, processed_models, created_rules + ) union_rules.append(union_gbnf_type) rules.extend(union_rules_list) @@ -363,45 +366,58 @@ def generate_gbnf_rule_for_type(model_name, field_name, else: gbnf_type = f"{model_name}-{field_name}-union" elif isclass(field_type) and issubclass(field_type, str): - if field_info and hasattr(field_info, 'json_schema_extra') and field_info.json_schema_extra is not None: - - triple_quoted_string = field_info.json_schema_extra.get('triple_quoted_string', False) - markdown_string = field_info.json_schema_extra.get('markdown_string', False) + if field_info and hasattr(field_info, "json_schema_extra") and field_info.json_schema_extra is not None: + triple_quoted_string = field_info.json_schema_extra.get("triple_quoted_string", False) + markdown_string = field_info.json_schema_extra.get("markdown_code_block", False) gbnf_type = PydanticDataType.TRIPLE_QUOTED_STRING.value if triple_quoted_string else PydanticDataType.STRING.value - gbnf_type = PydanticDataType.MARKDOWN_STRING.value if markdown_string else gbnf_type + gbnf_type = PydanticDataType.MARKDOWN_CODE_BLOCK.value if markdown_string else gbnf_type - elif field_info and hasattr(field_info, 'pattern'): + elif field_info and hasattr(field_info, "pattern"): # Convert regex pattern to grammar rule regex_pattern = field_info.regex.pattern gbnf_type = f"pattern-{field_name} ::= {regex_to_gbnf(regex_pattern)}" else: gbnf_type = PydanticDataType.STRING.value - elif isclass(field_type) and issubclass(field_type, float) and field_info and hasattr(field_info, - 'json_schema_extra') and field_info.json_schema_extra is not None: + elif ( + isclass(field_type) + and issubclass(field_type, float) + and field_info + and hasattr(field_info, "json_schema_extra") + and field_info.json_schema_extra is not None + ): # Retrieve precision attributes for floats - max_precision = field_info.json_schema_extra.get('max_precision') if field_info and hasattr(field_info, - 'json_schema_extra') else None - min_precision = field_info.json_schema_extra.get('min_precision') if field_info and hasattr(field_info, - 'json_schema_extra') else None - max_digits = field_info.json_schema_extra.get('max_digit') if field_info and hasattr(field_info, - 'json_schema_extra') else None - min_digits = field_info.json_schema_extra.get('min_digit') if field_info and hasattr(field_info, - 'json_schema_extra') else None + max_precision = ( + field_info.json_schema_extra.get("max_precision") if field_info and hasattr(field_info, + "json_schema_extra") else None + ) + min_precision = ( + field_info.json_schema_extra.get("min_precision") if field_info and hasattr(field_info, + 
"json_schema_extra") else None + ) + max_digits = field_info.json_schema_extra.get("max_digit") if field_info and hasattr(field_info, + "json_schema_extra") else None + min_digits = field_info.json_schema_extra.get("min_digit") if field_info and hasattr(field_info, + "json_schema_extra") else None # Generate GBNF rule for float with given attributes - gbnf_type, rules = generate_gbnf_float_rules(max_digit=max_digits, min_digit=min_digits, - max_precision=max_precision, - min_precision=min_precision) + gbnf_type, rules = generate_gbnf_float_rules( + max_digit=max_digits, min_digit=min_digits, max_precision=max_precision, min_precision=min_precision + ) - elif isclass(field_type) and issubclass(field_type, int) and field_info and hasattr(field_info, - 'json_schema_extra') and field_info.json_schema_extra is not None: + elif ( + isclass(field_type) + and issubclass(field_type, int) + and field_info + and hasattr(field_info, "json_schema_extra") + and field_info.json_schema_extra is not None + ): # Retrieve digit attributes for integers - max_digits = field_info.json_schema_extra.get('max_digit') if field_info and hasattr(field_info, - 'json_schema_extra') else None - min_digits = field_info.json_schema_extra.get('min_digit') if field_info and hasattr(field_info, - 'json_schema_extra') else None + max_digits = field_info.json_schema_extra.get("max_digit") if field_info and hasattr(field_info, + "json_schema_extra") else None + min_digits = field_info.json_schema_extra.get("min_digit") if field_info and hasattr(field_info, + "json_schema_extra") else None # Generate GBNF rule for integer with given attributes gbnf_type, rules = generate_gbnf_integer_rules(max_digit=max_digits, min_digit=min_digits) @@ -443,13 +459,13 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created if not issubclass(model, BaseModel): # For non-Pydantic classes, generate model_fields from __annotations__ or __init__ - if hasattr(model, '__annotations__') and model.__annotations__: + if hasattr(model, "__annotations__") and model.__annotations__: model_fields = {name: (typ, ...) 
for name, typ in model.__annotations__.items()} else: init_signature = inspect.signature(model.__init__) parameters = init_signature.parameters - model_fields = {name: (param.annotation, param.default) for name, param in parameters.items() - if name != 'self'} + model_fields = {name: (param.annotation, param.default) for name, param in parameters.items() if + name != "self"} else: # For Pydantic models, use model_fields and check for ellipsis (required fields) model_fields = model.__annotations__ @@ -469,51 +485,55 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created field_type = field_info field_info = model.model_fields[field_name] is_optional = field_info.is_required is False and get_origin(field_type) is Optional - rule_name, additional_rules = generate_gbnf_rule_for_type(model_name, - format_model_and_field_name(field_name), - field_type, is_optional, - processed_models, created_rules, field_info) - look_for_markdown_code_block = True if rule_name == "markdown_string" else False + rule_name, additional_rules = generate_gbnf_rule_for_type( + model_name, format_model_and_field_name(field_name), field_type, is_optional, processed_models, + created_rules, field_info + ) + look_for_markdown_code_block = True if rule_name == "markdown_code_block" else False look_for_triple_quoted_string = True if rule_name == "triple_quoted_string" else False if not look_for_markdown_code_block and not look_for_triple_quoted_string: if rule_name not in created_rules: created_rules[rule_name] = additional_rules - model_rule_parts.append(f' ws \"\\\"{field_name}\\\"\" ": " {rule_name}') # Adding escaped quotes + model_rule_parts.append(f' ws "\\"{field_name}\\"" ":" ws {rule_name}') # Adding escaped quotes nested_rules.extend(additional_rules) else: - has_triple_quoted_string = look_for_markdown_code_block - has_markdown_code_block = look_for_triple_quoted_string + has_triple_quoted_string = look_for_triple_quoted_string + has_markdown_code_block = look_for_markdown_code_block fields_joined = r' "," "\n" '.join(model_rule_parts) - model_rule = fr'{model_name} ::= "{{" "\n" {fields_joined} "\n" ws "}}"' - - if look_for_markdown_code_block or look_for_triple_quoted_string: - model_rule += ' ws "}"' + model_rule = rf'{model_name} ::= "{{" "\n" {fields_joined} "\n" ws "}}"' + has_special_string = False if has_triple_quoted_string: + model_rule += '"\\n" ws "}"' model_rule += '"\\n" triple-quoted-string' + has_special_string = True if has_markdown_code_block: + model_rule += '"\\n" ws "}"' model_rule += '"\\n" markdown-code-block' + has_special_string = True all_rules = [model_rule] + nested_rules - return all_rules, has_markdown_code_block, has_triple_quoted_string + return all_rules, has_special_string -def generate_gbnf_grammar_from_pydantic_models(models: List[Type[BaseModel]], outer_object_name: str = None, - outer_object_content: str = None, list_of_outputs: bool = False) -> str: +def generate_gbnf_grammar_from_pydantic_models( + models: List[Type[BaseModel]], outer_object_name: str = None, outer_object_content: str = None, + list_of_outputs: bool = False +) -> str: """ Generate GBNF Grammar from Pydantic Models. This method takes a list of Pydantic models and uses them to generate a GBNF grammar string. The generated grammar string can be used for parsing and validating data using the generated * grammar. - Parameters: - models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from. - outer_object_name (str): Outer object name for the GBNF grammar. 
If None, no outer object will be generated. Eg. "function" for function calling.
-    outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
-    list_of_outputs (str, optional): Allows a list of output objects
+    Args:
+        models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from.
+        outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. E.g. "function" for function calling.
+        outer_object_content (str): Content for the outer rule in the GBNF grammar. E.g. "function_parameters" or "params" for function calling.
+        list_of_outputs (bool, optional): Whether to allow a list of output objects.
 
     Returns:
-    str: The generated GBNF grammar string.
+        str: The generated GBNF grammar string.
 
     Examples:
         models = [UserModel, PostModel]
@@ -527,52 +547,53 @@ def generate_gbnf_grammar_from_pydantic_models(models: List[Type[BaseModel]], ou
     all_rules = []
     created_rules = {}
     if outer_object_name is None:
-
         for model in models:
-            model_rules, _, _ = generate_gbnf_grammar(model,
-                                                      processed_models, created_rules)
+            model_rules, _ = generate_gbnf_grammar(model, processed_models, created_rules)
             all_rules.extend(model_rules)
 
         if list_of_outputs:
-            root_rule = r'root ::= ws "[" grammar-models ("," grammar-models)* "]"' + "\n"
+            root_rule = r'root ::= (" "| "\n") "[" ws grammar-models ("," ws grammar-models)* ws "]"' + "\n"
        else:
-            root_rule = r'root ::= ws grammar-models' + "\n"
+            root_rule = r'root ::= (" "| "\n") grammar-models' + "\n"
         root_rule += "grammar-models ::= " + " | ".join(
             [format_model_and_field_name(model.__name__) for model in models])
         all_rules.insert(0, root_rule)
         return "\n".join(all_rules)
     elif outer_object_name is not None:
         if list_of_outputs:
-            root_rule = fr'root ::= ws "[" {format_model_and_field_name(outer_object_name)} ("," {format_model_and_field_name(outer_object_name)})* "]"' + "\n"
+            root_rule = (
+                rf'root ::= (" "| "\n") "[" ws {format_model_and_field_name(outer_object_name)} ("," ws {format_model_and_field_name(outer_object_name)})* ws "]"'
+                + "\n"
+            )
         else:
             root_rule = f"root ::= {format_model_and_field_name(outer_object_name)}\n"
 
-        model_rule = fr'{format_model_and_field_name(outer_object_name)} ::= ws "{{" ws "\"{outer_object_name}\"" ": " grammar-models'
+        model_rule = (
+            rf'{format_model_and_field_name(outer_object_name)} ::= (" "| "\n") "{{" ws "\"{outer_object_name}\"" ":" ws grammar-models'
+        )
 
         fields_joined = " | ".join(
-            [fr'{format_model_and_field_name(model.__name__)}-grammar-model' for model in models])
+            [rf"{format_model_and_field_name(model.__name__)}-grammar-model" for model in models])
 
-        grammar_model_rules = f'\ngrammar-models ::= {fields_joined}'
+        grammar_model_rules = f"\ngrammar-models ::= {fields_joined}"
         mod_rules = []
         for model in models:
-            mod_rule = fr'{format_model_and_field_name(model.__name__)}-grammar-model ::= ws'
-            mod_rule += fr'"\"{format_model_and_field_name(model.__name__)}\"" "," ws "\"{outer_object_content}\"" ws ":" ws {format_model_and_field_name(model.__name__)}' + '\n'
+            mod_rule = rf"{format_model_and_field_name(model.__name__)}-grammar-model ::= "
+            mod_rule += (
+                rf'"\"{model.__name__}\"" "," ws "\"{outer_object_content}\"" ":" ws {format_model_and_field_name(model.__name__)}' + "\n"
+            )
            mod_rules.append(mod_rule)
         grammar_model_rules += "\n" + "\n".join(mod_rules)
-        look_for_markdown_code_block = False
-        look_for_triple_quoted_string = False
+
         for model in models:
-            model_rules, markdown_block,
triple_quoted_string = generate_gbnf_grammar(model, - processed_models, created_rules) + model_rules, has_special_string = generate_gbnf_grammar(model, processed_models, + created_rules) + + if not has_special_string: + model_rules[0] += r'"\n" ws "}"' + all_rules.extend(model_rules) - if markdown_block: - look_for_markdown_code_block = True - if triple_quoted_string: - look_for_triple_quoted_string = True - - if not look_for_markdown_code_block and not look_for_triple_quoted_string: - model_rule += ' ws "}"' all_rules.insert(0, root_rule + model_rule + grammar_model_rules) return "\n".join(all_rules) @@ -582,10 +603,10 @@ def get_primitive_grammar(grammar): Returns the needed GBNF primitive grammar for a given GBNF grammar string. Args: - grammar (str): The string containing the GBNF grammar. + grammar (str): The string containing the GBNF grammar. Returns: - str: GBNF primitive grammar string. + str: GBNF primitive grammar string. """ type_list = [] if "string-list" in grammar: @@ -611,7 +632,7 @@ integer ::= [0-9]+""" any_block = "" if "custom-class-any" in grammar: - any_block = ''' + any_block = """ value ::= object | array | string | number | boolean | null object ::= @@ -626,7 +647,7 @@ array ::= ("," ws value)* )? "]" ws -number ::= integer | float''' +number ::= integer | float""" markdown_code_block_grammar = "" if "markdown-code-block" in grammar: @@ -641,90 +662,32 @@ closing-triple-ticks ::= "```" "\n"''' triple-quoted-string ::= triple-quotes triple-quoted-string-content triple-quotes triple-quoted-string-content ::= ( [^'] | "'" [^'] | "'" "'" [^'] )* triple-quotes ::= "'''" """ - return "\n" + '\n'.join(additional_grammar) + any_block + primitive_grammar + markdown_code_block_grammar + return "\n" + "\n".join(additional_grammar) + any_block + primitive_grammar + markdown_code_block_grammar -def generate_field_markdown(field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1) -> str: - indent = ' ' * depth - field_markdown = f"{indent}- **{field_name}** (`{field_type.__name__}`): " - - # Extracting field description from Pydantic Field using __model_fields__ - field_info = model.model_fields.get(field_name) - field_description = field_info.description if field_info and field_info.description else "No description available." - - field_markdown += field_description + '\n' - - # Handling nested BaseModel fields - if isclass(field_type) and issubclass(field_type, BaseModel): - field_markdown += f"{indent} - Details:\n" - for name, type_ in field_type.__annotations__.items(): - field_markdown += generate_field_markdown(name, type_, field_type, depth + 2) - - return field_markdown - - -def generate_markdown_report(pydantic_models: List[Type[BaseModel]]) -> str: - markdown = "" - for model in pydantic_models: - markdown += f"### {format_model_and_field_name(model.__name__)}\n" - - # Check if the model's docstring is different from BaseModel's docstring - class_doc = getdoc(model) - base_class_doc = getdoc(BaseModel) - class_description = class_doc if class_doc and class_doc != base_class_doc else "No specific description available." 
-
-        markdown += f"{class_description}\n\n"
-        markdown += "#### Fields\n"
-
-        if isclass(model) and issubclass(model, BaseModel):
-            for name, field_type in model.__annotations__.items():
-                markdown += generate_field_markdown(format_model_and_field_name(name), field_type, model)
-        markdown += "\n"
-
-    return markdown
-
-
-def format_json_example(example: dict, depth: int) -> str:
+def generate_markdown_documentation(
+    pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
+    documentation_with_field_description=True
+) -> str:
     """
-    Format a JSON example into a readable string with indentation.
+    Generate markdown documentation for a list of Pydantic models.
 
     Args:
-    example (dict): JSON example to be formatted.
-    depth (int): Indentation depth.
+        pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
+        model_prefix (str): Prefix for the model section.
+        fields_prefix (str): Prefix for the fields section.
+        documentation_with_field_description (bool): Include field descriptions in the documentation.
 
     Returns:
-    str: Formatted JSON example string.
+        str: Generated markdown documentation.
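+
+    Example:
+        # Illustrative sketch; ``Book`` stands in for any Pydantic model defined elsewhere.
+        docs = generate_markdown_documentation([Book], model_prefix="Model", fields_prefix="Fields")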
""" documentation = "" pyd_models = [(model, True) for model in pydantic_models] for model, add_prefix in pyd_models: if add_prefix: - documentation += f"{model_prefix}: {format_model_and_field_name(model.__name__)}\n" + documentation += f"{model_prefix}: {model.__name__}\n" else: - documentation += f"Model: {format_model_and_field_name(model.__name__)}\n" + documentation += f"Model: {model.__name__}\n" # Handling multi-line model description with proper indentation @@ -733,7 +696,7 @@ def generate_text_documentation(pydantic_models: List[Type[BaseModel]], model_pr class_description = class_doc if class_doc and class_doc != base_class_doc else "" if class_description != "": documentation += " Description: " - documentation += "\n" + format_multiline_description(class_description, 2) + "\n" + documentation += format_multiline_description(class_description, 0) + "\n" if add_prefix: # Indenting the fields section @@ -753,35 +716,192 @@ def generate_text_documentation(pydantic_models: List[Type[BaseModel]], model_pr for element_type in element_types: if isclass(element_type) and issubclass(element_type, BaseModel): pyd_models.append((element_type, False)) - documentation += generate_field_text(name, field_type, model, - documentation_with_field_description=documentation_with_field_description) + documentation += generate_field_markdown( + name, field_type, model, documentation_with_field_description=documentation_with_field_description + ) documentation += "\n" - if hasattr(model, 'Config') and hasattr(model.Config, - 'json_schema_extra') and 'example' in model.Config.json_schema_extra: + if hasattr(model, "Config") and hasattr(model.Config, + "json_schema_extra") and "example" in model.Config.json_schema_extra: documentation += f" Expected Example Output for {format_model_and_field_name(model.__name__)}:\n" - json_example = json.dumps(model.Config.json_schema_extra['example']) + json_example = json.dumps(model.Config.json_schema_extra["example"]) documentation += format_multiline_description(json_example, 2) + "\n" return documentation -def generate_field_text(field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1, - documentation_with_field_description=True) -> str: +def generate_field_markdown( + field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1, + documentation_with_field_description=True +) -> str: + """ + Generate markdown documentation for a Pydantic model field. + + Args: + field_name (str): Name of the field. + field_type (Type[Any]): Type of the field. + model (Type[BaseModel]): Pydantic model class. + depth (int): Indentation depth in the documentation. + documentation_with_field_description (bool): Include field descriptions in the documentation. + + Returns: + str: Generated text documentation for the field. 
+ """ + indent = " " * depth + + field_info = model.model_fields.get(field_name) + field_description = field_info.description if field_info and field_info.description else "" + + if get_origin(field_type) == list: + element_type = get_args(field_type)[0] + field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)} of {format_model_and_field_name(element_type.__name__)})" + if field_description != "": + field_text += ":\n" + else: + field_text += "\n" + elif get_origin(field_type) == Union: + element_types = get_args(field_type) + types = [] + for element_type in element_types: + types.append(format_model_and_field_name(element_type.__name__)) + field_text = f"{indent}{field_name} ({' or '.join(types)})" + if field_description != "": + field_text += ":\n" + else: + field_text += "\n" + else: + field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)})" + if field_description != "": + field_text += ":\n" + else: + field_text += "\n" + + if not documentation_with_field_description: + return field_text + + if field_description != "": + field_text += f" Description: " + field_description + "\n" + + # Check for and include field-specific examples if available + if hasattr(model, "Config") and hasattr(model.Config, + "json_schema_extra") and "example" in model.Config.json_schema_extra: + field_example = model.Config.json_schema_extra["example"].get(field_name) + if field_example is not None: + example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example + field_text += f"{indent} Example: {example_text}\n" + + if isclass(field_type) and issubclass(field_type, BaseModel): + field_text += f"{indent} Details:\n" + for name, type_ in field_type.__annotations__.items(): + field_text += generate_field_markdown(name, type_, field_type, depth + 2) + + return field_text + + +def format_json_example(example: dict, depth: int) -> str: + """ + Format a JSON example into a readable string with indentation. + + Args: + example (dict): JSON example to be formatted. + depth (int): Indentation depth. + + Returns: + str: Formatted JSON example string. + """ + indent = " " * depth + formatted_example = "{\n" + for key, value in example.items(): + value_text = f"'{value}'" if isinstance(value, str) else value + formatted_example += f"{indent}{key}: {value_text},\n" + formatted_example = formatted_example.rstrip(",\n") + "\n" + indent + "}" + return formatted_example + + +def generate_text_documentation( + pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields", + documentation_with_field_description=True +) -> str: + """ + Generate text documentation for a list of Pydantic models. + + Args: + pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes. + model_prefix (str): Prefix for the model section. + fields_prefix (str): Prefix for the fields section. + documentation_with_field_description (bool): Include field descriptions in the documentation. + + Returns: + str: Generated text documentation. 
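+
+    Example:
+        # Illustrative; ``Book`` stands in for any Pydantic model.
+        text_docs = generate_text_documentation([Book], model_prefix="Output Model", fields_prefix="Output Fields")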
+ """ + documentation = "" + pyd_models = [(model, True) for model in pydantic_models] + for model, add_prefix in pyd_models: + if add_prefix: + documentation += f"{model_prefix}: {model.__name__}\n" + else: + documentation += f"Model: {model.__name__}\n" + + # Handling multi-line model description with proper indentation + + class_doc = getdoc(model) + base_class_doc = getdoc(BaseModel) + class_description = class_doc if class_doc and class_doc != base_class_doc else "" + if class_description != "": + documentation += " Description: " + documentation += "\n" + format_multiline_description(class_description, 2) + "\n" + + if isclass(model) and issubclass(model, BaseModel): + documentation_fields = "" + for name, field_type in model.__annotations__.items(): + # if name == "markdown_code_block": + # continue + if get_origin(field_type) == list: + element_type = get_args(field_type)[0] + if isclass(element_type) and issubclass(element_type, BaseModel): + pyd_models.append((element_type, False)) + if get_origin(field_type) == Union: + element_types = get_args(field_type) + for element_type in element_types: + if isclass(element_type) and issubclass(element_type, BaseModel): + pyd_models.append((element_type, False)) + documentation_fields += generate_field_text( + name, field_type, model, documentation_with_field_description=documentation_with_field_description + ) + if documentation_fields != "": + if add_prefix: + documentation += f" {fields_prefix}:\n{documentation_fields}" + else: + documentation += f" Fields:\n{documentation_fields}" + documentation += "\n" + + if hasattr(model, "Config") and hasattr(model.Config, + "json_schema_extra") and "example" in model.Config.json_schema_extra: + documentation += f" Expected Example Output for {format_model_and_field_name(model.__name__)}:\n" + json_example = json.dumps(model.Config.json_schema_extra["example"]) + documentation += format_multiline_description(json_example, 2) + "\n" + + return documentation + + +def generate_field_text( + field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1, + documentation_with_field_description=True +) -> str: """ Generate text documentation for a Pydantic model field. Args: - field_name (str): Name of the field. - field_type (Type[Any]): Type of the field. - model (Type[BaseModel]): Pydantic model class. - depth (int): Indentation depth in the documentation. - documentation_with_field_description (bool): Include field descriptions in the documentation. + field_name (str): Name of the field. + field_type (Type[Any]): Type of the field. + model (Type[BaseModel]): Pydantic model class. + depth (int): Indentation depth in the documentation. + documentation_with_field_description (bool): Include field descriptions in the documentation. Returns: - str: Generated text documentation for the field. + str: Generated text documentation for the field. 
""" - indent = ' ' * depth + indent = " " * depth field_info = model.model_fields.get(field_name) field_description = field_info.description if field_info and field_info.description else "" @@ -817,9 +937,9 @@ def generate_field_text(field_name: str, field_type: Type[Any], model: Type[Base field_text += f"{indent} Description: " + field_description + "\n" # Check for and include field-specific examples if available - if hasattr(model, 'Config') and hasattr(model.Config, - 'json_schema_extra') and 'example' in model.Config.json_schema_extra: - field_example = model.Config.json_schema_extra['example'].get(field_name) + if hasattr(model, "Config") and hasattr(model.Config, + "json_schema_extra") and "example" in model.Config.json_schema_extra: + field_example = model.Config.json_schema_extra["example"].get(field_name) if field_example is not None: example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example field_text += f"{indent} Example: {example_text}\n" @@ -837,39 +957,40 @@ def format_multiline_description(description: str, indent_level: int) -> str: Format a multiline description with proper indentation. Args: - description (str): Multiline description. - indent_level (int): Indentation level. + description (str): Multiline description. + indent_level (int): Indentation level. Returns: - str: Formatted multiline description. + str: Formatted multiline description. """ - indent = ' ' * indent_level - return indent + description.replace('\n', '\n' + indent) + indent = " " * indent_level + return indent + description.replace("\n", "\n" + indent) -def save_gbnf_grammar_and_documentation(grammar, documentation, grammar_file_path="./grammar.gbnf", - documentation_file_path="./grammar_documentation.md"): +def save_gbnf_grammar_and_documentation( + grammar, documentation, grammar_file_path="./grammar.gbnf", documentation_file_path="./grammar_documentation.md" +): """ Save GBNF grammar and documentation to specified files. Args: - grammar (str): GBNF grammar string. - documentation (str): Documentation string. - grammar_file_path (str): File path to save the GBNF grammar. - documentation_file_path (str): File path to save the documentation. + grammar (str): GBNF grammar string. + documentation (str): Documentation string. + grammar_file_path (str): File path to save the GBNF grammar. + documentation_file_path (str): File path to save the documentation. Returns: - None + None """ try: - with open(grammar_file_path, 'w') as file: + with open(grammar_file_path, "w") as file: file.write(grammar + get_primitive_grammar(grammar)) print(f"Grammar successfully saved to {grammar_file_path}") except IOError as e: print(f"An error occurred while saving the grammar file: {e}") try: - with open(documentation_file_path, 'w') as file: + with open(documentation_file_path, "w") as file: file.write(documentation) print(f"Documentation successfully saved to {documentation_file_path}") except IOError as e: @@ -881,10 +1002,10 @@ def remove_empty_lines(string): Remove empty lines from a string. Args: - string (str): Input string. + string (str): Input string. Returns: - str: String with empty lines removed. + str: String with empty lines removed. 
""" lines = string.splitlines() non_empty_lines = [line for line in lines if line.strip() != ""] @@ -892,95 +1013,109 @@ def remove_empty_lines(string): return string_no_empty_lines -def generate_and_save_gbnf_grammar_and_documentation(pydantic_model_list, - grammar_file_path="./generated_grammar.gbnf", - documentation_file_path="./generated_grammar_documentation.md", - outer_object_name: str = None, - outer_object_content: str = None, - model_prefix: str = "Output Model", - fields_prefix: str = "Output Fields", - list_of_outputs: bool = False, - documentation_with_field_description=True): +def generate_and_save_gbnf_grammar_and_documentation( + pydantic_model_list, + grammar_file_path="./generated_grammar.gbnf", + documentation_file_path="./generated_grammar_documentation.md", + outer_object_name: str = None, + outer_object_content: str = None, + model_prefix: str = "Output Model", + fields_prefix: str = "Output Fields", + list_of_outputs: bool = False, + documentation_with_field_description=True, +): """ Generate GBNF grammar and documentation, and save them to specified files. Args: - pydantic_model_list: List of Pydantic model classes. - grammar_file_path (str): File path to save the generated GBNF grammar. - documentation_file_path (str): File path to save the generated documentation. - outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. - outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. - model_prefix (str): Prefix for the model section in the documentation. - fields_prefix (str): Prefix for the fields section in the documentation. - list_of_outputs (bool): Whether the output is a list of items. - documentation_with_field_description (bool): Include field descriptions in the documentation. + pydantic_model_list: List of Pydantic model classes. + grammar_file_path (str): File path to save the generated GBNF grammar. + documentation_file_path (str): File path to save the generated documentation. + outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. + outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. + model_prefix (str): Prefix for the model section in the documentation. + fields_prefix (str): Prefix for the fields section in the documentation. + list_of_outputs (bool): Whether the output is a list of items. + documentation_with_field_description (bool): Include field descriptions in the documentation. 
Returns: - None + None """ - documentation = generate_text_documentation(pydantic_model_list, model_prefix, fields_prefix, - documentation_with_field_description=documentation_with_field_description) - grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, - outer_object_content, list_of_outputs) + documentation = generate_markdown_documentation( + pydantic_model_list, model_prefix, fields_prefix, + documentation_with_field_description=documentation_with_field_description + ) + grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, outer_object_content, + list_of_outputs) grammar = remove_empty_lines(grammar) save_gbnf_grammar_and_documentation(grammar, documentation, grammar_file_path, documentation_file_path) -def generate_gbnf_grammar_and_documentation(pydantic_model_list, outer_object_name: str = None, - outer_object_content: str = None, - model_prefix: str = "Output Model", - fields_prefix: str = "Output Fields", list_of_outputs: bool = False, - documentation_with_field_description=True): +def generate_gbnf_grammar_and_documentation( + pydantic_model_list, + outer_object_name: str = None, + outer_object_content: str = None, + model_prefix: str = "Output Model", + fields_prefix: str = "Output Fields", + list_of_outputs: bool = False, + documentation_with_field_description=True, +): """ Generate GBNF grammar and documentation for a list of Pydantic models. Args: - pydantic_model_list: List of Pydantic model classes. - outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. - outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. - model_prefix (str): Prefix for the model section in the documentation. - fields_prefix (str): Prefix for the fields section in the documentation. - list_of_outputs (bool): Whether the output is a list of items. - documentation_with_field_description (bool): Include field descriptions in the documentation. + pydantic_model_list: List of Pydantic model classes. + outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. + outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. + model_prefix (str): Prefix for the model section in the documentation. + fields_prefix (str): Prefix for the fields section in the documentation. + list_of_outputs (bool): Whether the output is a list of items. + documentation_with_field_description (bool): Include field descriptions in the documentation. Returns: - tuple: GBNF grammar string, documentation string. + tuple: GBNF grammar string, documentation string. 
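+
+    Example:
+        # Illustrative; ``Book`` stands in for any Pydantic model.
+        grammar, docs = generate_gbnf_grammar_and_documentation([Book])
+        # ``grammar`` already includes the primitives appended by get_primitive_grammar().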
""" - documentation = generate_text_documentation(copy(pydantic_model_list), model_prefix, fields_prefix, - documentation_with_field_description=documentation_with_field_description) - grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, - outer_object_content, list_of_outputs) + documentation = generate_markdown_documentation( + copy(pydantic_model_list), model_prefix, fields_prefix, + documentation_with_field_description=documentation_with_field_description + ) + grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, outer_object_content, + list_of_outputs) grammar = remove_empty_lines(grammar + get_primitive_grammar(grammar)) return grammar, documentation -def generate_gbnf_grammar_and_documentation_from_dictionaries(dictionaries: List[dict], - outer_object_name: str = None, - outer_object_content: str = None, - model_prefix: str = "Output Model", - fields_prefix: str = "Output Fields", - list_of_outputs: bool = False, - documentation_with_field_description=True): +def generate_gbnf_grammar_and_documentation_from_dictionaries( + dictionaries: List[dict], + outer_object_name: str = None, + outer_object_content: str = None, + model_prefix: str = "Output Model", + fields_prefix: str = "Output Fields", + list_of_outputs: bool = False, + documentation_with_field_description=True, +): """ Generate GBNF grammar and documentation from a list of dictionaries. Args: - dictionaries (List[dict]): List of dictionaries representing Pydantic models. - outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. - outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. - model_prefix (str): Prefix for the model section in the documentation. - fields_prefix (str): Prefix for the fields section in the documentation. - list_of_outputs (bool): Whether the output is a list of items. - documentation_with_field_description (bool): Include field descriptions in the documentation. + dictionaries (List[dict]): List of dictionaries representing Pydantic models. + outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. + outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. + model_prefix (str): Prefix for the model section in the documentation. + fields_prefix (str): Prefix for the fields section in the documentation. + list_of_outputs (bool): Whether the output is a list of items. + documentation_with_field_description (bool): Include field descriptions in the documentation. Returns: - tuple: GBNF grammar string, documentation string. + tuple: GBNF grammar string, documentation string. 
""" pydantic_model_list = create_dynamic_models_from_dictionaries(dictionaries) - documentation = generate_text_documentation(copy(pydantic_model_list), model_prefix, fields_prefix, - documentation_with_field_description=documentation_with_field_description) - grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, - outer_object_content, list_of_outputs) + documentation = generate_markdown_documentation( + copy(pydantic_model_list), model_prefix, fields_prefix, + documentation_with_field_description=documentation_with_field_description + ) + grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, outer_object_content, + list_of_outputs) grammar = remove_empty_lines(grammar + get_primitive_grammar(grammar)) return grammar, documentation @@ -990,41 +1125,61 @@ def create_dynamic_model_from_function(func: Callable): Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method. Args: - func (Callable): A function with type hints from which to create the model. + func (Callable): A function with type hints from which to create the model. Returns: - A dynamic Pydantic model class with the provided function as a 'run' method. + A dynamic Pydantic model class with the provided function as a 'run' method. """ - # Extracting type hints from the provided function - type_hints = get_type_hints(func) - type_hints.pop('return', None) - # Handling default values and annotations + # Get the signature of the function + sig = inspect.signature(func) + + # Parse the docstring + docstring = parse(func.__doc__) + dynamic_fields = {} - defaults = getattr(func, '__defaults__', ()) or () - defaults_index = len(type_hints) - len(defaults) + param_docs = [] + for param in sig.parameters.values(): + # Exclude 'self' parameter + if param.name == "self": + continue - for index, (name, typ) in enumerate(type_hints.items()): - if index >= defaults_index: - default_value = defaults[index - defaults_index] - dynamic_fields[name] = (typ, default_value) + # Assert that the parameter has a type annotation + if param.annotation == inspect.Parameter.empty: + raise TypeError(f"Parameter '{param.name}' in function '{func.__name__}' lacks a type annotation") + + # Find the parameter's description in the docstring + param_doc = next((d for d in docstring.params if d.arg_name == param.name), None) + + # Assert that the parameter has a description + if not param_doc or not param_doc.description: + raise ValueError( + f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring") + + # Add parameter details to the schema + param_doc = next((d for d in docstring.params if d.arg_name == param.name), None) + param_docs.append((param.name, param_doc)) + if param.default == inspect.Parameter.empty: + default_value = ... else: - dynamic_fields[name] = (typ, ...) 
- + default_value = param.default + dynamic_fields[param.name] = ( + param.annotation if param.annotation != inspect.Parameter.empty else str, default_value) # Creating the dynamic model - dynamicModel = create_model(f'{func.__name__}', **dynamic_fields) + dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) - dynamicModel.__doc__ = getdoc(func) + for param_doc in param_docs: + dynamic_model.model_fields[param_doc[0]].description = param_doc[1].description + + dynamic_model.__doc__ = docstring.short_description - # Wrapping the original function to handle instance 'self' def run_method_wrapper(self): - func_args = {name: getattr(self, name) for name in type_hints} + func_args = {name: getattr(self, name) for name, _ in dynamic_fields.items()} return func(**func_args) # Adding the wrapped function as a 'run' method - setattr(dynamicModel, 'run', run_method_wrapper) - - return dynamicModel + setattr(dynamic_model, "run", run_method_wrapper) + return dynamic_model def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable): @@ -1032,11 +1187,11 @@ def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable): Add a 'run' method to a dynamic Pydantic model, using the provided function. Args: - - model (Type[BaseModel]): Dynamic Pydantic model class. - - func (Callable): Function to be added as a 'run' method to the model. + model (Type[BaseModel]): Dynamic Pydantic model class. + func (Callable): Function to be added as a 'run' method to the model. Returns: - - Type[BaseModel]: Pydantic model class with the added 'run' method. + Type[BaseModel]: Pydantic model class with the added 'run' method. """ def run_method_wrapper(self): @@ -1044,7 +1199,7 @@ def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable): return func(**func_args) # Adding the wrapped function as a 'run' method - setattr(model, 'run', run_method_wrapper) + setattr(model, "run", run_method_wrapper) return model @@ -1054,15 +1209,15 @@ def create_dynamic_models_from_dictionaries(dictionaries: List[dict]): Create a list of dynamic Pydantic model classes from a list of dictionaries. Args: - - dictionaries (List[dict]): List of dictionaries representing model structures. + dictionaries (List[dict]): List of dictionaries representing model structures. Returns: - - List[Type[BaseModel]]: List of generated dynamic Pydantic model classes. + List[Type[BaseModel]]: List of generated dynamic Pydantic model classes. """ dynamic_models = [] for func in dictionaries: model_name = format_model_and_field_name(func.get("name", "")) - dyn_model = convert_dictionary_to_to_pydantic_model(func, model_name) + dyn_model = convert_dictionary_to_pydantic_model(func, model_name) dynamic_models.append(dyn_model) return dynamic_models @@ -1080,12 +1235,12 @@ from enum import Enum def json_schema_to_python_types(schema): type_map = { - 'any': Any, - 'string': str, - 'number': float, - 'integer': int, - 'boolean': bool, - 'array': list, + "any": Any, + "string": str, + "number": float, + "integer": int, + "boolean": bool, + "array": list, } return type_map[schema] @@ -1094,58 +1249,64 @@ def list_to_enum(enum_name, values): return Enum(enum_name, {value: value for value in values}) -def convert_dictionary_to_to_pydantic_model(dictionary: dict, model_name: str = 'CustomModel') -> Type[BaseModel]: +def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "CustomModel") -> Type[BaseModel]: """ Convert a dictionary to a Pydantic model class. 
Args: - - dictionary (dict): Dictionary representing the model structure. - - model_name (str): Name of the generated Pydantic model. + dictionary (dict): Dictionary representing the model structure. + model_name (str): Name of the generated Pydantic model. Returns: - - Type[BaseModel]: Generated Pydantic model class. + Type[BaseModel]: Generated Pydantic model class. """ fields = {} if "properties" in dictionary: for field_name, field_data in dictionary.get("properties", {}).items(): - if field_data == 'object': - submodel = convert_dictionary_to_to_pydantic_model(dictionary, f'{model_name}_{field_name}') + if field_data == "object": + submodel = convert_dictionary_to_pydantic_model(dictionary, f"{model_name}_{field_name}") fields[field_name] = (submodel, ...) else: - field_type = field_data.get('type', 'str') + field_type = field_data.get("type", "str") if field_data.get("enum", []): fields[field_name] = (list_to_enum(field_name, field_data.get("enum", [])), ...) - if field_type == "array": + elif field_type == "array": items = field_data.get("items", {}) if items != {}: array = {"properties": items} - array_type = convert_dictionary_to_to_pydantic_model(array, f'{model_name}_{field_name}_items') + array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items") fields[field_name] = (List[array_type], ...) else: fields[field_name] = (list, ...) - elif field_type == 'object': - submodel = convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}_{field_name}') + elif field_type == "object": + submodel = convert_dictionary_to_pydantic_model(field_data, f"{model_name}_{field_name}") fields[field_name] = (submodel, ...) + elif field_type == "required": + required = field_data.get("enum", []) + for key, field in fields.items(): + if key not in required: + fields[key] = (Optional[fields[key][0]], ...) else: field_type = json_schema_to_python_types(field_type) fields[field_name] = (field_type, ...) if "function" in dictionary: - for field_name, field_data in dictionary.get("function", {}).items(): if field_name == "name": model_name = field_data elif field_name == "description": fields["__doc__"] = field_data elif field_name == "parameters": - return convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}') + return convert_dictionary_to_pydantic_model(field_data, f"{model_name}") + if "parameters" in dictionary: field_data = {"function": dictionary} - return convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}') - + return convert_dictionary_to_pydantic_model(field_data, f"{model_name}") + if "required" in dictionary: + required = dictionary.get("required", []) + for key, field in fields.items(): + if key not in required: + fields[key] = (Optional[fields[key][0]], ...) custom_model = create_model(model_name, **fields) return custom_model - - -
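
Below is a minimal end-to-end sketch of the refactored API for reference. It is illustrative only: `get_weather`, the argument values, and the import path are assumptions, not part of this patch.

    from pydantic_models_to_grammar import (
        create_dynamic_model_from_function,
        generate_gbnf_grammar_and_documentation,
    )

    def get_weather(location: str, unit: str = "celsius"):
        """
        Get the current weather in a location.

        Args:
            location (str): Name of the city to query.
            unit (str): Temperature unit, e.g. "celsius".
        """
        return f"Sunny in {location} ({unit})"

    # create_dynamic_model_from_function now raises if a parameter lacks a type
    # annotation or a docstring description, so both are provided above.
    WeatherModel = create_dynamic_model_from_function(get_weather)

    # Constrain generation to WeatherModel instances, then execute the call.
    grammar, docs = generate_gbnf_grammar_and_documentation([WeatherModel])
    result = WeatherModel(location="Berlin").run()  # -> get_weather(location="Berlin", unit="celsius")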