The C API in llama.h claims users can implement `llama_sampler_i` to create custom `llama_sampler`. The sampler chain takes ownership and calls `llama_sampler_free` on them. However, `llama_sampler_free` is hard-coded to use `delete`. This is undefined behavior if the object wasn't also allocated via `new` from libllama's C++ runtime. Callers in C and C-compatible languages do not use C++'s `new` operator. C++ callers may not be sharing the same heap as libllama.
		
			
				
	
	
		
			270 lines
		
	
	
	
		
			8.9 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			270 lines
		
	
	
	
		
			8.9 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| #include "sampling.h"
 | |
| #include "log.h"
 | |
| 
 | |
| #ifdef LLAMA_USE_LLGUIDANCE
 | |
| 
 | |
| #    include "llguidance.h"
 | |
| #    include <cmath>
 | |
| 
 | |
| struct llama_sampler_llg {
 | |
|     const llama_vocab * vocab;
 | |
|     std::string         grammar_kind;
 | |
|     std::string         grammar_data;
 | |
|     LlgTokenizer *      tokenizer;
 | |
|     LlgConstraint *     grammar;
 | |
|     LlgMaskResult       llg_res;
 | |
|     bool                has_llg_res;
 | |
| };
 | |
| 
 | |
| static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
 | |
|                                              const char * grammar_data) {
 | |
|     LlgConstraintInit cinit;
 | |
|     llg_constraint_init_set_defaults(&cinit, tokenizer);
 | |
|     const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
 | |
|     if (log_level && *log_level) {
 | |
|         cinit.log_stderr_level = atoi(log_level);
 | |
|     }
 | |
|     auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
 | |
|     if (llg_get_error(c)) {
 | |
|         LOG_ERR("llg error: %s\n", llg_get_error(c));
 | |
|         llg_free_constraint(c);
 | |
|         return nullptr;
 | |
|     }
 | |
|     return c;
 | |
| }
 | |
| 
 | |
| static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
 | |
|     return "llguidance";
 | |
| }
 | |
| 
 | |
| static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
 | |
|     auto * ctx = (llama_sampler_llg *) smpl->ctx;
 | |
|     if (ctx->grammar) {
 | |
|         LlgCommitResult res;
 | |
|         llg_commit_token(ctx->grammar, token, &res);
 | |
|         ctx->has_llg_res = false;
 | |
|     }
 | |
| }
 | |
| 
 | |
| static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
 | |
|     auto * ctx = (llama_sampler_llg *) smpl->ctx;
 | |
|     if (ctx->grammar) {
 | |
|         if (!ctx->has_llg_res) {
 | |
|             if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) {
 | |
|                 ctx->has_llg_res = true;
 | |
|             } else {
 | |
|                 LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar));
 | |
|                 llg_free_constraint(ctx->grammar);
 | |
|                 ctx->grammar = nullptr;
 | |
|             }
 | |
|         }
 | |
|         if (ctx->has_llg_res) {
 | |
|             if (ctx->llg_res.is_stop) {
 | |
|                 for (size_t i = 0; i < cur_p->size; ++i) {
 | |
|                     if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) {
 | |
|                         cur_p->data[i].logit = -INFINITY;
 | |
|                     }
 | |
|                 }
 | |
|             } else {
 | |
|                 const uint32_t * mask = ctx->llg_res.sample_mask;
 | |
|                 for (size_t i = 0; i < cur_p->size; ++i) {
 | |
|                     auto token = cur_p->data[i].id;
 | |
|                     if ((mask[token / 32] & (1 << (token % 32))) == 0) {
 | |
|                         cur_p->data[i].logit = -INFINITY;
 | |
|                     }
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| static void llama_sampler_llg_reset(llama_sampler * smpl) {
 | |
|     auto * ctx = (llama_sampler_llg *) smpl->ctx;
 | |
|     if (!ctx->grammar) {
 | |
|         return;
 | |
|     }
 | |
| 
 | |
|     auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
 | |
|     llg_free_constraint(ctx->grammar);
 | |
|     ctx->grammar     = grammar_new;
 | |
|     ctx->has_llg_res = false;
 | |
| }
 | |
| 
 | |
| static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
 | |
|     const auto * ctx = (const llama_sampler_llg *) smpl->ctx;
 | |
| 
 | |
|     auto * result = llama_sampler_init_llg(ctx->vocab, nullptr, nullptr);
 | |
| 
 | |
|     // copy the state
 | |
|     {
 | |
|         auto * result_ctx = (llama_sampler_llg *) result->ctx;
 | |
| 
 | |
|         if (ctx->grammar) {
 | |
|             result_ctx->grammar_kind = ctx->grammar_kind;
 | |
|             result_ctx->grammar_data = ctx->grammar_data;
 | |
|             result_ctx->grammar      = llg_clone_constraint(ctx->grammar);
 | |
|             result_ctx->tokenizer    = llg_clone_tokenizer(ctx->tokenizer);
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     return result;
 | |
| }
 | |
| 
 | |
| static void llama_sampler_llg_free(llama_sampler * smpl) {
 | |
|     const auto * ctx = (llama_sampler_llg *) smpl->ctx;
 | |
| 
 | |
|     if (ctx->grammar) {
 | |
|         llg_free_constraint(ctx->grammar);
 | |
|         llg_free_tokenizer(ctx->tokenizer);
 | |
|     }
 | |
| 
 | |
|     delete ctx;
 | |
| }
 | |
| 
 | |
| static llama_sampler_i llama_sampler_llg_i = {
 | |
|     /* .name   = */ llama_sampler_llg_name,
 | |
|     /* .accept = */ llama_sampler_llg_accept_impl,
 | |
|     /* .apply  = */ llama_sampler_llg_apply,
 | |
|     /* .reset  = */ llama_sampler_llg_reset,
 | |
|     /* .clone  = */ llama_sampler_llg_clone,
 | |
|     /* .free   = */ llama_sampler_llg_free,
 | |
| };
 | |
| 
 | |
| static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len,
 | |
|                                             uint32_t * output_tokens, size_t output_tokens_len) {
 | |
|     const llama_vocab * vocab = (const llama_vocab *) user_data;
 | |
|     int                 r     = 0;
 | |
|     try {
 | |
|         r = llama_tokenize(vocab, (const char *) bytes, bytes_len, (int32_t *) output_tokens, output_tokens_len, false,
 | |
|                            true);
 | |
|     } catch (const std::exception & e) {
 | |
|         GGML_ABORT("llama_tokenize failed: %s\n", e.what());
 | |
|     }
 | |
|     if (r < 0) {
 | |
|         return -r;
 | |
|     }
 | |
|     return r;
 | |
| }
 | |
| 
 | |
| static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab) {
 | |
|     // TODO store the tokenizer in the vocab somehow
 | |
|     static const llama_vocab * vocab_cache;
 | |
|     static LlgTokenizer *      tokenizer_cache;
 | |
| 
 | |
|     if (vocab_cache == vocab) {
 | |
|         return llg_clone_tokenizer(tokenizer_cache);
 | |
|     }
 | |
| 
 | |
|     auto tok_eos = llama_vocab_eot(vocab);
 | |
|     if (tok_eos == LLAMA_TOKEN_NULL) {
 | |
|         tok_eos = llama_vocab_eos(vocab);
 | |
|     }
 | |
| 
 | |
|     size_t vocab_size = llama_vocab_n_tokens(vocab);
 | |
| 
 | |
|     auto token_lens       = new uint32_t[vocab_size];
 | |
|     // we typically have ~7 bytes per token; let's go on the safe side here
 | |
|     auto token_bytes_size = vocab_size * 16 + 1024 * 1024;
 | |
|     auto token_bytes      = new uint8_t[token_bytes_size];
 | |
| 
 | |
|     size_t offset = 0;
 | |
|     for (size_t i = 0; i < vocab_size; i++) {
 | |
|         size_t max_token = 1024;
 | |
|         if (token_bytes_size - offset < max_token) {
 | |
|             GGML_ABORT("token_bytes buffer too small\n");
 | |
|         }
 | |
| 
 | |
|         llama_token token = i;
 | |
|         auto        dp    = (char *) token_bytes + offset;
 | |
|         auto        size  = llama_detokenize(vocab, &token, 1, dp, max_token, false, false);
 | |
|         if (size < 0) {
 | |
|             GGML_ABORT("llama_detokenize failed\n");
 | |
|         }
 | |
|         if (size == 0) {
 | |
|             size = llama_detokenize(vocab, &token, 1, dp + 1, max_token - 1, false, true);
 | |
|             if (size < 0) {
 | |
|                 GGML_ABORT("llama_detokenize failed\n");
 | |
|             }
 | |
|             if (size != 0) {
 | |
|                 *dp = '\xff';  // special token prefix marker
 | |
|                 size += 1;
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         token_lens[i] = size;
 | |
|         offset += size;
 | |
|     }
 | |
| 
 | |
|     LlgTokenizerInit tinit = {
 | |
|         /* .vocab_size                         = */ (uint32_t) vocab_size,
 | |
|         /* .tok_eos                            = */ (uint32_t) tok_eos,
 | |
|         /* .token_lens                         = */ token_lens,
 | |
|         /* .token_bytes                        = */ token_bytes,
 | |
|         /* .tokenizer_json                     = */ nullptr,
 | |
|         /* .tokenize_assumes_string            = */ true,
 | |
|         /* .tokenize_fn                        = */ llama_sampler_llg_tokenize_fn,
 | |
|         /* .use_approximate_greedy_tokenize_fn = */ false,
 | |
|         /* .tokenize_user_data                 = */ vocab,
 | |
|     };
 | |
| 
 | |
|     char           error_buffer[1024];
 | |
|     LlgTokenizer * tokenizer = llg_new_tokenizer(&tinit, error_buffer, sizeof(error_buffer));
 | |
| 
 | |
|     delete[] token_bytes;
 | |
|     delete[] token_lens;
 | |
| 
 | |
|     if (tokenizer == nullptr) {
 | |
|         LOG_ERR("llg tokenizer error: %s\n", error_buffer);
 | |
|         return tokenizer;
 | |
|     }
 | |
| 
 | |
|     if (tokenizer_cache) {
 | |
|         llg_free_tokenizer(tokenizer_cache);
 | |
|     }
 | |
|     vocab_cache     = vocab;
 | |
|     tokenizer_cache = tokenizer;
 | |
| 
 | |
|     return llg_clone_tokenizer(tokenizer_cache);
 | |
| }
 | |
| 
 | |
| llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * grammar_kind,
 | |
|                                        const char * grammar_data) {
 | |
|     auto * ctx = new llama_sampler_llg;
 | |
| 
 | |
|     if (grammar_kind != nullptr && grammar_kind[0] != '\0') {
 | |
|         auto tokenizer = llama_sampler_llg_new_tokenizer(vocab);
 | |
|         *ctx           = {
 | |
|             /* .vocab        = */ vocab,
 | |
|             /* .grammar_kind = */ grammar_kind,
 | |
|             /* .grammar_data = */ grammar_data,
 | |
|             /* .tokenizer    = */ tokenizer,
 | |
|             /* .grammar      = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
 | |
|             /* .llg_res      = */ {},
 | |
|             /* .has_llg_res  = */ false,
 | |
|         };
 | |
|     } else {
 | |
|         *ctx = {
 | |
|             /* .vocab        = */ vocab,
 | |
|             /* .grammar_kind = */ {},
 | |
|             /* .grammar_data = */ {},
 | |
|             /* .tokenizer    = */ nullptr,
 | |
|             /* .grammar      = */ nullptr,
 | |
|             /* .llg_res      = */ {},
 | |
|             /* .has_llg_res  = */ false,
 | |
|         };
 | |
|     }
 | |
| 
 | |
|     return llama_sampler_init(
 | |
|         /* .iface = */ &llama_sampler_llg_i,
 | |
|         /* .ctx   = */ ctx
 | |
|     );
 | |
| }
 | |
| 
 | |
| #else
 | |
| 
 | |
| llama_sampler * llama_sampler_init_llg(const llama_vocab *, const char *, const char *) {
 | |
|     LOG_WRN("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 | |
|     return nullptr;
 | |
| }
 | |
| 
 | |
| #endif  // LLAMA_USE_LLGUIDANCE
 |