Init LlamaObjC Commit

Jason Flax 2024-10-30 19:15:23 -04:00
parent 6026da52d6
commit 56f9d4b52a
29 changed files with 3472 additions and 32 deletions

Package.resolved (new file, 14 lines)

@@ -0,0 +1,14 @@
{
"pins" : [
{
"identity" : "swift-syntax",
"kind" : "remoteSourceControl",
"location" : "https://github.com/apple/swift-syntax.git",
"state" : {
"branch" : "main",
"revision" : "2c271e5ce55124ae534c2eff6e74f745e4db4f68"
}
}
],
"version" : 2
}

Package.swift

@@ -1,21 +1,28 @@
// swift-tools-version:5.5
// swift-tools-version:5.9
import CompilerPluginSupport
import PackageDescription
var sources = [
var cppSources = [
"src/llama.cpp",
"src/llama-vocab.cpp",
"src/llama-grammar.cpp",
"src/llama-sampling.cpp",
"src/unicode.cpp",
"src/unicode-data.cpp",
"ggml/src/ggml.c",
"ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.c",
"ggml/src/ggml-quants.c",
"ggml/src/ggml-aarch64.c",
"common/sampling.cpp",
"common/common.cpp",
"common/json-schema-to-grammar.cpp",
"common/log.cpp",
"common/console.cpp"
]
var ggmlSources = [
"src/ggml.c",
"src/ggml-alloc.c",
"src/ggml-backend.c",
"src/ggml-quants.c",
"src/ggml-aarch64.c"
]
var resources: [Resource] = []
var linkerSettings: [LinkerSetting] = []
var cSettings: [CSetting] = [
@@ -24,13 +31,13 @@ var cSettings: [CSetting] = [
// NOTE: NEW_LAPACK will require iOS version 16.4+
// We should consider adding this in the future when we drop support for iOS 14
// (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64")
.define("ACCELERATE_NEW_LAPACK"),
.define("ACCELERATE_LAPACK_ILP64")
]
#if canImport(Darwin)
sources.append("ggml/src/ggml-metal.m")
resources.append(.process("ggml/src/ggml-metal.metal"))
ggmlSources.append("src/ggml-metal.m")
resources.append(.process("src/ggml-metal.metal"))
linkerSettings.append(.linkedFramework("Accelerate"))
cSettings.append(
contentsOf: [
@@ -47,17 +54,20 @@ cSettings.append(
let package = Package(
name: "llama",
platforms: [
.macOS(.v12),
.macOS(.v13),
.iOS(.v14),
.watchOS(.v4),
.tvOS(.v14)
],
products: [
.library(name: "llama", targets: ["llama"]),
.executable(name: "LlamaKitMain", targets: ["LlamaKitMain"])
],
dependencies: [
.package(url: "https://github.com/apple/swift-syntax.git", branch: "main")
],
targets: [
.target(
name: "llama",
.target(name: "llama_cpp",
path: ".",
exclude: [
"cmake",
@@ -68,12 +78,60 @@ let package = Package(
"CMakeLists.txt",
"Makefile"
],
sources: sources,
sources: cppSources,
publicHeadersPath: "spm-headers"),
.target(
name: "llama",
dependencies: ["llama_cpp"],
path: "ggml",
sources: ggmlSources,
resources: resources,
publicHeadersPath: "spm-headers",
cSettings: cSettings,
linkerSettings: linkerSettings
)
linkerSettings: linkerSettings),
.target(name: "LlamaObjC",
dependencies: ["llama"],
path: "objc",
sources: [
"GPTParams.mm",
"GPTSampler.mm",
"LlamaBatch.mm",
"LlamaObjC.mm",
"LlamaModel.mm",
"LlamaContext.mm",
"LlamaSession.mm",
],
publicHeadersPath: "include",
cSettings: cSettings,
linkerSettings: linkerSettings),
.macro(
name: "JSONSchemaMacros",
dependencies: [
.product(name: "SwiftSyntax", package: "swift-syntax"),
.product(name: "SwiftSyntaxMacros", package: "swift-syntax"),
.product(name: "SwiftCompilerPlugin", package: "swift-syntax"),
],
path: "swift/JSONSchemaMacros"
),
.target(
name: "JSONSchema",
dependencies: ["JSONSchemaMacros"],
path: "swift/JSONSchema"
),
.target(
name: "LlamaKit",
dependencies: ["JSONSchema", "LlamaObjC"],
path: "swift/LlamaKit"
),
.testTarget(name: "LlamaKitTests",
dependencies: ["LlamaKit", "JSONSchema", "JSONSchemaMacros"],
path: "swift/test",
linkerSettings: [
.linkedFramework("XCTest"),
.linkedFramework("Testing")]),
.executableTarget(name: "LlamaKitMain",
dependencies: ["LlamaKit"],
path: "swift/main",
resources: [.process("Llama-3.2-3B-Instruct-Q4_0.gguf")]),
],
cxxLanguageStandard: .cxx17
)

common/common.h

@@ -34,10 +34,10 @@ struct llama_lora_adapter_container : llama_lora_adapter_info {
};
// build info
extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT;
extern char const * LLAMA_COMPILER;
extern char const * LLAMA_BUILD_TARGET;
static int LLAMA_BUILD_NUMBER = 0;
static char const * LLAMA_COMMIT = "";
static char const * LLAMA_COMPILER = "";
static char const * LLAMA_BUILD_TARGET = "";
struct llama_control_vector_load_info;

objc/GPTParams.mm (new file, 726 lines)

@@ -0,0 +1,726 @@
#import <Foundation/Foundation.h>
#import "GPTParams_Private.hpp"
#import "../common/common.h"
#import "ggml.h"
@implementation GGMLThreadpool {
ggml_threadpool *threadpool;
}
- (instancetype)initWithThreadpool:(ggml_threadpool *)threadpool
{
self = [super init];
if (self) {
self->threadpool = threadpool;
}
return self;
}
- (ggml_threadpool *)threadpool {
return threadpool;
}
@end
@implementation GGMLThreadpoolParams {
ggml_threadpool_params params;
}
- (BOOL)getCpuMaskAtIndex:(NSUInteger)index {
abort();
}
- (void)setCpuMask:(BOOL)value atIndex:(NSUInteger)index {
abort();
}
- (instancetype)initWithParams:(ggml_threadpool_params&&)params
{
self = [super init];
if (self) {
self->params = params;
}
return self;
}
- (BOOL)isEqual:(id)other {
GGMLThreadpoolParams *rhs = (GGMLThreadpoolParams *)other;
ggml_threadpool_params rhs_params = rhs->params;
return ggml_threadpool_params_match(&params, &rhs_params);
}
- (GGMLThreadpool *)threadpool {
auto tp = ggml_threadpool_new(&params);
return [[GGMLThreadpool alloc] initWithThreadpool:tp];
}
@end
@implementation CPUParams {
cpu_params *params;
}
- (instancetype)initWithParams:(cpu_params&)params
{
self = [super init];
if (self) {
self->params = &params;
}
return self;
}
- (int)nThreads {
return params->n_threads;
}
- (void)setNThreads:(int)nThreads {
params->n_threads = nThreads;
}
- (BOOL)maskValid {
return params->mask_valid;
}
- (void)setMaskValid:(BOOL)maskValid {
params->mask_valid = maskValid;
}
- (GGMLSchedPriority)priority {
return GGMLSchedPriority(params->priority);
}
- (void)setPriority:(GGMLSchedPriority)priority {
params->priority = ggml_sched_priority(priority);
}
- (BOOL)strictCPU {
return params->strict_cpu;
}
- (void)setStrictCPU:(BOOL)strictCPU {
params->strict_cpu = strictCPU;
}
- (uint32_t)poll {
return params->poll;
}
- (void)setPoll:(uint32_t)poll {
params->poll = poll;
}
- (BOOL)getCpuMaskAtIndex:(NSUInteger)index {
return params->cpumask[index];
}
- (void)setCpuMask:(BOOL)value atIndex:(NSUInteger)index {
params->cpumask[index] = value;
}
- (GGMLThreadpoolParams *)ggmlThreadpoolParams {
return [[GGMLThreadpoolParams alloc] initWithParams:ggml_threadpool_params_from_cpu_params(*params)];
}
@end
@implementation GPTSamplerParams {
gpt_sampler_params *gpt_sampler_params;
}
- (instancetype)initWithParams:(gpt_sampler_params&)params {
self = [super init];
if (self) {
gpt_sampler_params = &params;
}
return self;
}
// Getters and setters for Objective-C properties, which manipulate the C++ struct
- (uint32_t)seed {
return gpt_sampler_params->seed;
}
- (void)setSeed:(uint32_t)seed {
gpt_sampler_params->seed = seed;
}
- (int32_t)nPrev {
return gpt_sampler_params->n_prev;
}
- (void)setNPrev:(int32_t)nPrev {
gpt_sampler_params->n_prev = nPrev;
}
- (int32_t)nProbs {
return gpt_sampler_params->n_probs;
}
- (void)setNProbs:(int32_t)nProbs {
gpt_sampler_params->n_probs = nProbs;
}
- (int32_t)minKeep {
return gpt_sampler_params->min_keep;
}
- (void)setMinKeep:(int32_t)minKeep {
gpt_sampler_params->min_keep = minKeep;
}
- (int32_t)topK {
return gpt_sampler_params->top_k;
}
- (void)setTopK:(int32_t)topK {
gpt_sampler_params->top_k = topK;
}
- (float)topP {
return gpt_sampler_params->top_p;
}
- (void)setTopP:(float)topP {
gpt_sampler_params->top_p = topP;
}
- (float)minP {
return gpt_sampler_params->min_p;
}
- (void)setMinP:(float)minP {
gpt_sampler_params->min_p = minP;
}
- (float)tfsZ {
return gpt_sampler_params->tfs_z;
}
- (void)setTfsZ:(float)tfsZ {
gpt_sampler_params->tfs_z = tfsZ;
}
- (float)typP {
return gpt_sampler_params->typ_p;
}
- (void)setTypP:(float)typP {
gpt_sampler_params->typ_p = typP;
}
- (float)temp {
return gpt_sampler_params->temp;
}
- (void)setTemp:(float)temp {
gpt_sampler_params->temp = temp;
}
- (float)dynatempRange {
return gpt_sampler_params->dynatemp_range;
}
- (void)setDynatempRange:(float)dynatempRange {
gpt_sampler_params->dynatemp_range = dynatempRange;
}
- (float)dynatempExponent {
return gpt_sampler_params->dynatemp_exponent;
}
- (void)setDynatempExponent:(float)dynatempExponent {
gpt_sampler_params->dynatemp_exponent = dynatempExponent;
}
- (int32_t)penaltyLastN {
return gpt_sampler_params->penalty_last_n;
}
- (void)setPenaltyLastN:(int32_t)penaltyLastN {
gpt_sampler_params->penalty_last_n = penaltyLastN;
}
- (float)penaltyRepeat {
return gpt_sampler_params->penalty_repeat;
}
- (void)setPenaltyRepeat:(float)penaltyRepeat {
gpt_sampler_params->penalty_repeat = penaltyRepeat;
}
- (float)penaltyFreq {
return gpt_sampler_params->penalty_freq;
}
- (void)setPenaltyFreq:(float)penaltyFreq {
gpt_sampler_params->penalty_freq = penaltyFreq;
}
- (float)penaltyPresent {
return gpt_sampler_params->penalty_present;
}
- (void)setPenaltyPresent:(float)penaltyPresent {
gpt_sampler_params->penalty_present = penaltyPresent;
}
- (int32_t)mirostat {
return gpt_sampler_params->mirostat;
}
- (void)setMirostat:(int32_t)mirostat {
gpt_sampler_params->mirostat = mirostat;
}
- (float)mirostatTau {
return gpt_sampler_params->mirostat_tau;
}
- (void)setMirostatTau:(float)mirostatTau {
gpt_sampler_params->mirostat_tau = mirostatTau;
}
- (float)mirostatEta {
return gpt_sampler_params->mirostat_eta;
}
- (void)setMirostatEta:(float)mirostatEta {
gpt_sampler_params->mirostat_eta = mirostatEta;
}
- (BOOL)penalizeNl {
return gpt_sampler_params->penalize_nl;
}
- (void)setPenalizeNl:(BOOL)penalizeNl {
gpt_sampler_params->penalize_nl = penalizeNl;
}
- (BOOL)ignoreEos {
return gpt_sampler_params->ignore_eos;
}
- (void)setIgnoreEos:(BOOL)ignoreEos {
gpt_sampler_params->ignore_eos = ignoreEos;
}
- (BOOL)noPerf {
return gpt_sampler_params->no_perf;
}
- (void)setNoPerf:(BOOL)noPerf {
gpt_sampler_params->no_perf = noPerf;
}
// For `samplers`, convert from NSArray<NSNumber *> to std::vector
- (NSArray<NSNumber *> *)samplers {
NSMutableArray<NSNumber *> *samplersArray = [NSMutableArray array];
for (auto sampler : gpt_sampler_params->samplers) {
[samplersArray addObject:@(sampler)];
}
return [samplersArray copy];
}
- (void)setSamplers:(NSArray<NSNumber *> *)samplers {
gpt_sampler_params->samplers.clear();
for (NSNumber *sampler in samplers) {
gpt_sampler_params->samplers.push_back(static_cast<gpt_sampler_type>(sampler.intValue));
}
}
//// For `logitBias`, convert from NSArray<NSNumber *> to std::vector
//- (NSArray<NSNumber *> *)logitBias {
// NSMutableArray<llama_logit_bias *> *logitBiasArray = [NSMutableArray array];
// for (auto bias : gpt_sampler_params.logit_bias) {
// [logitBiasArray addObject:bias];
// }
// return [logitBiasArray copy];
//}
//
//- (void)setLogitBias:(NSArray<NSNumber *> *)logitBias {
// gpt_sampler_params.logit_bias.clear();
// for (NSNumber *bias in logitBias) {
// gpt_sampler_params.logit_bias.push_back(bias.floatValue);
// }
//}
// For `grammar`, convert between NSString and std::string
- (NSString *)grammar {
return [NSString stringWithUTF8String:gpt_sampler_params->grammar.c_str()];
}
- (void)setGrammar:(NSString *)grammar {
gpt_sampler_params->grammar = std::string([grammar UTF8String]);
}
// Method to print out the parameters as a string
- (NSString *)print {
NSMutableString *output = [NSMutableString stringWithString:@"GPT Sampler Params:\n"];
[output appendFormat:@"Seed: %u\n", self.seed];
[output appendFormat:@"nPrev: %d\n", self.nPrev];
[output appendFormat:@"nProbs: %d\n", self.nProbs];
[output appendFormat:@"minKeep: %d\n", self.minKeep];
[output appendFormat:@"topK: %d\n", self.topK];
[output appendFormat:@"topP: %.2f\n", self.topP];
[output appendFormat:@"minP: %.2f\n", self.minP];
[output appendFormat:@"tfsZ: %.2f\n", self.tfsZ];
[output appendFormat:@"typP: %.2f\n", self.typP];
[output appendFormat:@"temp: %.2f\n", self.temp];
[output appendFormat:@"dynatempRange: %.2f\n", self.dynatempRange];
[output appendFormat:@"dynatempExponent: %.2f\n", self.dynatempExponent];
[output appendFormat:@"penaltyLastN: %d\n", self.penaltyLastN];
[output appendFormat:@"penaltyRepeat: %.2f\n", self.penaltyRepeat];
[output appendFormat:@"penaltyFreq: %.2f\n", self.penaltyFreq];
[output appendFormat:@"penaltyPresent: %.2f\n", self.penaltyPresent];
[output appendFormat:@"mirostat: %d\n", self.mirostat];
[output appendFormat:@"mirostatTau: %.2f\n", self.mirostatTau];
[output appendFormat:@"mirostatEta: %.2f\n", self.mirostatEta];
[output appendFormat:@"penalizeNl: %@\n", self.penalizeNl ? @"YES" : @"NO"];
[output appendFormat:@"ignoreEos: %@\n", self.ignoreEos ? @"YES" : @"NO"];
[output appendFormat:@"noPerf: %@\n", self.noPerf ? @"YES" : @"NO"];
[output appendFormat:@"Grammar: %@\n", self.grammar];
// Print samplers
[output appendString:@"Samplers: "];
for (NSNumber *sampler in self.samplers) {
[output appendFormat:@"%d, ", sampler.intValue];
}
[output appendString:@"\n"];
// Print logit biases
[output appendString:@"Logit Biases: "];
for (NSNumber *bias in self.logitBias) {
[output appendFormat:@"%.2f, ", bias.floatValue];
}
[output appendString:@"\n"];
return [output copy];
}
- (gpt_sampler_params&)cParams {
return *gpt_sampler_params;
}
@end
@implementation GPTParams {
gpt_params gpt_params;
}
- (NSArray<NSString *> *)antiPrompts {
auto antiprompts = [[NSMutableArray alloc] init];
for (auto& antiprompt : gpt_params.antiprompt) {
[antiprompts addObject:[NSString stringWithCString:antiprompt.c_str() encoding:NSUTF8StringEncoding]];
}
return antiprompts;
}
- (gpt_params&)params {
return gpt_params;
}
- (int32_t)nPredict {
return gpt_params.n_predict;
}
- (void)setNPredict:(int32_t)nPredict {
gpt_params.n_predict = nPredict;
}
- (NSInteger)nCtx {
return gpt_params.n_ctx;
}
- (void)setNCtx:(NSInteger)nCtx {
gpt_params.n_ctx = nCtx;
}
- (int32_t)nBatch {
return gpt_params.n_batch;
}
- (void)setNBatch:(int32_t)nBatch {
gpt_params.n_batch = nBatch;
}
- (int32_t)nUBatch {
return gpt_params.n_ubatch;
}
- (void)setNUBatch:(int32_t)nUBatch {
gpt_params.n_ubatch = nUBatch;
}
- (int32_t)nKeep {
return gpt_params.n_keep;
}
- (void)setNKeep:(int32_t)nKeep {
gpt_params.n_keep = nKeep;
}
- (int32_t)nDraft {
return gpt_params.n_draft;
}
- (void)setNDraft:(int32_t)nDraft {
gpt_params.n_draft = nDraft;
}
- (int32_t)nChunks {
return gpt_params.n_chunks;
}
- (void)setNChunks:(int32_t)nChunks {
gpt_params.n_chunks = nChunks;
}
- (int32_t)nParallel {
return gpt_params.n_parallel;
}
- (void)setNParallel:(int32_t)nParallel {
gpt_params.n_parallel = nParallel;
}
- (int32_t)nSequences {
return gpt_params.n_sequences;
}
- (void)setNSequences:(int32_t)nSequences {
gpt_params.n_sequences = nSequences;
}
- (float)pSplit {
return gpt_params.p_split;
}
- (void)setPSplit:(float)pSplit {
gpt_params.p_split = pSplit;
}
- (int32_t)nGpuLayers {
return gpt_params.n_gpu_layers;
}
- (void)setNGpuLayers:(int32_t)nGpuLayers {
gpt_params.n_gpu_layers = nGpuLayers;
}
- (int32_t)nGpuLayersDraft {
return gpt_params.n_gpu_layers_draft;
}
- (void)setNGpuLayersDraft:(int32_t)nGpuLayersDraft {
gpt_params.n_gpu_layers_draft = nGpuLayersDraft;
}
- (int32_t)mainGpu {
return gpt_params.main_gpu;
}
- (void)setMainGpu:(int32_t)mainGpu {
gpt_params.main_gpu = mainGpu;
}
- (int32_t)grpAttnN {
return gpt_params.grp_attn_n;
}
- (void)setGrpAttnN:(int32_t)grpAttnN {
gpt_params.grp_attn_n = grpAttnN;
}
- (int32_t)grpAttnW {
return gpt_params.grp_attn_w;
}
- (void)setGrpAttnW:(int32_t)grpAttnW {
gpt_params.grp_attn_w = grpAttnW;
}
- (int32_t)nPrint {
return gpt_params.n_print;
}
- (void)setNPrint:(int32_t)nPrint {
gpt_params.n_print = nPrint;
}
- (float)ropeFreqBase {
return gpt_params.rope_freq_base;
}
- (void)setRopeFreqBase:(float)ropeFreqBase {
gpt_params.rope_freq_base = ropeFreqBase;
}
- (float)ropeFreqScale {
return gpt_params.rope_freq_scale;
}
- (void)setRopeFreqScale:(float)ropeFreqScale {
gpt_params.rope_freq_scale = ropeFreqScale;
}
- (float)yarnExtFactor {
return gpt_params.yarn_ext_factor;
}
- (void)setYarnExtFactor:(float)yarnExtFactor {
gpt_params.yarn_ext_factor = yarnExtFactor;
}
- (float)yarnAttnFactor {
return gpt_params.yarn_attn_factor;
}
- (void)setYarnAttnFactor:(float)yarnAttnFactor {
gpt_params.yarn_attn_factor = yarnAttnFactor;
}
- (float)yarnBetaFast {
return gpt_params.yarn_beta_fast;
}
- (void)setYarnBetaFast:(float)yarnBetaFast {
gpt_params.yarn_beta_fast = yarnBetaFast;
}
- (float)yarnBetaSlow {
return gpt_params.yarn_beta_slow;
}
- (void)setYarnBetaSlow:(float)yarnBetaSlow {
gpt_params.yarn_beta_slow = yarnBetaSlow;
}
- (int32_t)yarnOrigCtx {
return gpt_params.yarn_orig_ctx;
}
- (void)setYarnOrigCtx:(int32_t)yarnOrigCtx {
gpt_params.yarn_orig_ctx = yarnOrigCtx;
}
- (float)defragThold {
return gpt_params.defrag_thold;
}
- (void)setDefragThold:(float)defragThold {
gpt_params.defrag_thold = defragThold;
}
// Assuming tensorSplit remains a fixed array in C struct, we can create a method to access specific values.
- (float)tensorSplitAtIndex:(NSUInteger)index {
if (index < 128) {
return gpt_params.tensor_split[index];
}
return 0.0f; // Return default value if index is out of bounds
}
- (void)setTensorSplitValue:(float)value atIndex:(NSUInteger)index {
if (index < 128) {
gpt_params.tensor_split[index] = value;
}
}
- (BOOL)embedding {
return gpt_params.embedding;
}
- (void)setEmbedding:(BOOL)embedding {
gpt_params.embedding = embedding;
}
- (LlamaModelParams *)LlamaModelParams {
return nil;
}
- (BOOL)ctxShift {
return gpt_params.ctx_shift;
}
- (void)setCtxShift:(BOOL)ctxShift {
gpt_params.ctx_shift = ctxShift;
}
- (CPUParams *)cpuParams {
return [[CPUParams alloc] initWithParams:gpt_params.cpuparams];
}
- (CPUParams *)cpuParamsBatch {
return [[CPUParams alloc] initWithParams:gpt_params.cpuparams_batch];
}
- (GPTSamplerParams *)samplerParams {
return [[GPTSamplerParams alloc] initWithParams:gpt_params.sparams];
}
- (NSString *)modelURL {
return [NSString stringWithCString:gpt_params.model_url.c_str() encoding:NSUTF8StringEncoding];
}
- (void)setModelURL:(NSString *)modelURL {
gpt_params.model_url = [modelURL cStringUsingEncoding:NSUTF8StringEncoding];
}
- (NSString *)modelPath {
return [NSString stringWithCString:gpt_params.model.c_str() encoding:NSUTF8StringEncoding];
}
- (void)setModelPath:(NSString *)modelPath {
gpt_params.model = [modelPath cStringUsingEncoding:NSUTF8StringEncoding];
}
- (NSString *)pathPromptCache {
return [[NSString alloc] initWithCString:gpt_params.path_prompt_cache.c_str() encoding:NSUTF8StringEncoding];
}
- (void)setPathPromptCache:(NSString *)pathPromptCache {
gpt_params.path_prompt_cache = [pathPromptCache cStringUsingEncoding:NSUTF8StringEncoding];
}
- (BOOL)enableChatTemplate {
return gpt_params.enable_chat_template;
}
- (void)setEnableChatTemplate:(BOOL)enableChatTemplate {
gpt_params.enable_chat_template = enableChatTemplate;
}
- (NSString *)chatTemplate {
return [NSString stringWithCString:gpt_params.chat_template.c_str()
encoding:NSUTF8StringEncoding];
}
- (void)setChatTemplate:(NSString *)chatTemplate {
gpt_params.chat_template = [chatTemplate cStringUsingEncoding:NSUTF8StringEncoding];
}
- (NSString *)inputPrefix {
return [NSString stringWithCString:gpt_params.input_prefix.c_str()
encoding:NSUTF8StringEncoding];
}
- (void)setInputPrefix:(NSString *)inputPrefix {
gpt_params.input_prefix = [inputPrefix cStringUsingEncoding:NSUTF8StringEncoding];
}
- (NSString *)inputSuffix {
return [NSString stringWithCString:gpt_params.input_suffix.c_str()
encoding:NSUTF8StringEncoding];
}
- (void)setInputSuffix:(NSString *)inputSuffix {
gpt_params.input_suffix = [inputSuffix cStringUsingEncoding:NSUTF8StringEncoding];
}
- (LlamaContextParams *)llamaContextParams {
return nil; // not yet implemented
}
- (LlamaModelParams *)llamaModelParams {
return nil; // not yet implemented
}
@end
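
A rough sketch of how this wrapper is meant to be driven from a call site (hypothetical usage, not part of this commit; the model path is a placeholder):

#import "GPTParams.h"

GPTParams *params = [[GPTParams alloc] init];
params.modelPath = @"/path/to/model.gguf";  // placeholder path
params.nCtx = 4096;                         // context window
params.nBatch = 512;                        // logical batch size
params.nPredict = 256;                      // max tokens to generate
NSLog(@"%@", [params.samplerParams print]); // dump the sampler defaults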

objc/GPTSampler.mm (new file, 49 lines)

@@ -0,0 +1,49 @@
#import <Foundation/Foundation.h>
#import <GPTSampler.h>
#import <GPTParams_Private.hpp>
#import <LlamaModel_Private.hpp>
#import <LlamaContext_Private.hpp>
#import "../../common/sampling.h"
@implementation GPTSampler {
gpt_sampler *sampler;
}
- (instancetype)init:(LlamaModel *)model gptSamplerParams:(GPTSamplerParams *)gptSamplerParams
{
self = [super init];
if (self) {
self->sampler = gpt_sampler_init([model cModel], [gptSamplerParams cParams]);
}
return self;
}
- (uint32_t)seed {
return gpt_sampler_get_seed(sampler);
}
- (LlamaToken)sample:(LlamaContext *)context index:(NSInteger)index {
return [self sample:context index:index grammarFirst:false];
}
- (LlamaToken)sample:(LlamaContext *)context index:(NSInteger)index grammarFirst:(BOOL)grammarFirst {
return gpt_sampler_sample(sampler, [context cContext], index, grammarFirst);
}
- (void)accept:(LlamaToken)token acceptGrammar:(BOOL)acceptGrammar {
gpt_sampler_accept(sampler, token, acceptGrammar);
}
- (NSString *)previousString:(LlamaContext *)context n:(NSInteger)n {
return [[NSString alloc] initWithCString:gpt_sampler_prev_str(sampler, [context cContext], n).data() encoding:NSUTF8StringEncoding];
}
- (LlamaToken)last {
return gpt_sampler_last(sampler);
}
- (void)reset {
gpt_sampler_reset(sampler);
}
@end
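
A minimal sampling loop on top of this wrapper could look like the following sketch. It assumes `model`, `ctx`, and `params` are a LlamaModel, LlamaContext, and GPTParams obtained elsewhere (for example from an initialized LlamaSession), and that each sampled token is fed back through the context before the next iteration, as LlamaSession does below:

GPTSampler *smpl = [[GPTSampler alloc] init:model gptSamplerParams:params.samplerParams];
while (true) {
    LlamaToken tok = [smpl sample:ctx index:-1]; // sample from the last logits
    [smpl accept:tok acceptGrammar:YES];         // update sampler state
    if ([model tokenIsEOG:tok]) { break; }       // stop at end of generation
    NSLog(@"%@", [ctx tokenToPiece:tok]);        // print the decoded piece
    // ... decode `tok` with the context here before sampling again ...
}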

objc/LlamaBatch.mm (new file, 21 lines)

@@ -0,0 +1,21 @@
#import <Foundation/Foundation.h>
#import "LlamaBatch_Private.hpp"
#import "llama.h"
@implementation LlamaBatch {
llama_batch batch;
}
- (instancetype)initWithBatch:(llama_batch)batch {
self = [super init];
if (self) {
self->batch = batch;
}
return self;
}
- (NSData *)output {
return [[NSData alloc] initWithBytes:batch.logits length:batch.n_tokens];
}
- (llama_batch)cBatch {
return batch;
}
@end

objc/LlamaContext.mm (new file, 94 lines)

@@ -0,0 +1,94 @@
#import <Foundation/Foundation.h>
#import "LlamaContext_Private.hpp"
#import "GPTParams_Private.hpp"
#import "LlamaModel_Private.hpp"
#import "LlamaBatch_Private.hpp"
#import "../../common/common.h"
@implementation LlamaContext {
llama_context *ctx;
}
- (instancetype)initWithContext:(llama_context *)context {
self = [super init];
if (self) {
ctx = context;
}
return self;
}
- (void)attachThreadpool:(GGMLThreadpool *)threadpool
threadpoolBatch:(GGMLThreadpool *)threadpoolBatch {
llama_attach_threadpool(ctx, [threadpool threadpool], [threadpoolBatch threadpool]);
}
- (NSUInteger)nCtx {
return llama_n_ctx(ctx);
}
- (BOOL)loadStateFile:(NSString *)pathSession
tokensOut:(llama_token *)tokensOut
nTokenCpacity:(size_t)nTokenCapacity
nTokenCountOut:(size_t *)nTokenCountOut {
return llama_state_load_file(ctx, [pathSession cStringUsingEncoding:NSUTF8StringEncoding], tokensOut, nTokenCapacity, nTokenCountOut);
}
- (LlamaModel *)model {
auto model = llama_get_model(ctx);
return [[LlamaModel alloc] init:std::remove_const_t<llama_model *>(model)];
}
- (std::vector<llama_token>)tokenize:(NSString *)text
addSpecial:(BOOL)addSpecial
parseSpecial:(BOOL)parseSpecial {
return llama_tokenize(ctx, [text cStringUsingEncoding:NSUTF8StringEncoding], addSpecial, parseSpecial);
}
- (std::string)convertTokensToString:(const std::vector<llama_token>&)tokens {
return string_from(ctx, tokens);
}
- (llama_context *)cContext {
return ctx;
}
- (int32_t)encode:(llama_batch)batch {
return llama_encode(ctx, batch);
}
- (void)kvCacheSeqAdd:(LlamaSequenceId)sequenceId
p0:(LlamaPosition)p0
p1:(LlamaPosition)p1
delta:(LlamaPosition)delta {
llama_kv_cache_seq_add(ctx, sequenceId, p0, p1, delta);
}
- (void)kvCacheSeqDiv:(LlamaSequenceId)sequenceId
p0:(LlamaPosition)p0
p1:(LlamaPosition)p1
delta:(LlamaPosition)delta {
llama_kv_cache_seq_div(ctx, sequenceId, p0, p1, delta);
}
- (NSString *)tokenToPiece:(LlamaToken)token {
return [self tokenToPiece:token special:YES];
}
- (NSString *)tokenToPiece:(LlamaToken)token special:(BOOL)special {
return [[NSString alloc] initWithCString:llama_token_to_piece(ctx, token, special).c_str() encoding:NSUTF8StringEncoding];
}
- (NSInteger)decode:(LlamaBatch *)batch {
return llama_decode(ctx, [batch cBatch]);
}
- (BOOL)saveStateFile:(NSString *)pathSession
tokens:(const LlamaToken *)tokens
nTokenCount:(size_t)nTokenCount {
return llama_state_save_file(ctx,
[pathSession cStringUsingEncoding:NSUTF8StringEncoding],
tokens, nTokenCount);
}
@end
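
Since -tokenize:addSpecial:parseSpecial: and -convertTokensToString: traffic in C++ standard-library types, they are only callable from Objective-C++. A small round-trip check (hypothetical, assuming `ctx` is an already-initialized LlamaContext):

std::vector<llama_token> toks = [ctx tokenize:@"Hello, world"
                                   addSpecial:YES
                                 parseSpecial:YES];
for (llama_token tok : toks) {
    NSLog(@"%d -> %@", tok, [ctx tokenToPiece:tok]); // token id and its text piece
}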

objc/LlamaModel.mm (new file, 70 lines)

@@ -0,0 +1,70 @@
#import <Foundation/Foundation.h>
#import "LlamaModel_Private.hpp"
#import "LlamaContext_Private.hpp"
#import "LlamaBatch_Private.hpp"
#import "GPTParams_Private.hpp"
#import "GPTSampler.h"
#import "ggml.h"
#import "../common/common.h"
@implementation LlamaChatMessage
@end
@implementation LlamaModel {
llama_model *model;
}
- (instancetype)init:(llama_model *)l_model {
self = [super init];
if (self) {
model = l_model;
}
return self;
}
- (LlamaContext *)context:(LlamaContextParams *)params {
return nil;
}
- (BOOL)addBOSToken {
return llama_add_bos_token(model);
}
- (BOOL)addEOSToken {
return llama_add_eos_token(model);
}
- (LlamaToken)tokenBOS {
return llama_token_bos(model);
}
- (int32_t)nCtxTrain {
return llama_n_ctx_train(model);
}
- (NSString *)formatExample:(NSString *)tmpl {
return [[NSString alloc] initWithCString:llama_chat_format_example(model, [tmpl cStringUsingEncoding:NSUTF8StringEncoding]).c_str()
encoding:NSUTF8StringEncoding];
}
- (BOOL)hasEncoder {
return llama_model_has_encoder(model);
}
- (llama_model *)cModel {
return model;
}
- (BOOL)tokenIsEOG:(LlamaToken)token {
return llama_token_is_eog(model, token);
}
- (LlamaToken)tokenEOT {
return llama_token_eot(model);
}
- (LlamaToken)tokenEOS {
return llama_token_eos(model);
}
@end

objc/LlamaObjC.mm (new file, 2 lines)

@@ -0,0 +1,2 @@
#import "LlamaObjC.h"

objc/LlamaSession.mm (new file, 906 lines)

@@ -0,0 +1,906 @@
#import <Foundation/Foundation.h>
#import "LlamaSession_Private.hpp"
#import "../../common/common.h"
#import "LlamaModel_Private.hpp"
#import "LlamaContext_Private.hpp"
#import "GPTSampler.h"
#import <OSLog/OSLog.h>
#import "ggml.h"
#import "GPTParams_Private.hpp"
#import "LlamaBatch_Private.hpp"
@implementation BlockingLineQueue {
// Input queue and related synchronization
NSMutableArray<NSString *> *inputQueue;
NSCondition *inputCondition;
// Output queue and related synchronization
NSMutableArray<NSString *> *outputQueue;
NSCondition *outputCondition;
// Log queue
NSMutableArray<NSString *> *log;
}
- (instancetype)init {
if (self = [super init]) {
inputQueue = [NSMutableArray new];
outputQueue = [NSMutableArray new];
log = [NSMutableArray new];
inputCondition = [[NSCondition alloc] init];
outputCondition = [[NSCondition alloc] init];
}
return self;
}
- (void)addInputLine:(NSString *)line {
[inputCondition lock];
[inputQueue addObject:line];
[log addObject:line];
[inputCondition signal]; // Notify that a new input line is available
[inputCondition unlock];
}
- (NSString *)inputLine {
[inputCondition lock];
while ([inputQueue count] == 0) {
[inputCondition wait];
}
NSString *line = [inputQueue objectAtIndex:0];
[inputQueue removeObjectAtIndex:0];
[inputCondition unlock];
return line;
}
- (void)addOutputLine:(NSString *)line {
[outputCondition lock];
[outputQueue addObject:line];
[log addObject:line];
[outputCondition signal]; // Notify that a new output line is available
[outputCondition unlock];
}
- (NSString *)outputLine {
[outputCondition lock];
while ([outputQueue count] == 0) {
[outputCondition wait];
}
NSString *line = [outputQueue objectAtIndex:0];
[outputQueue removeObjectAtIndex:0];
[outputCondition unlock];
return line;
}
@end
@implementation LlamaSession {
std::vector<llama_token> embd_inp;
std::vector<llama_chat_msg> chat_msgs;
GPTParams *params;
GPTSampler *smpl;
BOOL isInteracting;
bool is_antiprompt;
bool input_echo;
bool display;
bool need_to_save_session;
int n_past;
int n_remain;
int n_consumed;
int n_session_consumed;
std::vector<int> input_tokens;
std::vector<int> output_tokens;
std::ostringstream output_ss;
std::stringstream last_output_ss;
std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode
std::vector<llama_token> embd;
NSMutableString *pathSession;
NSInteger ga_i;
NSInteger ga_n;
NSInteger ga_w;
std::vector<llama_token> session_tokens;
// tokenized antiprompts
std::vector<std::vector<llama_token>> antiprompt_ids;
BOOL need_insert_eot;
int n_ctx;
}
- (NSString *)chat_add_and_format:(std::vector<llama_chat_msg> &) chat_msgs role:(const std::string &) role content:(const std::string &) content {
llama_chat_msg new_msg{role, content};
auto formatted = llama_chat_format_single([self.model cModel], [params params].chat_template, chat_msgs, new_msg, role == "user");
chat_msgs.push_back({role, content});
os_log_debug(OS_LOG_DEFAULT, "formatted: '%s'\n", formatted.c_str());
return [NSString stringWithCString:formatted.c_str() encoding:NSUTF8StringEncoding];
}
static BOOL file_is_empty(NSString *path) {
NSFileManager *manager = [NSFileManager defaultManager];
if ([manager fileExistsAtPath:path]) {
NSDictionary *attributes = [manager attributesOfItemAtPath:path error:nil];
unsigned long long size = [attributes fileSize];
if (attributes && size == 0) {
return true;
} else {
return false;
}
}
return true;
}
- (instancetype)initWithParams:(GPTParams *)params {
self = [super init];
self->params = params;
// model = llama_init.model;
// ctx = llama_init.context;
//
// if model == nil {
// LOG_ERR("%s: error: unable to load model\n", __func__);
// return 1;
// }
//
os_log_info(OS_LOG_DEFAULT,
"%s: llama threadpool init, n_threads = %d\n",
__func__, params.cpuParams.nThreads);
if (params.embedding) {
os_log_error(OS_LOG_DEFAULT,
R"(************
please use the 'embedding' tool for embedding calculations
************)");
abort();
}
if (params.nCtx != 0 && params.nCtx < 8) {
os_log_info(OS_LOG_DEFAULT, "minimum context size is 8, using minimum size.");
params.nCtx = 8;
}
if (params.ropeFreqBase != 0) {
os_log_info(OS_LOG_DEFAULT, "changing RoPE frequency base to \(params.ropeFreqBase)");
}
if (params.ropeFreqScale != 0.0) {
os_log_info(OS_LOG_DEFAULT, "scaling RoPE frequency by \(params.ropeFreqScale)");
}
llama_backend_init();
llama_numa_init(ggml_numa_strategy(params.numaStrategy));
auto llama_init = llama_init_from_gpt_params([params params]);
auto tpp_batch = params.cpuParamsBatch.ggmlThreadpoolParams;
auto tpp = params.cpuParams.ggmlThreadpoolParams;
set_process_priority(ggml_sched_priority(params.cpuParams.priority));
GGMLThreadpool *threadpool_batch;
if (![tpp isEqual:tpp_batch]) { // compare threadpool params by value (see -isEqual: above)
threadpool_batch = [tpp_batch threadpool];
if (!threadpool_batch) {
[NSException raise:@"batch threadpool create failed"
format:@"batch threadpool create failed"];
}
// Start the non-batch threadpool in the paused state
tpp.paused = true;
}
GGMLThreadpool *threadpool = [tpp threadpool];
if (!threadpool) {
[NSException raise:@"threadpool create failed"
format:@"threadpool create failed"];
}
self.ctx = [[LlamaContext alloc] initWithContext:llama_init.context];
[self.ctx attachThreadpool:threadpool threadpoolBatch:threadpool_batch];
self.model = [[LlamaModel alloc] init:llama_init.model];
const int n_ctx_train = [self.model nCtxTrain];
n_ctx = [self.ctx nCtx];
//
if (n_ctx > n_ctx_train) {
os_log_info(OS_LOG_DEFAULT, "%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
}
// print chat template example in conversation mode
if (params.conversation) {
if (params.enableChatTemplate) {
os_log_info(OS_LOG_DEFAULT, "%s: chat template example:\n%s\n", __func__,
[[self.model formatExample:params.chatTemplate] cStringUsingEncoding:NSUTF8StringEncoding]);
} else {
os_log_info(OS_LOG_DEFAULT, "%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
}
}
// print system information
@autoreleasepool {
NSLog(@"%s", gpt_params_get_system_info([params params]).c_str());
}
pathSession = [[NSMutableString alloc] initWithString:params.pathPromptCache];
NSFileManager *fileManager = [NSFileManager defaultManager];
if ([pathSession length] != 0) {
os_log_info(OS_LOG_DEFAULT, "%s: attempting to load saved session from '%s'\n", __func__, [pathSession cStringUsingEncoding:NSUTF8StringEncoding]);
if (![fileManager fileExistsAtPath:pathSession]) {
os_log_info(OS_LOG_DEFAULT, "%s: session file does not exist, will create.\n", __func__);
} else if (file_is_empty(pathSession)) {
os_log_info(OS_LOG_DEFAULT,"%s: The session file is empty. A new session will be initialized.\n", __func__);
} else {
// The file exists and is not empty
session_tokens.resize(n_ctx);
size_t n_token_count_out = 0;
if (![self.ctx loadStateFile:pathSession tokensOut:session_tokens.data() nTokenCpacity:session_tokens.capacity() nTokenCountOut:&n_token_count_out]) {
[NSException raise:@"SessionLoadFailure" format:@"%s: failed to load session file '%s'\n", __func__, [pathSession cStringUsingEncoding:NSUTF8StringEncoding]];
}
session_tokens.resize(n_token_count_out);
os_log_info(OS_LOG_DEFAULT,"%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
}
}
BOOL addBOS = [self.model addBOSToken];
if (![self.model hasEncoder]) {
GGML_ASSERT(![self.model addEOSToken]);
}
os_log_debug(OS_LOG_DEFAULT, "n_ctx: %d, add_bos: %d\n", n_ctx, addBOS);
{
auto prompt = (params.conversation && params.enableChatTemplate && params.prompt.length > 0)
? [self chat_add_and_format:chat_msgs role:"system" content:[params params].prompt] // format the system prompt in conversation mode
: params.prompt;
if (params.interactiveFirst || [params.prompt length] > 0 || session_tokens.empty()) {
os_log_debug(OS_LOG_DEFAULT, "tokenize the prompt\n");
embd_inp = [self.ctx tokenize:prompt addSpecial:true parseSpecial:true];
} else {
os_log_debug(OS_LOG_DEFAULT,"use session tokens\n");
embd_inp = session_tokens;
}
os_log_debug(OS_LOG_DEFAULT,"prompt: \"%s\"\n", [prompt cStringUsingEncoding:NSUTF8StringEncoding]);
os_log_debug(OS_LOG_DEFAULT,"tokens: %s\n", [self.ctx convertTokensToString:embd_inp].c_str());
}
// Should not run without any tokens
if (embd_inp.empty()) {
if (addBOS) {
embd_inp.push_back([self.model tokenBOS]);
// LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
} else {
[NSException raise:@"InputEmptyError" format:@"input is empty"];
}
}
// Tokenize negative prompt
if (embd_inp.size() > n_ctx - 4) {
[NSException raise:@"PromptError" format:@"%s: prompt is too long (%d tokens, max %d)\n", __func__, (int)embd_inp.size(), n_ctx - 4];
}
// debug message about similarity of saved session, if applicable
size_t n_matching_session_tokens = 0;
if (!session_tokens.empty()) {
for (llama_token id : session_tokens) {
if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
break;
}
n_matching_session_tokens++;
}
if ([params.prompt length] == 0 && n_matching_session_tokens == embd_inp.size()) {
// LOG_INF("%s: using full prompt from session file\n", __func__);
} else if (n_matching_session_tokens >= embd_inp.size()) {
// LOG_INF("%s: session file has exact match for prompt!\n", __func__);
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
// LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
// __func__, n_matching_session_tokens, embd_inp.size());
} else {
// LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
// __func__, n_matching_session_tokens, embd_inp.size());
}
// remove any "future" tokens that we might have inherited from the previous session
llama_kv_cache_seq_rm([self.ctx cContext], -1, n_matching_session_tokens, -1);
}
//
// os_log_debug(OS_LOG_DEFAULT, "recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
// embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
//
// if we will use the cache for the full prompt without reaching the end of the cache, force
// reevaluation of the last token to recalculate the cached logits
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
// os_log_debug(OS_LOG_DEFAULT, "recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);
session_tokens.resize(embd_inp.size() - 1);
}
// number of tokens to keep when resetting context
if (params.nKeep < 0 || params.nKeep > (int) embd_inp.size()) {
params.nKeep = (int)embd_inp.size();
} else {
params.nKeep += addBOS; // always keep the BOS token
}
if (params.conversation) {
params.interactiveFirst = true;
}
// enable interactive mode if interactive start is specified
if (params.interactiveFirst) {
params.interactive = true;
}
if (params.verbosePrompt) {
// LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
// LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
os_log_info(OS_LOG_DEFAULT, "%6d -> '%s'\n", embd_inp[i],
[[self.ctx tokenToPiece:embd_inp[i]] cStringUsingEncoding:NSUTF8StringEncoding]);
}
if (params.nKeep > addBOS) {
// LOG_INF("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.nKeep; i++) {
os_log_debug(OS_LOG_DEFAULT, "%s",
[[self.ctx tokenToPiece:embd_inp[i]] cStringUsingEncoding:NSUTF8StringEncoding]);
}
// LOG("'\n");
}
// LOG_INF("\n");
}
//
// // ctrl+C handling
// {
//#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
// struct sigaction sigint_action;
// sigint_action.sa_handler = sigint_handler;
// sigemptyset (&sigint_action.sa_mask);
// sigint_action.sa_flags = 0;
// sigaction(SIGINT, &sigint_action, NULL);
//#elif defined (_WIN32)
// auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
// return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
// };
// SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
//#endif
// }
//
if (params.interactive) {
os_log_info(OS_LOG_DEFAULT, "%s: interactive mode on.\n", __func__);
if ([params.antiPrompts count] > 0) {
for (NSString *antiprompt in params.antiPrompts) {
os_log_info(OS_LOG_DEFAULT, "Reverse prompt: '%s'\n", [antiprompt cStringUsingEncoding:NSUTF8StringEncoding]);
if (params.verbosePrompt) {
auto tmp = [_ctx tokenize:antiprompt
addSpecial:false
parseSpecial:true];
for (int i = 0; i < (int) tmp.size(); i++) {
os_log_info(OS_LOG_DEFAULT, "%6d -> '%s'\n", tmp[i], [[self.ctx tokenToPiece:tmp[i]] cStringUsingEncoding:NSUTF8StringEncoding]);
}
}
}
}
if (params.inputPrefixBOS) {
os_log_info(OS_LOG_DEFAULT, "Input prefix with BOS\n");
}
if ([params.inputPrefix length] > 0) {
os_log_info(OS_LOG_DEFAULT, "Input prefix: '%s'\n", [params.inputPrefix cStringUsingEncoding:NSUTF8StringEncoding]);
if (params.verbosePrompt) {
auto tmp = [_ctx tokenize:params.inputPrefix addSpecial:true parseSpecial:true];
for (int i = 0; i < (int) tmp.size(); i++) {
os_log_info(OS_LOG_DEFAULT, "%6d -> '%s'\n",
tmp[i], [[self.ctx tokenToPiece:tmp[i]] cStringUsingEncoding:NSUTF8StringEncoding]);
}
}
}
if ([params.inputSuffix length] > 0) {
os_log_info(OS_LOG_DEFAULT, "Input suffix: '%s'\n", [params.inputSuffix cStringUsingEncoding:NSUTF8StringEncoding]);
if (params.verbosePrompt) {
auto tmp = [_ctx tokenize:params.inputSuffix addSpecial:false parseSpecial:true];
for (int i = 0; i < (int) tmp.size(); i++) {
os_log_info(OS_LOG_DEFAULT, "%6d -> '%s'\n",
tmp[i], [[self.ctx tokenToPiece:tmp[i]] cStringUsingEncoding:NSUTF8StringEncoding]);
}
}
}
}
smpl = [[GPTSampler alloc] init:_model gptSamplerParams:[params samplerParams]];
if (!smpl) {
[NSException raise:@"SamplingFailure" format:@"failed to initialize sampling subsystem"];
}
os_log_info(OS_LOG_DEFAULT, "sampler seed: %u\n", [smpl seed]);
// LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
// LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
//
// LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
//
// group-attention state
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
ga_n = params.grpAttnN;
ga_w = params.grpAttnW;
if (ga_n != 1) {
GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
os_log_info(OS_LOG_DEFAULT, "self-extend: n_ctx_train = %d, grp_attn_n = %ld, grp_attn_w = %ld\n", n_ctx_train, static_cast<long>(ga_n), static_cast<long>(ga_w));
}
if (params.interactive) {
const char * control_message;
if (params.multilineInput) {
control_message = " - To return control to the AI, end your input with '\\'.\n"
" - To return control without starting a new line, end your input with '/'.\n";
} else {
control_message = " - Press Return to return control to the AI.\n"
" - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n";
}
isInteracting = params.interactiveFirst;
}
is_antiprompt = false;
input_echo = true;
display = true;
need_to_save_session = [pathSession length] > 0 && n_matching_session_tokens < embd_inp.size();
n_remain = params.nPredict;
// // the first thing we will do is to output the prompt, so set color accordingly
// console::set_display(console::prompt);
// display = params.display_prompt;
//
antiprompt_ids.reserve([params.antiPrompts count]);
for (NSString *antiprompt in params.antiPrompts) {
antiprompt_ids.emplace_back([self.ctx tokenize:antiprompt addSpecial:false parseSpecial:true]);
}
if ([self.model hasEncoder]) {
int enc_input_size = embd_inp.size();
llama_token * enc_input_buf = embd_inp.data();
if ([_ctx encode:llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0)]) {
[NSException raise:@"EvalFailure" format:@"failed to eval"];
}
llama_token decoder_start_token_id = llama_model_decoder_start_token([self.model cModel]);
if (decoder_start_token_id == -1) {
decoder_start_token_id = [self.model tokenBOS];
}
embd_inp.clear();
embd_inp.push_back(decoder_start_token_id);
}
return self;
}
- (void)start:(BlockingLineQueue *)queue {
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
if (!embd.empty()) {
// Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value.
int max_embd_size = n_ctx - 4;
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
if ((int) embd.size() > max_embd_size) {
const int skipped_tokens = (int) embd.size() - max_embd_size;
embd.resize(max_embd_size);
// console::set_display(console::error);
os_log_error(OS_LOG_DEFAULT, "<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
// console::set_display(console::reset);
}
if (params.grpAttnN == 1) {
// infinite text generation via context shifting
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() >= [_ctx nCtx]) {
if (!params.ctxShift) {
os_log_debug(OS_LOG_DEFAULT, "\n\n%s: context full and context shift is disabled => stopping\n", __func__);
break;
} else {
if (params.nPredict == -2) {
os_log_debug(OS_LOG_DEFAULT, "\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.nPredict);
break;
}
const int n_left = n_past - params.nKeep;
const int n_discard = n_left/2;
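// worked example (illustrative numbers): with n_ctx = 512, n_keep = 32 and n_past = 512,
// n_left = 480 and n_discard = 240, so tokens [32, 272) are removed, tokens [272, 512)
// are shifted down by 240 positions, and n_past ends up at 272.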
os_log_debug(OS_LOG_DEFAULT, "context full, swapping: n_past = %d, n_left = %d, n_ctx = %lu, n_keep = %d, n_discard = %d\n",
n_past, n_left, static_cast<unsigned long>([_ctx nCtx]), params.nKeep, n_discard);
llama_kv_cache_seq_rm ([self.ctx cContext], 0, params.nKeep , params.nKeep + n_discard);
llama_kv_cache_seq_add([self.ctx cContext], 0, params.nKeep + n_discard, n_past, -n_discard);
n_past -= n_discard;
os_log_debug(OS_LOG_DEFAULT, "after swap: n_past = %d\n", n_past);
os_log_debug(OS_LOG_DEFAULT, "embd: %s\n", [self.ctx convertTokensToString:embd].c_str());
os_log_debug(OS_LOG_DEFAULT, "clear session path\n");
[pathSession setString:@""];
}
}
} else {
// context extension via Self-Extend
while (n_past >= ga_i + ga_w) {
const int ib = (ga_n*ga_i)/ga_w;
const int bd = (ga_w/ga_n)*(ga_n - 1);
const int dd = (ga_w/ga_n) - ib*bd - ga_w;
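// worked example (illustrative numbers): on the first pass with ga_n = 4, ga_w = 512,
// ga_i = 0 and n_past = 600: ib = 0, bd = 384, dd = -384, so tokens [0, 512) are grouped
// (divided) by 4 into [0, 128), tokens [512, 600) are shifted to [128, 216), and afterwards
// n_past = 216 and ga_i = 128.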
os_log_debug(OS_LOG_DEFAULT, "\n");
os_log_debug(OS_LOG_DEFAULT, "shift: [%6ld, %6d] + %6d -> [%6ld, %6d]\n", static_cast<long>(ga_i), n_past, ib*bd, static_cast<long>(ga_i + ib*bd), n_past + ib*bd);
os_log_debug(OS_LOG_DEFAULT, "div: [%6ld, %6ld] / %6ld -> [%6ld, %6ld]\n", static_cast<long>(ga_i + ib*bd), static_cast<long>(ga_i + ib*bd + ga_w), static_cast<long>(ga_n), static_cast<long>((ga_i + ib*bd)/ga_n), static_cast<long>((ga_i + ib*bd + ga_w)/ga_n));
os_log_debug(OS_LOG_DEFAULT, "shift: [%6ld, %6d] + %6d -> [%6ld, %6d]\n", static_cast<long>(ga_i + ib*bd + ga_w), n_past + ib*bd, dd, static_cast<long>(ga_i + ib*bd + ga_w + dd), n_past + ib*bd + dd);
[self.ctx kvCacheSeqAdd:0 p0:ga_i p1:n_past delta:ib*bd];
[self.ctx kvCacheSeqDiv:0 p0:ga_i + ib*bd p1:ga_i + ib*bd + ga_w delta:ga_n];
[self.ctx kvCacheSeqAdd:0 p0:ga_i + ib*bd + ga_w p1:n_past + ib*bd delta:dd];
n_past -= bd;
ga_i += ga_w/ga_n;
os_log_debug(OS_LOG_DEFAULT, "\nn_past_old = %d, n_past = %d, ga_i = %ld\n\n", n_past + bd, n_past, static_cast<long>(ga_i));
}
}
// try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
if (n_session_consumed < (int) session_tokens.size()) {
size_t i = 0;
for ( ; i < embd.size(); i++) {
if (embd[i] != session_tokens[n_session_consumed]) {
session_tokens.resize(n_session_consumed);
break;
}
n_past++;
n_session_consumed++;
if (n_session_consumed >= (int) session_tokens.size()) {
++i;
break;
}
}
if (i > 0) {
embd.erase(embd.begin(), embd.begin() + i);
}
}
for (int i = 0; i < (int) embd.size(); i += params.nBatch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.nBatch) {
n_eval = params.nBatch;
}
os_log_debug(OS_LOG_DEFAULT, "eval: %s\n", [self.ctx convertTokensToString:embd].c_str());
if ([self.ctx decode:[[LlamaBatch alloc] initWithBatch:llama_batch_get_one(&embd[i], n_eval, n_past, 0)] ]) {
[NSException raise:@"EvalFailure" format:@"failed to eval"];
}
n_past += n_eval;
os_log_debug(OS_LOG_DEFAULT, "n_past = %d\n", n_past);
// Display total tokens alongside total time
if (params.nPrint > 0 && n_past % params.nPrint == 0) {
os_log_debug(OS_LOG_DEFAULT, "\n\033[31mTokens consumed so far = %d / %lu \033[0m\n", n_past, static_cast<unsigned long>([self.ctx nCtx]));
}
}
if (!embd.empty() && [pathSession length] > 0) {
session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
n_session_consumed = session_tokens.size();
}
}
embd.clear();
if ((int) embd_inp.size() <= n_consumed && !isInteracting) {
// optionally save the session on first sample (for faster prompt loading next time)
if ([pathSession length] > 0 && need_to_save_session && !params.promptCacheRO) {
need_to_save_session = false;
[self.ctx saveStateFile:pathSession tokens:session_tokens.data() nTokenCount:session_tokens.size()];
// llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
os_log_debug(OS_LOG_DEFAULT, "saved session to %s\n", [pathSession cStringUsingEncoding:NSUTF8StringEncoding]);
}
const llama_token idToken = [smpl sample:self.ctx index:-1];
[smpl accept:idToken acceptGrammar:true];
// os_log_debug(OS_LOG_DEFAULT, "last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
embd.push_back(idToken);
// echo this to console
input_echo = true;
// decrement remaining sampling budget
--n_remain;
os_log_debug(OS_LOG_DEFAULT, "n_remain: %d\n", n_remain);
} else {
// some user input remains from prompt or interaction, forward it to processing
os_log_debug(OS_LOG_DEFAULT, "embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
[smpl accept:embd_inp[n_consumed] acceptGrammar:false];
++n_consumed;
if ((int) embd.size() >= params.nBatch) {
break;
}
}
}
// display text
if (input_echo && display) {
// std::cout<< "DISPLAYING TEXT" << std::endl;
for (auto idToken : embd) {
NSString *token_str = [self.ctx tokenToPiece:idToken special:params.special];
// Console/Stream Output
os_log_info(OS_LOG_DEFAULT, "%s", [token_str cStringUsingEncoding:NSUTF8StringEncoding]);
// Record Displayed Tokens To Log
// Note: Generated tokens are created one by one hence this check
if (embd.size() > 1) {
// Incoming Requested Tokens
input_tokens.push_back(idToken);
} else {
// Outgoing Generated Tokens
output_tokens.push_back(idToken);
output_ss << [token_str cStringUsingEncoding:NSUTF8StringEncoding];
last_output_ss << [token_str cStringUsingEncoding:NSUTF8StringEncoding];
}
}
if (!last_output_ss.str().empty()) {
// queue->addOutputLine(last_output_ss.str());
}
}
// reset color to default if there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
if (!last_output_ss.str().empty()) {
// queue->addOutputLine(last_output_ss.str());
}
// console::set_display(console::reset);
display = true;
}
// if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) {
// check for reverse prompt in the last n_prev tokens
if ([params.antiPrompts count] > 0) {
const int n_prev = 32;
NSString *last_output = [smpl previousString:self.ctx n:n_prev];
is_antiprompt = false;
// Check if each of the reverse prompts appears at the end of the output.
// If we're not running interactively, the reverse prompt might be tokenized with some following characters
// so we'll compensate for that by widening the search window a bit.
for (NSString *antiprompt in params.antiPrompts) {
size_t extra_padding = params.interactive ? 0 : 2;
size_t search_start_pos = [last_output length] > static_cast<size_t>([antiprompt length] + extra_padding)
? [last_output length] - static_cast<size_t>([antiprompt length] + extra_padding)
: 0;
// TODO: Check if correct
if ([last_output rangeOfString:antiprompt options:0 range:NSMakeRange(search_start_pos, last_output.length - search_start_pos)].location != NSNotFound) {
if (params.interactive) {
isInteracting = true;
}
is_antiprompt = true;
break;
}
}
// check for reverse prompt using special tokens
llama_token last_token = [smpl last];
for (std::vector<llama_token> ids : antiprompt_ids) {
if (ids.size() == 1 && last_token == ids[0]) {
if (params.interactive) {
isInteracting = true;
}
is_antiprompt = true;
break;
}
}
if (is_antiprompt) {
os_log_debug(OS_LOG_DEFAULT, "found antiprompt: %s\n", [last_output cStringUsingEncoding:NSUTF8StringEncoding]);
}
}
// deal with end of generation tokens in interactive mode
if ([self.model tokenIsEOG:[smpl last]]) {
os_log_debug(OS_LOG_DEFAULT, "found an EOG token\n");
if (params.interactive) {
if ([[params antiPrompts] count] > 0) {
// tokenize and inject first reverse prompt
const auto first_antiprompt = [self.ctx tokenize:params.antiPrompts[0] addSpecial:false parseSpecial:true];
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
is_antiprompt = true;
}
if (params.enableChatTemplate) {
[self chat_add_and_format:chat_msgs
role:"assistant"
content:assistant_ss.str()];
}
isInteracting = true;
// LOG("\n");
}
}
// if current token is not EOG, we add it to current assistant message
if (params.conversation) {
const auto idToken = [smpl last];
assistant_ss << [[self.ctx tokenToPiece:idToken special:false] cStringUsingEncoding:NSUTF8StringEncoding];
}
if (n_past > 0 && isInteracting) {
os_log_debug(OS_LOG_DEFAULT, "waiting for user input\n");
if (params.conversation) {
// osLog_("\n> ");
}
if (params.inputPrefixBOS) {
os_log_debug(OS_LOG_DEFAULT, "adding input prefix BOS token\n");
embd_inp.push_back([self.model tokenBOS]);
}
std::string buffer;
if ([params.inputPrefix length] > 0 && !params.conversation) {
os_log_debug(OS_LOG_DEFAULT, "appending input prefix: '%s'\n", [params.inputPrefix cStringUsingEncoding:NSUTF8StringEncoding]);
os_log_info(OS_LOG_DEFAULT, "%s", [params.inputPrefix cStringUsingEncoding:NSUTF8StringEncoding]);
}
// color user input only
// console::set_display(console::user_input);
display = params.displayPrompt;
std::string line;
// bool another_line = true;
static int read_one = 0;
// if (!read_one) {
// do {
// another_line = false;// console::readline(line, params.multiline_input);
// buffer += "What is the weather in New York?";//line;
// } while (another_line);
// read_one++;
// }
// else {
if (!last_output_ss.str().empty()) {
auto str = last_output_ss.str();
last_output_ss.str("");
[queue addOutputLine:[NSString stringWithCString:str.c_str() encoding:NSUTF8StringEncoding]];
}
buffer = [[queue inputLine] cStringUsingEncoding:NSUTF8StringEncoding];
// do {
// another_line = console::readline(line, params.multiline_input);
// buffer += line;
// } while (another_line);
// }
// done taking input, reset color
// console::set_display(console::reset);
display = true;
// Add tokens to embd only if the input buffer is non-empty
// Entering an empty line lets the user pass control back
if (buffer.length() > 1) {
// append input suffix if any
if ([params.inputSuffix length] > 0 && !params.conversation) {
os_log_debug(OS_LOG_DEFAULT, "appending input suffix: '%s'\n", [params.inputSuffix cStringUsingEncoding:NSUTF8StringEncoding]);
os_log_info(OS_LOG_DEFAULT, "%s", [params.inputSuffix cStringUsingEncoding:NSUTF8StringEncoding]);
}
os_log_debug(OS_LOG_DEFAULT, "buffer: '%s'\n", buffer.c_str());
const size_t original_size = embd_inp.size();
if (params.escapeSequences) {
string_process_escapes(buffer);
}
bool format_chat = params.conversation && params.enableChatTemplate;
std::string user_inp = format_chat
? [[self chat_add_and_format:chat_msgs role:"user" content:std::move(buffer)] cStringUsingEncoding:NSUTF8StringEncoding]
: std::move(buffer);
// TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
const auto line_pfx = [self.ctx tokenize:params.inputPrefix addSpecial:false parseSpecial:true];
const auto line_inp = [self.ctx tokenize:[NSString stringWithCString:user_inp.c_str()
encoding:NSUTF8StringEncoding]
addSpecial:false
parseSpecial:format_chat];
const auto line_sfx = [self.ctx tokenize:params.inputSuffix
addSpecial:false
parseSpecial:true];
os_log_debug(OS_LOG_DEFAULT, "input tokens: %s\n", [self.ctx convertTokensToString:line_inp].c_str());
// if user stop generation mid-way, we must add EOT to finish model's last response
if (need_insert_eot && format_chat) {
llama_token eot = [self.model tokenEOT];
embd_inp.push_back(eot == -1 ? [self.model tokenEOS] : eot);
need_insert_eot = false;
}
embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
output_tokens.push_back(token);
output_ss << [[self.ctx tokenToPiece:token] cStringUsingEncoding:NSUTF8StringEncoding];
}
// reset assistant message
assistant_ss.str("");
n_remain -= line_inp.size();
os_log_debug(OS_LOG_DEFAULT, "n_remain: %d\n", n_remain);
} else {
os_log_debug(OS_LOG_DEFAULT, "empty line, passing control back\n");
}
input_echo = false; // do not echo this again
}
if (n_past > 0) {
if (isInteracting) {
[smpl reset];
}
isInteracting = false;
}
}
// end of generation
if (!embd.empty() && [self.model tokenIsEOG:embd.back()] && !(params.interactive)) {
os_log_info(OS_LOG_DEFAULT, " [end of text]\n");
break;
}
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
// We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
if (params.interactive && n_remain <= 0 && params.nPredict >= 0) {
n_remain = params.nPredict;
isInteracting = true;
}
}
}
@end
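
Putting the pieces together, a host application would typically construct the session on one thread and exchange lines with it through a BlockingLineQueue. A sketch under the assumption that interactive mode is enabled and the LlamaObjC headers are imported (the dispatch queue and model path are illustrative only):

GPTParams *params = [[GPTParams alloc] init];
params.modelPath = @"/path/to/model.gguf"; // placeholder
params.interactive = YES;                  // keep the generation loop waiting for input

LlamaSession *session = [[LlamaSession alloc] initWithParams:params];
BlockingLineQueue *queue = [[BlockingLineQueue alloc] init];

dispatch_async(dispatch_get_global_queue(QOS_CLASS_USER_INITIATED, 0), ^{
    [session start:queue]; // runs the prediction loop until it blocks on input
});

[queue addInputLine:@"What is the capital of France?"];
NSString *reply = [queue outputLine]; // blocks until the model produces a response
NSLog(@"%@", reply);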

objc/include/GPTParams.h (new file, 264 lines)

@@ -0,0 +1,264 @@
#ifndef GPTParams_h
#define GPTParams_h
#import <Foundation/Foundation.h>
@class LlamaModelParams;
@class LlamaContextParams;
@class GGMLThreadpool;
// Define the ggml_sched_priority enum
typedef NS_ENUM(NSInteger, GGMLSchedPriority) {
GGMLSchedPriorityNormal = 0, // Normal priority
GGMLSchedPriorityMedium = 1, // Medium priority
GGMLSchedPriorityHigh = 2, // High priority
GGMLSchedPriorityRealtime = 3 // Realtime priority
};
@interface GGMLThreadpoolParams : NSObject
@property (nonatomic, assign) int nThreads;
@property (nonatomic, assign) GGMLSchedPriority priority;
@property (nonatomic, assign) uint32_t poll;
@property (nonatomic, assign) BOOL strictCPU;
@property (nonatomic, assign) BOOL paused;
// Custom access methods for the cpumask array
- (BOOL)getCpuMaskAtIndex:(NSUInteger)index;
- (void)setCpuMask:(BOOL)value atIndex:(NSUInteger)index;
- (GGMLThreadpool *)threadpool;
@end
@interface GGMLThreadpool : NSObject
@end
@interface CPUParams : NSObject
// Properties
@property (nonatomic, assign) int nThreads;
@property (nonatomic, assign) BOOL maskValid;
@property (nonatomic, assign) GGMLSchedPriority priority;
@property (nonatomic, assign) BOOL strictCPU;
@property (nonatomic, assign) uint32_t poll;
// Custom methods to access or manipulate the cpumask array
- (BOOL)getCpuMaskAtIndex:(NSUInteger)index;
- (void)setCpuMask:(BOOL)value atIndex:(NSUInteger)index;
- (GGMLThreadpoolParams *)ggmlThreadpoolParams;
@end
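// Illustrative use from Swift (a sketch; `params` is assumed to be a configured GPTParams
// and `context` a LlamaContext; the method names are bridged from this header):
//
//     let tpParams = params.cpuParams.ggmlThreadpoolParams()
//     let threadpool = tpParams.threadpool()
//     context.attachThreadpool(threadpool, threadpoolBatch: threadpool)
//
// attachThreadpool(_:threadpoolBatch:) is declared on LlamaContext in objc/include/LlamaContext.h.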
@interface GPTSamplerParams : NSObject
// Properties corresponding to C++ struct fields
@property (nonatomic, assign) uint32_t seed;
@property (nonatomic, assign) int32_t nPrev;
@property (nonatomic, assign) int32_t nProbs;
@property (nonatomic, assign) int32_t minKeep;
@property (nonatomic, assign) int32_t topK;
@property (nonatomic, assign) float topP;
@property (nonatomic, assign) float minP;
@property (nonatomic, assign) float tfsZ;
@property (nonatomic, assign) float typP;
@property (nonatomic, assign) float temp;
@property (nonatomic, assign) float dynatempRange;
@property (nonatomic, assign) float dynatempExponent;
@property (nonatomic, assign) int32_t penaltyLastN;
@property (nonatomic, assign) float penaltyRepeat;
@property (nonatomic, assign) float penaltyFreq;
@property (nonatomic, assign) float penaltyPresent;
@property (nonatomic, assign) int32_t mirostat;
@property (nonatomic, assign) float mirostatTau;
@property (nonatomic, assign) float mirostatEta;
@property (nonatomic, assign) BOOL penalizeNl;
@property (nonatomic, assign) BOOL ignoreEos;
@property (nonatomic, assign) BOOL noPerf;
// Arrays and Strings
@property (nonatomic, strong) NSArray<NSNumber *> *samplers; // Samplers mapped to NSArray of NSNumber (for enums)
@property (nonatomic, copy) NSString *grammar; // Grammar as NSString
@property (nonatomic, strong) NSArray<NSNumber *> *logitBias; // Logit biases mapped to NSArray of NSNumber
// Method to print the parameters into a string
- (NSString *)print;
@end
@interface GPTParams : NSObject
@property (nonatomic, assign) int32_t nPredict;
@property (nonatomic, assign) NSInteger nCtx;
@property (nonatomic, assign) int32_t nBatch;
@property (nonatomic, assign) int32_t nUBatch;
@property (nonatomic, assign) int32_t nKeep;
@property (nonatomic, assign) int32_t nDraft;
@property (nonatomic, assign) int32_t nChunks;
@property (nonatomic, assign) int32_t nParallel;
@property (nonatomic, assign) int32_t nSequences;
@property (nonatomic, assign) float pSplit;
@property (nonatomic, assign) int32_t nGpuLayers;
@property (nonatomic, assign) int32_t nGpuLayersDraft;
@property (nonatomic, assign) int32_t mainGpu;
@property (nonatomic, strong) NSMutableArray<NSNumber *> *tensorSplit; // Fixed-size array, stays the same
@property (nonatomic, assign) int32_t grpAttnN;
@property (nonatomic, assign) int32_t grpAttnW;
@property (nonatomic, assign) int32_t nPrint;
@property (nonatomic, assign) float ropeFreqBase;
@property (nonatomic, assign) float ropeFreqScale;
@property (nonatomic, assign) float yarnExtFactor;
@property (nonatomic, assign) float yarnAttnFactor;
@property (nonatomic, assign) float yarnBetaFast;
@property (nonatomic, assign) float yarnBetaSlow;
@property (nonatomic, assign) int32_t yarnOrigCtx;
@property (nonatomic, assign) float defragThold;
// Objective-C counterpart of the C++ `cpu_params` struct
@property (nonatomic, strong) CPUParams *cpuParams;
@property (nonatomic, strong) CPUParams *cpuParamsBatch;
@property (nonatomic, strong) CPUParams *draftCpuParams;
@property (nonatomic, strong) CPUParams *draftCpuParamsBatch;
// Callbacks (assuming they are blocks in Objective-C)
@property (nonatomic, copy) void (^cbEval)(void *);
@property (nonatomic, assign) void *cbEvalUserData;
@property (nonatomic, assign) NSInteger numaStrategy; // Enumerations
@property (nonatomic, assign) NSInteger splitMode;
@property (nonatomic, assign) NSInteger ropeScalingType;
@property (nonatomic, assign) NSInteger poolingType;
@property (nonatomic, assign) NSInteger attentionType;
// Sampler parameters as an Objective-C object
@property (nonatomic, strong) GPTSamplerParams *samplerParams;
@property (nonatomic, copy) NSString *modelPath;
@property (nonatomic, copy) NSString *modelDraft;
@property (nonatomic, copy) NSString *modelAlias;
@property (nonatomic, copy) NSString *modelURL;
@property (nonatomic, copy) NSString *hfToken;
@property (nonatomic, copy) NSString *hfRepo;
@property (nonatomic, copy) NSString *hfFile;
@property (nonatomic, copy) NSString *prompt;
@property (nonatomic, copy) NSString *promptFile;
@property (nonatomic, copy) NSString *pathPromptCache;
@property (nonatomic, copy) NSString *inputPrefix;
@property (nonatomic, copy) NSString *inputSuffix;
@property (nonatomic, copy) NSString *logdir;
@property (nonatomic, copy) NSString *lookupCacheStatic;
@property (nonatomic, copy) NSString *lookupCacheDynamic;
@property (nonatomic, copy) NSString *logitsFile;
@property (nonatomic, copy) NSString *rpcServers;
// Arrays in Objective-C are represented with `NSArray`
@property (nonatomic, strong) NSArray<NSString *> *inputFiles;
@property (nonatomic, strong) NSArray<NSString *> *antiPrompts;
@property (nonatomic, strong) NSArray *kvOverrides;
// Boolean values (in Objective-C, use `BOOL`)
@property (nonatomic, assign) BOOL loraInitWithoutApply;
@property (nonatomic, strong) NSArray *loraAdapters;
@property (nonatomic, strong) NSArray *controlVectors;
// Control params
@property (nonatomic, assign) int32_t verbosity;
@property (nonatomic, assign) int32_t controlVectorLayerStart;
@property (nonatomic, assign) int32_t controlVectorLayerEnd;
// Performance and configuration params
@property (nonatomic, assign) int32_t pplStride;
@property (nonatomic, assign) int32_t pplOutputType;
@property (nonatomic, assign) BOOL hellaswag;
@property (nonatomic, assign) size_t hellaswagTasks;
@property (nonatomic, assign) BOOL winogrande;
@property (nonatomic, assign) size_t winograndeTasks;
@property (nonatomic, assign) BOOL multipleChoice;
@property (nonatomic, assign) size_t multipleChoiceTasks;
@property (nonatomic, assign) BOOL klDivergence;
@property (nonatomic, assign) BOOL usage;
@property (nonatomic, assign) BOOL useColor;
@property (nonatomic, assign) BOOL special;
@property (nonatomic, assign) BOOL interactive;
@property (nonatomic, assign) BOOL interactiveFirst;
@property (nonatomic, assign) BOOL conversation;
@property (nonatomic, assign) BOOL promptCacheAll;
@property (nonatomic, assign) BOOL promptCacheRO;
@property (nonatomic, assign) BOOL escapeSequences;
@property (nonatomic, assign) BOOL multilineInput;
@property (nonatomic, assign) BOOL simpleIO;
@property (nonatomic, assign) BOOL continuousBatching;
@property (nonatomic, assign) BOOL flashAttention;
@property (nonatomic, assign) BOOL noPerformanceMetrics;
@property (nonatomic, assign) BOOL contextShift;
// Server and I/O settings
@property (nonatomic, assign) int32_t port;
@property (nonatomic, assign) int32_t timeoutRead;
@property (nonatomic, assign) int32_t timeoutWrite;
@property (nonatomic, assign) int32_t httpThreads;
@property (nonatomic, copy) NSString *hostname;
@property (nonatomic, copy) NSString *publicPath;
@property (nonatomic, copy) NSString *chatTemplate;
@property (nonatomic, copy) NSString *systemPrompt;
@property (nonatomic, assign) BOOL enableChatTemplate;
@property (nonatomic, strong) NSArray<NSString *> *apiKeys;
@property (nonatomic, copy) NSString *sslFileKey;
@property (nonatomic, copy) NSString *sslFileCert;
@property (nonatomic, assign) BOOL endpointSlots;
@property (nonatomic, assign) BOOL endpointMetrics;
@property (nonatomic, assign) BOOL logJSON;
@property (nonatomic, copy) NSString *slotSavePath;
@property (nonatomic, assign) float slotPromptSimilarity;
// batched-bench params
@property (nonatomic, assign) BOOL isPPShared;
@property (nonatomic, strong) NSArray<NSNumber *> *nPP;
@property (nonatomic, strong) NSArray<NSNumber *> *nTG;
@property (nonatomic, strong) NSArray<NSNumber *> *nPL;
// retrieval params
@property (nonatomic, strong) NSArray<NSString *> *contextFiles;
@property (nonatomic, assign) int32_t chunkSize;
@property (nonatomic, copy) NSString *chunkSeparator;
// passkey params
@property (nonatomic, assign) int32_t nJunk;
@property (nonatomic, assign) int32_t iPos;
// imatrix params
@property (nonatomic, copy) NSString *outFile;
@property (nonatomic, assign) int32_t nOutFreq;
@property (nonatomic, assign) int32_t nSaveFreq;
@property (nonatomic, assign) int32_t iChunk;
@property (nonatomic, assign) BOOL processOutput;
@property (nonatomic, assign) BOOL computePPL;
// cvector-generator params
@property (nonatomic, assign) int nPCABatch;
@property (nonatomic, assign) int nPCAIterations;
@property (nonatomic, assign) int cvectorDimreMethod;
@property (nonatomic, copy) NSString *cvectorOutfile;
@property (nonatomic, copy) NSString *cvectorPositiveFile;
@property (nonatomic, copy) NSString *cvectorNegativeFile;
@property (nonatomic, assign) BOOL spmInfill;
@property (nonatomic, copy) NSString *loraOutfile;
@property (nonatomic, assign) BOOL embedding;
@property (nonatomic, assign) BOOL verbosePrompt; // print prompt tokens before generation
@property (nonatomic, assign) BOOL batchedBenchOutputJSONL;
@property (nonatomic, assign) BOOL inputPrefixBOS; // prefix BOS to user inputs, preceding input_prefix
@property (nonatomic, assign) BOOL ctxShift; // context shift on infinite text generation
@property (nonatomic, assign) BOOL displayPrompt; // print prompt before generation
- (LlamaModelParams *)llamaModelParams;
- (LlamaContextParams *)llamaContextParams;
@end
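// Typical configuration from Swift (a sketch; the values mirror swift/main/main.swift and the
// model path is a placeholder):
//
//     let params = GPTParams()
//     params.modelPath = "/path/to/model.gguf"
//     params.nPredict = 512
//     params.nCtx = 4096
//     params.cpuParams.nThreads = 8
//     params.cpuParamsBatch.nThreads = 8
//     params.nBatch = 1024
//     params.nGpuLayers = 1024
//     let session = LlamaSession(params: params)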
#endif /* GPTParams_h */

View file

@@ -0,0 +1,25 @@
#ifndef GPTParams_Private_hpp
#define GPTParams_Private_hpp
#import "GPTParams.h"
#import "ggml.h"
#import "../../common/common.h"
@interface GGMLThreadpool()
- (ggml_threadpool *)threadpool;
@end
@interface GPTParams()
- (gpt_params&)params;
@end
@interface GPTSamplerParams()
- (gpt_sampler_params&)cParams;
@end
#endif /* GPTParams_Private_hpp */

55
objc/include/GPTSampler.h Normal file
View file

@@ -0,0 +1,55 @@
#ifndef GPTSampler_h
#define GPTSampler_h
@class LlamaModel;
@class GPTSamplerParams;
@class LlamaContext;
typedef int32_t LlamaToken;
@interface GPTSampler : NSObject
- (instancetype)init:(LlamaModel *)model gptSamplerParams:(GPTSamplerParams *)gptSamplerParams;
- (uint32_t)seed;
// extended sampling implementation:
//
// - set logits
// - apply the configured sampler chain
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
- (LlamaToken)sample:(LlamaContext *)context
index:(NSInteger) index;
// extended sampling implementation:
//
// - set logits
// - apply the configured sampler chain
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
- (LlamaToken)sample:(LlamaContext *)context
index:(NSInteger) index
grammarFirst:(BOOL)grammarFirst;
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
- (void)accept:(LlamaToken)token
acceptGrammar:(BOOL)acceptGrammar;
// get a string representation of the last accepted tokens
- (NSString *)previousString:(LlamaContext *)context n:(NSInteger)n;
// get the last accepted token
- (LlamaToken)last;
- (void)reset;
@end
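// Sketch of a sample/accept loop from Swift (assumptions: `ctx` is a LlamaContext, `model` a
// LlamaModel, `params` a GPTSamplerParams, and index -1 addresses the most recent logits as in
// llama.cpp; the selector bridging is approximate):
//
//     let sampler = GPTSampler(model, gptSamplerParams: params)
//     var done = false
//     while !done {
//         let token = sampler.sample(ctx, index: -1)
//         sampler.accept(token, acceptGrammar: true)
//         if model.tokenIsEOG(token) { done = true }
//         // feed `token` back through LlamaContext.decode(...) for the next step
//     }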
#endif /* GPTSampler_h */

34
objc/include/LlamaBatch.h Normal file
View file

@@ -0,0 +1,34 @@
#ifndef LlamaBatch_h
#define LlamaBatch_h
typedef NSInteger LlamaSequenceId;
typedef NSInteger LlamaPosition;
typedef int32_t LlamaToken;
// Input data for llama_decode
// A llama_batch object can contain input about one or many sequences
// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
//
// - token : the token ids of the input (used when embd is NULL)
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
// - pos : the positions of the respective token in the sequence
// - seq_id : the sequence to which the respective token belongs
// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
@interface LlamaBatch : NSObject
@property (nonatomic, assign) NSInteger nTokens;
@property (nonatomic, assign) LlamaToken *tokens;
@property (nonatomic, assign) float *embd;
@property (nonatomic, assign) LlamaPosition *pos;
@property (nonatomic, assign) int32_t *nSeqId;
@property (nonatomic, assign) LlamaSequenceId **seqId;
@property (nonatomic, assign) NSData *output;
// Helpers for smooth API transition (optional usage in the interface)
@property (nonatomic, assign) LlamaPosition allPos0;
@property (nonatomic, assign) LlamaPosition allPos1;
@property (nonatomic, assign) LlamaSequenceId allSeqId;
@end
#endif /* LlamaBatch_h */

View file

@@ -0,0 +1,13 @@
#ifndef LlamaBatch_Private_hpp
#define LlamaBatch_Private_hpp
#import "LlamaBatch.h"
#import "llama.h"
@interface LlamaBatch()
- (instancetype)initWithBatch:(llama_batch)batch;
- (llama_batch)cBatch;
@end
#endif /* LlamaBatch_Private_hpp */

View file

@@ -0,0 +1,57 @@
#ifndef LlamaContext_h
#define LlamaContext_h
@class GGMLThreadpool;
@class LlamaBatch;
typedef NSInteger LlamaSequenceId;
typedef NSInteger LlamaPosition;
typedef int32_t LlamaToken;
@interface LlamaContext : NSObject
- (void)attachThreadpool:(GGMLThreadpool *)threadpool
threadpoolBatch:(GGMLThreadpool *)threadpoolBatch;
- (NSUInteger)nCtx;
// Positive return values do not indicate a fatal error, but rather a warning.
// 0 - success
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
// < 0 - error
- (NSInteger)decode:(LlamaBatch *)batch;
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
// If the KV cache is RoPEd, the KV data is updated accordingly:
// - lazily on next llama_decode()
// - explicitly with llama_kv_cache_update()
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
- (void)kvCacheSeqAdd:(LlamaSequenceId)sequenceId
p0:(LlamaPosition)p0
p1:(LlamaPosition)p1
delta:(LlamaPosition)delta;
// Integer division of the positions by factor of `d > 1`
// If the KV cache is RoPEd, the KV data is updated accordingly:
// - lazily on next llama_decode()
// - explicitly with llama_kv_cache_update()
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
- (void)kvCacheSeqDiv:(LlamaSequenceId)sequenceId
p0:(LlamaPosition)p0
p1:(LlamaPosition)p1
delta:(LlamaPosition)delta;
// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
- (NSString *)tokenToPiece:(LlamaToken)token;
- (NSString *)tokenToPiece:(LlamaToken)token special:(BOOL)special;
- (BOOL)saveStateFile:(NSString *)pathSession
tokens:(const LlamaToken *)tokens
nTokenCount:(size_t)nTokenCount;
@end
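// Example of checking the decode result from Swift (a sketch; batch construction is elided and
// the names are bridged from this header):
//
//     let status = ctx.decode(batch)
//     if status < 0 {
//         fatalError("decode failed")
//     } else if status > 0 {
//         // warning: no KV slot found; shrink the batch or grow the context
//     }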
#endif /* LlamaContext_h */

View file

@@ -0,0 +1,28 @@
#ifndef LlamaContext_Private_hpp
#define LlamaContext_Private_hpp
#import "LlamaContext.h"
#import "../../common/common.h"
@interface LlamaContext()
- (instancetype)initWithContext:(llama_context *)context;
- (std::vector<llama_token>)tokenize:(NSString *)text
addSpecial:(BOOL)addSpecial
parseSpecial:(BOOL)parseSpecial;
- (BOOL)loadStateFile:(NSString *)pathSession
tokensOut:(llama_token *)tokensOut
nTokenCapacity:(size_t)nTokenCapacity
nTokenCountOut:(size_t *)nTokenCountOut;
- (std::string)convertTokensToString:(const std::vector<llama_token>&)tokens;
- (llama_context *)cContext;
- (int32_t)encode:(llama_batch)batch;
@end
#endif /* LlamaContext_Private_hpp */

35
objc/include/LlamaModel.h Normal file
View file

@@ -0,0 +1,35 @@
#ifndef LlamaModel_h
#define LlamaModel_h
@class GPTParams;
@class GGMLThreadpool;
@class LlamaContext;
typedef int32_t LlamaToken;
@interface LlamaChatMessage : NSObject
@property (nonatomic, copy) NSString *role;
@property (nonatomic, copy) NSString *content;
@end
@interface LlamaContextParams : NSObject
@end
@interface LlamaModel : NSObject
- (LlamaContext *)context:(LlamaContextParams *)params;
- (LlamaToken)tokenBOS;
- (LlamaToken)tokenEOT;
- (LlamaToken)tokenEOS;
- (BOOL)tokenIsEOG:(LlamaToken)token;
- (int32_t)nCtxTrain;
- (BOOL)addBOSToken;
- (BOOL)addEOSToken;
- (BOOL)hasEncoder;
- (NSString *)formatExample:(NSString *)tmpl;
@end
#endif /* LlamaModel_h */

View file

@@ -0,0 +1,15 @@
#ifndef LlamaModel_Private_hpp
#define LlamaModel_Private_hpp
#import "LlamaModel.h"
#import "llama.h"
@interface LlamaModel()
- (instancetype)init:(llama_model *)model;
- (llama_model *)cModel;
@end
#endif /* LlamaModel_Private_hpp */

13
objc/include/LlamaObjC.h Normal file
View file

@@ -0,0 +1,13 @@
#ifndef LlamaObjC_h
#define LlamaObjC_h
#include <Foundation/Foundation.h>
#include <llama.h>
#include <LlamaModel.h>
#include <LlamaContext.h>
#include <LlamaSession.h>
#include <GPTParams.h>
#endif /* LlamaObjC_h */

View file

@@ -0,0 +1,27 @@
#ifndef LlamaSession_h
#define LlamaSession_h
@class GPTParams;
@class LlamaModel;
@class LlamaContext;
@interface BlockingLineQueue : NSObject
- (void)addInputLine:(NSString *)line;
- (NSString *)inputLine;
- (void)addOutputLine:(NSString *)line;
- (NSString *)outputLine;
@end
@interface LlamaSession : NSObject
@property (nonatomic, strong) LlamaModel *model;
@property (nonatomic, strong) LlamaContext *ctx;
- (instancetype)initWithParams:(GPTParams *)params;
- (void)start:(BlockingLineQueue *)queue;
@end
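// The session runs its generation loop off the calling thread and exchanges text through the
// BlockingLineQueue above: the caller pushes prompts with addInputLine and reads replies with
// outputLine (this is how LlamaChatSession in swift/LlamaKit drives it). A minimal sketch from
// Swift, assuming `params` is already configured:
//
//     let queue = BlockingLineQueue()
//     let session = LlamaSession(params: params)
//     Task.detached { session.start(queue) }
//     queue.addInputLine("Hello!")
//     let reply = queue.outputLine()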
#endif /* LlamaSession_h */

View file

@@ -0,0 +1,10 @@
#ifndef LlamaSession_Private_hpp
#define LlamaSession_Private_hpp
#import "LlamaSession.h"
@interface LlamaSession()
@end
#endif /* LlamaSession_Private_hpp */

1
objc/include/ggml-metal.h Symbolic link
View file

@@ -0,0 +1 @@
../../ggml/include/ggml-metal.h

View file

@@ -0,0 +1,102 @@
import Foundation
import RegexBuilder
let SPACE_RULE = "\" \"?"
let PRIMITIVE_RULES: [String: String] = [
"boolean": "(\"true\" | \"false\") space",
"number": "\"-\"? ([0-9] | [1-9] [0-9]*) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space",
"integer": "\"-\"? ([0-9] | [1-9] [0-9]*) space",
"string": "\"\\\"\" ([^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]))* \"\\\"\" space",
"null": "\"null\" space",
]
let INVALID_RULE_CHARS_RE = try! NSRegularExpression(pattern: "[^a-zA-Z0-9-]+")
let GRAMMAR_LITERAL_ESCAPE_RE = try! NSRegularExpression(pattern: "[\r\n\"]")
let GRAMMAR_LITERAL_ESCAPES: [String: String] = ["\r": "\\r", "\n": "\\n", "\"": "\\\""]
public class SchemaConverter {
private var propOrder: [String]
private var rules: [String: String] = ["space": SPACE_RULE]
public init(propOrder: [String]) {
self.propOrder = propOrder
}
private func formatLiteral(_ literal: Any) -> String {
// let escaped = GRAMMAR_LITERAL_ESCAPES.reduce("\(literal)", {
// let regex = Regex("[\r\n\"]")
let escaped = GRAMMAR_LITERAL_ESCAPES.reduce("\(literal)") {
$0.replacingOccurrences(of: $1.key, with: $1.value)
}
return "\\\"\(escaped)\\\""
}
private func addRule(name: String, rule: String) -> String {
let escName = INVALID_RULE_CHARS_RE.stringByReplacingMatches(
in: name,
options: [],
range: NSRange(location: 0, length: name.count),
withTemplate: "-"
)
var key = escName
if let existingRule = rules[escName], existingRule != rule {
var i = 0
while rules["\(escName)\(i)"] != nil {
i += 1
}
key = "\(escName)\(i)"
}
rules[key] = rule
return key
}
public func visit(schema: [String: Any], name: String?) -> String {
let schemaType = schema["type"] as? String
let ruleName = name ?? "root"
if let oneOf = schema["oneOf"] as? [[String: Any]] ?? schema["anyOf"] as? [[String: Any]] {
let rule = oneOf.enumerated().map { (i, altSchema) in
visit(schema: altSchema, name: "\(name ?? "")\(name != nil ? "-" : "")\(i)")
}.joined(separator: " | ")
return addRule(name: ruleName, rule: rule)
} else if let constValue = schema["const"] {
return addRule(name: ruleName, rule: formatLiteral(constValue))
} else if let enumValues = schema["enum"] as? [Any] {
let rule = enumValues.map { "\"\(formatLiteral($0))\"" }.joined(separator: " | ")
return addRule(name: ruleName, rule: rule)
} else if schemaType == "object", let properties = schema["properties"] as? [String: Any] {
let propPairs = properties.sorted { (kv1, kv2) in
let idx1 = propOrder.firstIndex(of: kv1.key) ?? propOrder.count
let idx2 = propOrder.firstIndex(of: kv2.key) ?? propOrder.count
return (idx1, kv1.key) < (idx2, kv2.key)
}
var rule = "\"{\" space"
for (i, (propName, propSchema)) in propPairs.enumerated() {
let propRuleName = visit(schema: propSchema as! [String : Any], name: "\(name ?? "")\(name != nil ? "-" : "")\(propName)")
if i > 0 {
rule += " \",\" space"
}
rule += " \"\(formatLiteral(propName))\" space \":\" space \(propRuleName)"
}
rule += " \"}\" space"
return addRule(name: ruleName, rule: rule)
} else if schemaType == "array", let items = schema["items"] {
let itemRuleName = visit(schema: items as! [String : Any], name: "\(name ?? "")\(name != nil ? "-" : "")item")
let rule = "\"[\" space (\(itemRuleName) (\",\" space \(itemRuleName))*)? \"]\" space"
return addRule(name: ruleName, rule: rule)
} else {
assert(PRIMITIVE_RULES.keys.contains(schemaType ?? ""), "Unrecognized schema: \(schema)")
return addRule(name: ruleName == "root" ? "root" : schemaType!, rule: PRIMITIVE_RULES[schemaType!]!)
}
}
public func formatGrammar() -> String {
return rules.map { (name, rule) in "\(name) ::= \(rule)" }.joined(separator: "\n") + "\n"
}
}
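// Illustrative use (a sketch; this mirrors LlamaSession<T> in swift/LlamaKit, which feeds the
// resulting GBNF grammar into GPTSamplerParams.grammar):
//
//     let converter = SchemaConverter(propOrder: [])
//     _ = converter.visit(schema: ["type": "object",
//                                  "properties": ["location": ["type": "string"]]],
//                         name: nil)
//     print(converter.formatGrammar())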

View file

@@ -0,0 +1,187 @@
import Foundation
//import SwiftSyntaxMacros
public struct JSONSchema : Codable {
public struct Items : Codable {
let type: String
let `enum`: [String]?
public init(type: String, `enum`: [String]?) {
self.type = type
self.enum = `enum`
}
}
public struct Property : Codable {
let type: String
let items: Items?
let description: String?
public init(type: String, items: Items?, description: String?) {
self.type = type
self.items = items
self.description = description
}
}
let type: String
let items: Items?
let properties: [String : Property]?
public init(type: String, items: Items?, properties: [String : Property]?) {
self.type = type
self.items = items
self.properties = properties
}
}
public struct _JSONFunctionSchema: Codable {
public struct Items: Codable {
let type: String
let `enum`: [String]?
public init(type: Any.Type, `enum`: [String]?) {
self.type = String(describing: type)
self.enum = `enum`
}
}
public struct Property: Codable {
let type: String
let items: Items?
let `enum`: [String]?
let description: String?
public init(type: String.Type, description: String?) {
self.type = "string"
self.description = description
self.items = nil
self.enum = nil
}
public init<T: CaseIterable>(type: T.Type, description: String?) where T: RawRepresentable,
T: StringProtocol {
self.type = "string"
self.enum = Array(type.allCases.map { $0.rawValue as! String })
self.description = description
self.items = nil
}
}
public struct Parameters: Codable {
public let properties: [String: Property]
public let required: [String]
public let type = "object"
public init(properties: [String : Property], required: [String]) {
self.properties = properties
self.required = required
}
}
let name: String
let description: String
let parameters: Parameters
public init(name: String, description: String, parameters: Parameters) {
self.name = name
self.description = description
self.parameters = parameters
}
}
public protocol JSONSchemaConvertible : Codable {
static var type: String { get }
static var jsonSchema: [String : Any] { get }
static func decode<K: CodingKey>(from container: KeyedDecodingContainer<K>,
forKey key: K) throws -> Self
}
extension RawRepresentable where Self : CaseIterable, RawValue : JSONSchemaConvertible, Self: Codable {
public static var type: String {
RawValue.type
}
public static var jsonSchema: [String: Any] {
[
"type": RawValue.type,
"enum": Self.allCases.map(\.rawValue)
]
}
}
extension JSONSchemaConvertible {
public static var items: JSONSchema.Items? {
nil
}
public static var properties: [JSONSchema.Property]? {
nil
}
public static var `enum`: [String]? {
nil
}
public static func decode<K: CodingKey>(from container: KeyedDecodingContainer<K>, forKey key: K) throws -> Self {
return try container.decode(Self.self, forKey: key)
}
}
extension String : JSONSchemaConvertible {
public static var type: String { "string" }
public static var jsonSchema: [String: Any] {
[
"type": "string"
]
}
}
extension Int : JSONSchemaConvertible {
public static var type: String { "number" }
public static var jsonSchema: [String: Any] {
[
"type": "integer"
]
}
}
extension Double : JSONSchemaConvertible {
public static var type: String { "number" }
public static var jsonSchema: [String: Any] {
[
"type": "number"
]
}
}
extension Date : JSONSchemaConvertible {
public static var type: String { "string" }
public static var jsonSchema: [String: Any] {
[
"type": "string"
]
}
public static func decode<K: CodingKey>(from container: KeyedDecodingContainer<K>, forKey key: K) throws -> Self {
let value = try container.decode(String.self, forKey: key)
let detector = try? NSDataDetector(types: NSTextCheckingResult.CheckingType.date.rawValue)
let matches = detector?.matches(in: value, options: [], range: NSMakeRange(0, value.utf16.count))
return matches!.first!.date!
// return ISO8601DateFormatter().date(from: value)!
}
}
extension Array : JSONSchemaConvertible where Element : JSONSchemaConvertible {
public static var type: String { "array" }
public static var items: JSONSchema.Items? {
JSONSchema.Items(type: Element.type, enum: Element.enum)
}
public static var jsonSchema: [String : Any] {
[
"type": "array",
"items": Element.jsonSchema
]
}
}
@attached(member, names: arbitrary)
@attached(extension, conformances: JSONSchemaConvertible, CaseIterable, names: arbitrary)
public macro JSONSchema() = #externalMacro(module: "JSONSchemaMacros",
type: "JSONSchemaMacro")
//@attached(member, names: arbitrary)
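// Example adoption (taken from the test target; @JSONSchema is expanded by JSONSchemaMacros to
// synthesize CodingKeys, init(from:), and the JSONSchemaConvertible conformance):
//
//     @JSONSchema struct Trip {
//         let location: String
//         let startDate: TimeInterval
//         let durationInDays: Int
//     }
//     // Trip.jsonSchema evaluates to ["type": "object", "properties": [...]]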

View file

@@ -0,0 +1,229 @@
import SwiftSyntaxMacros
import SwiftCompilerPlugin
import SwiftSyntax
private struct MemberView {
let name: String
let type: String
var attributeKey: String?
var assignment: String?
}
private func view(for member: MemberBlockItemListSyntax.Element) throws -> MemberView? {
guard let decl = member.decl.as(VariableDeclSyntax.self),
let binding = decl.bindings.compactMap({
$0.pattern.as(IdentifierPatternSyntax.self)
}).first,
let type = decl.bindings.compactMap({
$0.typeAnnotation?.type
}).first,
!(type.syntaxNodeType is StructDeclSyntax.Type) else {
return nil
}
var memberView = MemberView(name: "\(binding.identifier)", type: "\(type)", attributeKey: nil)
if let macroName = decl.attributes.first?.as(AttributeSyntax.self)?
.arguments?.as(LabeledExprListSyntax.self)?.first?.expression.as(StringLiteralExprSyntax.self) {
memberView.attributeKey = "\(macroName.segments)"
}
if let assignment = decl.bindings.compactMap({
$0.initializer?.value
}).first {
memberView.assignment = "\(assignment)"
}
return memberView
}
struct JSONSchemaMacro: ExtensionMacro, MemberMacro {
static func expansion(of node: AttributeSyntax, providingMembersOf declaration: some DeclGroupSyntax, conformingTo protocols: [TypeSyntax], in context: some MacroExpansionContext) throws -> [DeclSyntax] {
let members = try declaration.memberBlock.members.compactMap(view(for:))
if declaration is EnumDeclSyntax {
return []
}
return [
"""
enum CodingKeys: CodingKey {
case \(raw: members.map(\.name).joined(separator: ", "))
}
""",
"""
init(from decoder: Decoder) throws {
let container = try decoder.container(keyedBy: CodingKeys.self)
\(raw: members.map {
"""
self.\($0.name) = try \($0.type).decode(from: container, forKey: .\($0.name))
"""
}.joined(separator: "\n"))
}
"""
]
}
static func expansion(of node: SwiftSyntax.AttributeSyntax,
attachedTo declaration: some SwiftSyntax.DeclGroupSyntax,
providingExtensionsOf type: some SwiftSyntax.TypeSyntaxProtocol,
conformingTo protocols: [SwiftSyntax.TypeSyntax],
in context: some SwiftSyntaxMacros.MacroExpansionContext) throws -> [SwiftSyntax.ExtensionDeclSyntax] {
let members = try declaration.memberBlock.members.compactMap(view(for:))
var inheritedTypes: [InheritedTypeSyntax] = []
inheritedTypes.append(InheritedTypeSyntax(type: TypeSyntax("JSONSchemaConvertible")))
if declaration is EnumDeclSyntax {
inheritedTypes.append(InheritedTypeSyntax(type: TypeSyntax(", CaseIterable")))
}
let properties = members.map {
"""
"\($0.name)": \($0.type).jsonSchema
"""
}
if !(declaration is EnumDeclSyntax) {
return [
ExtensionDeclSyntax(extendedType: type,
inheritanceClause: .init(inheritedTypes: .init(inheritedTypes)),
memberBlock: """
{
static var type: String {
"object"
}
static var jsonSchema: [String: Any] {
[
"type": "object",
"properties": [
\(raw: properties.joined(separator: ","))
]
]
}
}
""")
]
} else {
return [
ExtensionDeclSyntax(extendedType: type,
inheritanceClause: .init(inheritedTypes: .init(inheritedTypes)),
memberBlock: """
{
public static func decode<K: CodingKey>(from container: KeyedDecodingContainer<K>, forKey key: K) throws -> Self {
if RawValue.self is Int.Type {
return Self(rawValue: Int(try container.decode(String.self, forKey: key)) as! Self.RawValue)!
} else {
return try container.decode(Self.self, forKey: key)
}
}
}
""")
]
}
}
}
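// For reference, on a struct such as `struct Trip { let location: String }` the member expansion
// above produces (roughly):
//
//     enum CodingKeys: CodingKey { case location }
//     init(from decoder: Decoder) throws {
//         let container = try decoder.container(keyedBy: CodingKeys.self)
//         self.location = try String.decode(from: container, forKey: .location)
//     }
//
// and the extension expansion adds the JSONSchemaConvertible conformance with a `jsonSchema`
// dictionary built from the stored properties.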
enum TestError: Error {
case message(String)
}
struct LlamaActorMacro: ExtensionMacro, MemberMacro {
static func expansion(of node: AttributeSyntax, providingMembersOf declaration: some DeclGroupSyntax, conformingTo protocols: [TypeSyntax], in context: some MacroExpansionContext) throws -> [DeclSyntax] {
[
"""
let session: LlamaToolSession
public init(params: GPTParams) async throws {
self.session = try await LlamaToolSession(params: params, tools: Self.tools)
}
"""
]
}
static func expansion(of node: AttributeSyntax,
attachedTo declaration: some DeclGroupSyntax,
providingExtensionsOf type: some TypeSyntaxProtocol,
conformingTo protocols: [TypeSyntax],
in context: some MacroExpansionContext) throws -> [ExtensionDeclSyntax] {
var tools: [
(name: String,
description: String,
parameters: [(name: String,
type: String,
description: String)],
callableString: String,
callableName: String)
] = []
for member in declaration.memberBlock.members {
let comments = member.leadingTrivia.filter { $0.isComment }
guard let member = member.decl.as(FunctionDeclSyntax.self) else {
continue
}
let name = member.name
guard case var .docLineComment(description) = comments.first else {
throw TestError.message("Missing comment")
}
description = String(description.dropFirst(3))
var parameters: [(name: String, type: String, description: String)] = []
var index = 0
for parameter in member.signature.parameterClause.parameters {
let firstName = parameter.firstName.text
let typeName = parameter.type.as(IdentifierTypeSyntax.self)!.name.text
guard case var .docLineComment(description) = comments[index + 1] else {
throw TestError.message("Missing comment for \(firstName)")
}
description = String(description.dropFirst(3))
parameters.append((name: firstName, type: typeName, description: description))
index += 1
}
let callableName = context.makeUniqueName(name.text)
let callableString = """
@dynamicCallable struct \(callableName.text): DynamicCallable {
@discardableResult
func dynamicallyCall(withKeywordArguments args: [String: Any]) async throws -> String {
\(parameters.map {
"var \($0.name): \($0.type)!"
}.joined(separator: "\n"))
for (key, value) in args {
\(parameters.map {
"if key == \"\($0.name)\" { \($0.name) = value as! \($0.type) }"
}.joined(separator: "\n"))
}
let returnValue = try await \(name.text)(\(parameters.map { "\($0.name): \($0.name)" }.joined(separator: ",")))
let jsonValue = try JSONEncoder().encode(returnValue)
return String(data: jsonValue, encoding: .utf8)!
}
}
"""
tools.append((name: name.text, description: description,
parameters: parameters,
callableString: callableString,
callableName: callableName.text))
}
return [
.init(extendedType: type,
inheritanceClause: .init(inheritedTypes: InheritedTypeListSyntax.init(arrayLiteral: .init(type: IdentifierTypeSyntax(name: "LlamaActor")))),
memberBlock: """
{
\(raw: tools.map {
$0.callableString
}.joined(separator: "\n"))
static var tools: [String: (DynamicCallable, _JSONFunctionSchema)] {
[\(raw: tools.map { tool in
"""
"\(tool.name)": (\(tool.callableName)(), _JSONFunctionSchema(name: "\(tool.name)", description: "\(tool.description)", parameters: _JSONFunctionSchema.Parameters(properties: \(tool.parameters.count == 0 ? "[:]" : "[" + tool.parameters.map { parameter in
"""
"\(parameter.name)": _JSONFunctionSchema.Property(type: \(parameter.type).self, description: "\(parameter.description)"),
"""
}.joined() + "]"), required: [])))
"""
}.joined(separator: ","))]
}
}
""")
]
}
}
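// For an annotated actor method such as
//
//     /// Get the current weather in a given location.
//     /// - parameter location: The city and state, e.g. San Francisco, CA
//     public static func getCurrentWeather(location: String) async throws -> CurrentWeather
//
// the expansion above emits a @dynamicCallable wrapper struct plus an entry in
// `static var tools: [String: (DynamicCallable, _JSONFunctionSchema)]` keyed by the function
// name, which LlamaToolSession uses to dispatch tool calls requested by the model.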
@main
struct JSONSchemaMacrosPlugin: CompilerPlugin {
let providingMacros: [Macro.Type] = [
JSONSchemaMacro.self, LlamaActorMacro.self
]
}

View file

@@ -0,0 +1,189 @@
import Foundation
@_exported import JSONSchema
@_exported import LlamaObjC
public protocol DynamicCallable: Sendable {
@discardableResult
func dynamicallyCall(withKeywordArguments args: [String: Any]) async throws -> String
}
struct ToolCall: Decodable {
let id: Int
let name: String
let arguments: [String: String]
}
struct ToolResponse<T: Encodable>: Encodable {
let id: Int
let result: T
}
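// Example payloads exchanged with the model (shapes inferred from ToolCall/ToolResponse above
// and the prompts used further down; values are illustrative):
//   model -> app : {"id": 0, "name": "getIpAddress", "arguments": {}}
//   app -> model : <tool_response> {"id": 0, result: "192.168.1.10"} </tool_response>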
// MARK: LlamaChatSession
/// Standard chat session for a given LLM.
public actor LlamaChatSession {
private let queue = BlockingLineQueue()
private let session: LlamaObjC.LlamaSession
public init(params: GPTParams, flush: Bool = true) async throws {
session = LlamaObjC.LlamaSession(params: params)
Task.detached { [session, queue] in
session.start(queue)
}
// flush
guard flush else { return }
_ = queue.outputLine()
}
public func chat(message: String) async -> String {
queue.addInputLine(message)
return queue.outputLine()
}
}
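// Illustrative use (a sketch; `params` is a configured GPTParams, see swift/main for a full
// configuration):
//
//     let chat = try await LlamaChatSession(params: params)
//     let reply = await chat.chat(message: "Hello!")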
// MARK: LlamaGrammarSession
public actor LlamaSession<T: JSONSchemaConvertible> {
private let session: LlamaChatSession
public init(params: GPTParams) async throws {
let converter = SchemaConverter(propOrder: [])
_ = converter.visit(schema: T.jsonSchema, name: nil)
params.samplerParams.grammar = converter.formatGrammar()
session = try await LlamaChatSession(params: params)
}
public func chat(message: String) async throws -> T {
let output = await session.chat(message: message).data(using: .utf8)!
return try JSONDecoder().decode(T.self, from: output)
}
}
// MARK: LlamaToolSession
public actor LlamaToolSession {
private let session: LlamaChatSession
private struct GetIpAddress: DynamicCallable {
func dynamicallyCall(withKeywordArguments args: [String : Any]) async throws -> String {
getIPAddress()
}
}
internal static func getIPAddress() -> String {
var address: String!
// Get list of all interfaces on the local machine:
var ifaddr: UnsafeMutablePointer<ifaddrs>? = nil
if getifaddrs(&ifaddr) == 0 {
// Loop through linked list of interfaces
var ptr = ifaddr
while ptr != nil {
let interface = ptr!.pointee
// Check if the interface is IPv4 or IPv6:
let addrFamily = interface.ifa_addr.pointee.sa_family
if addrFamily == UInt8(AF_INET) || addrFamily == UInt8(AF_INET6) {
// Convert interface name to String:
let name = String(cString: interface.ifa_name)
// Only consider non-loopback interfaces (e.g., "en0" for Wi-Fi)
if name == "en0" { // Typically en0 is the Wi-Fi interface
// Convert the address to a readable format:
var hostname = [CChar](repeating: 0, count: Int(NI_MAXHOST))
if getnameinfo(interface.ifa_addr, socklen_t(interface.ifa_addr.pointee.sa_len),
&hostname, socklen_t(hostname.count),
nil, socklen_t(0), NI_NUMERICHOST) == 0 {
address = String(cString: hostname)
}
}
}
ptr = interface.ifa_next
}
freeifaddrs(ifaddr)
}
return address
}
public private(set) var tools: [String: (DynamicCallable, _JSONFunctionSchema)]
public init(params: GPTParams,
tools: [String: (DynamicCallable, _JSONFunctionSchema)]) async throws {
self.tools = tools
let ipFnSchema = _JSONFunctionSchema(name: "getIpAddress", description: "Get the IP Address for this system", parameters: _JSONFunctionSchema.Parameters(properties: [:], required: []))
self.tools["getIpAddress"] = (GetIpAddress(), ipFnSchema)
let encoded = try JSONEncoder().encode(self.tools.values.map(\.1))
let prompt = """
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{"name": <function-name>,"arguments": <args-dict>}
</tool_call>
Here are the available tools:
<tools> \(String(data: encoded, encoding: .utf8)!) </tools><|eot_id|>
"""
params.prompt = prompt
params.interactive = true
params.antiPrompts.append("<|eot_id|>")
params.inputPrefix = "<|start_header_id|>user<|end_header_id|>"
params.inputSuffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
session = try await LlamaChatSession(params: params, flush: false)
let fn = await session.chat(message: "What is my IP address?")
let toolCall = try JSONDecoder().decode(ToolCall.self, from: fn.data(using: .utf8)!)
guard let tool = self.tools[toolCall.name] else {
fatalError()
}
let resp = try await tool.0.dynamicallyCall(withKeywordArguments: toolCall.arguments)
print(resp)
let output = await session.chat(message: """
<tool_response>
{"id": \(toolCall.id), result: \(resp)}
</tool_response>
""")
print(output)
}
public func chat(message: String) async throws -> String {
var nxt = await session.chat(message: message)
let fn = nxt
// try to see if the output is a function call
do {
let toolCall = try JSONDecoder().decode(ToolCall.self, from: fn.data(using: .utf8)!)
guard let tool = tools[toolCall.name] else {
fatalError()
}
let callable = tool.0
let resp = try await callable.dynamicallyCall(withKeywordArguments: toolCall.arguments)
print("tool response: \(resp)")
nxt = await session.chat(message: """
<tool_response>
{"id": \(toolCall.id), result: \(resp)}
</tool_response>
""")
print(nxt)
} catch {
print(error)
}
return nxt
}
}
public protocol LlamaActor: Actor {
static var tools: [String: (DynamicCallable, _JSONFunctionSchema)] { get }
var session: LlamaToolSession { get }
}
public extension LlamaActor {
func chat(_ message: String) async throws -> String {
try await session.chat(message: message)
}
}
@attached(member, names: arbitrary)
@attached(extension, conformances: LlamaActor, names: arbitrary)
public macro llamaActor() = #externalMacro(module: "JSONSchemaMacros",
type: "LlamaActorMacro")

76
swift/main/main.swift Normal file
View file

@@ -0,0 +1,76 @@
import LlamaKit
import WeatherKit
import CoreLocation
@llamaActor actor MyLlama {
struct CurrentWeather: Codable {
let temperature: Double
let condition: WeatherCondition
}
/// Get the current weather in a given location.
/// - parameter location: The city and state, e.g. San Francisco, CA
/// - parameter unit: The unit of temperature
public static func getCurrentWeather(location: String, unit: String) async throws -> CurrentWeather {
let weather = try await WeatherService().weather(for: CLGeocoder().geocodeAddressString(location)[0].location!)
var temperature = weather.currentWeather.temperature
temperature.convert(to: .fahrenheit)
return CurrentWeather(temperature: temperature.value,
condition: weather.currentWeather.condition)
}
}
func downloadFile() async throws -> String {
let fm = FileManager.default
let tmpDir = fm.temporaryDirectory
let destinationURL = tmpDir.appending(path: "llama_groq_gguf.gguf")
guard !fm.fileExists(atPath: destinationURL.path()) else {
return destinationURL.path()
}
print("Downloading Llama Tools, this may take a while...")
// Define the URL
guard let url = URL(string: "https://huggingface.co/bartowski/Llama-3-Groq-8B-Tool-Use-GGUF/resolve/main/Llama-3-Groq-8B-Tool-Use-Q5_K_M.gguf?download=true") else {
print("Invalid URL.")
throw URLError(.badURL)
}
// Start the async download
let (tempURL, _) = try await URLSession.shared.download(from: url)
// Define the destination path in the documents directory
// Move the downloaded file to the destination
try fm.moveItem(at: tempURL, to: destinationURL)
print("File downloaded to: \(destinationURL.path())")
return destinationURL.path()
}
let params = GPTParams()
params.modelPath = try await downloadFile()
params.nPredict = 512
params.nCtx = 4096
params.cpuParams.nThreads = 8
params.cpuParamsBatch.nThreads = 8
params.nBatch = 1024
params.nGpuLayers = 1024
let llama = try await MyLlama(params: params)
while true {
print("Enter input: ", terminator: "")
// Read user input
if let userInput = readLine() {
if userInput.lowercased() == "exit" {
print("Exiting the loop.")
break
} else {
print("🧔🏽‍♂️: \(userInput)")
let response = try await llama.chat(userInput)
print("🤖: \(response)")
}
} else {
print("Failed to read input.")
}
}

View file

@@ -0,0 +1,140 @@
import Foundation
import Testing
@testable import LlamaKit
import JSONSchema
// MARK: LlamaGrammarSession Suite
@Suite("LlamaGrammarSession Suite")
struct LlamaGrammarSessionSuite {
@JSONSchema struct Trip {
let location: String
let startDate: TimeInterval
let durationInDays: Int
}
func downloadFile() async throws -> String {
let fm = FileManager.default
let tmpDir = fm.temporaryDirectory
let destinationURL = tmpDir.appending(path: "tinyllama.gguf")
guard !fm.fileExists(atPath: destinationURL.path()) else {
return destinationURL.path()
}
print("Downloading TinyLlama, this may take a while...")
// Define the URL
guard let url = URL(string: "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q3_K_L.gguf?download=true") else {
print("Invalid URL.")
throw URLError(.badURL)
}
// Start the async download
let (tempURL, _) = try await URLSession.shared.download(from: url)
// Define the destination path in the documents directory
// Move the downloaded file to the destination
try fm.moveItem(at: tempURL, to: destinationURL)
print("File downloaded to: \(destinationURL.path())")
return destinationURL.path()
}
@Test func llamaGrammarSession() async throws {
let params = GPTParams()
params.modelPath = try await downloadFile()
params.nPredict = 256
params.nCtx = 1024
params.cpuParams.nThreads = 4
params.cpuParamsBatch.nThreads = 4
params.nBatch = 1024
params.nGpuLayers = 128
params.chatTemplate = """
<|system|>
{system_message}</s>
<|user|>
{prompt}</s>
<|assistant|>
"""
params.prompt = """
You are a travel agent. The current date epoch \(Date.now.timeIntervalSince1970).
Responses should have the following fields:
location: the location of the trip
startDate: the start of the trip as the unix epoch since 1970
durationInDays: the duration of the trip in days
"""
params.interactive = true
let session = try await LlamaSession<Trip>(params: params)
await #expect(throws: Never.self) {
let trip = try await session.chat(message: "Please create a trip for me to New York City that starts two weeks from now. The duration of the trip MUST be 3 days long.")
#expect(trip.location.contains("New York"))
// TODO: Testing the other fields is difficult considering model size
// TODO: so for now, we are just asserting the grammar works
}
}
}
import WeatherKit
import CoreLocation
@llamaActor actor MyLlama {
struct CurrentWeather: Codable {
let temperature: Double
let condition: WeatherCondition
}
/// Get the current weather in a given location.
/// - parameter location: The city and state, e.g. San Francisco, CA
/// - parameter unit: The unit of temperature
public static func getCurrentWeather(location: String, unit: String) async throws -> CurrentWeather {
let weather = try await WeatherService().weather(for: CLGeocoder().geocodeAddressString(location)[0].location!)
var temperature = weather.currentWeather.temperature
temperature.convert(to: .fahrenheit)
return CurrentWeather(temperature: temperature.value,
condition: weather.currentWeather.condition)
}
}
func downloadFile() async throws -> String {
let fm = FileManager.default
let tmpDir = fm.temporaryDirectory
let destinationURL = tmpDir.appending(path: "llama_groq_gguf.gguf")
guard !fm.fileExists(atPath: destinationURL.path()) else {
return destinationURL.path()
}
print("Downloading Llama Tools, this may take a while...")
// Define the URL
guard let url = URL(string: "https://huggingface.co/bartowski/Llama-3-Groq-8B-Tool-Use-GGUF/resolve/main/Llama-3-Groq-8B-Tool-Use-Q5_K_M.gguf?download=true") else {
print("Invalid URL.")
throw URLError(.badURL)
}
// Start the async download
let (tempURL, _) = try await URLSession.shared.download(from: url)
// Define the destination path in the documents directory
// Move the downloaded file to the destination
try fm.moveItem(at: tempURL, to: destinationURL)
print("File downloaded to: \(destinationURL.path())")
return destinationURL.path()
}
@Test func llamaToolSession() async throws {
let params = GPTParams()
params.modelPath = try await downloadFile()
params.nPredict = 512
params.nCtx = 4096
params.cpuParams.nThreads = 8
params.cpuParamsBatch.nThreads = 8
params.nBatch = 1024
params.nGpuLayers = 1024
let llama = try await MyLlama(params: params)
let currentWeather = try await MyLlama.getCurrentWeather(location: "San Francisco, CA", unit: "fahrenheit")
let output = try await llama.chat("What's the weather (in fahrenheit) in San Francisco, CA?")
#expect(output.contains(String(format: "%.2f", currentWeather.temperature)))
// #expect(output.contains(currentWeather.condition.rawValue))
}