diff --git a/Package.resolved b/Package.resolved new file mode 100644 index 000000000..a010b3778 --- /dev/null +++ b/Package.resolved @@ -0,0 +1,14 @@ +{ + "pins" : [ + { + "identity" : "swift-syntax", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-syntax.git", + "state" : { + "branch" : "main", + "revision" : "2c271e5ce55124ae534c2eff6e74f745e4db4f68" + } + } + ], + "version" : 2 +} diff --git a/Package.swift b/Package.swift index 1d90b47bf..95f3ac5b0 100644 --- a/Package.swift +++ b/Package.swift @@ -1,21 +1,28 @@ -// swift-tools-version:5.5 - +// swift-tools-version:5.9 +import CompilerPluginSupport import PackageDescription -var sources = [ +var cppSources = [ "src/llama.cpp", "src/llama-vocab.cpp", "src/llama-grammar.cpp", "src/llama-sampling.cpp", "src/unicode.cpp", "src/unicode-data.cpp", - "ggml/src/ggml.c", - "ggml/src/ggml-alloc.c", - "ggml/src/ggml-backend.c", - "ggml/src/ggml-quants.c", - "ggml/src/ggml-aarch64.c", + "common/sampling.cpp", + "common/common.cpp", + "common/json-schema-to-grammar.cpp", + "common/log.cpp", + "common/console.cpp" ] +var ggmlSources = [ + "src/ggml.c", + "src/ggml-alloc.c", + "src/ggml-backend.c", + "src/ggml-quants.c", + "src/ggml-aarch64.c" +] var resources: [Resource] = [] var linkerSettings: [LinkerSetting] = [] var cSettings: [CSetting] = [ @@ -24,13 +31,13 @@ var cSettings: [CSetting] = [ // NOTE: NEW_LAPACK will required iOS version 16.4+ // We should consider add this in the future when we drop support for iOS 14 // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) - // .define("ACCELERATE_NEW_LAPACK"), - // .define("ACCELERATE_LAPACK_ILP64") + .define("ACCELERATE_NEW_LAPACK"), + .define("ACCELERATE_LAPACK_ILP64") ] #if canImport(Darwin) -sources.append("ggml/src/ggml-metal.m") -resources.append(.process("ggml/src/ggml-metal.metal")) +ggmlSources.append("src/ggml-metal.m") +resources.append(.process("src/ggml-metal.metal")) linkerSettings.append(.linkedFramework("Accelerate")) cSettings.append( contentsOf: [ @@ -47,33 +54,84 @@ cSettings.append( let package = Package( name: "llama", platforms: [ - .macOS(.v12), + .macOS(.v13), .iOS(.v14), .watchOS(.v4), .tvOS(.v14) ], products: [ .library(name: "llama", targets: ["llama"]), + .executable(name: "LlamaKitMain", targets: ["LlamaKitMain"]) + ], + dependencies: [ + .package(url: "https://github.com/apple/swift-syntax.git", branch: "main") ], targets: [ + .target(name: "llama_cpp", + path: ".", + exclude: [ + "cmake", + "examples", + "scripts", + "models", + "tests", + "CMakeLists.txt", + "Makefile" + ], + sources: cppSources, + publicHeadersPath: "spm-headers"), .target( name: "llama", - path: ".", - exclude: [ - "cmake", - "examples", - "scripts", - "models", - "tests", - "CMakeLists.txt", - "Makefile" - ], - sources: sources, + dependencies: ["llama_cpp"], + path: "ggml", + sources: ggmlSources, resources: resources, - publicHeadersPath: "spm-headers", cSettings: cSettings, - linkerSettings: linkerSettings - ) + linkerSettings: linkerSettings), + .target(name: "LlamaObjC", + dependencies: ["llama"], + path: "objc", + sources: [ + "GPTParams.mm", + "GPTSampler.mm", + "LlamaBatch.mm", + "LlamaObjC.mm", + "LlamaModel.mm", + "LlamaContext.mm", + "LlamaSession.mm", + ], + publicHeadersPath: "include", + cSettings: cSettings, + linkerSettings: linkerSettings), + .macro( + name: "JSONSchemaMacros", + dependencies: [ + .product(name: "SwiftSyntax", package: "swift-syntax"), + .product(name: "SwiftSyntaxMacros", package: 
"swift-syntax"), + .product(name: "SwiftCompilerPlugin", package: "swift-syntax"), + ], + path: "swift/JSONSchemaMacros" + ), + .target( + name: "JSONSchema", + dependencies: ["JSONSchemaMacros"], + path: "swift/JSONSchema" + ), + .target( + name: "LlamaKit", + dependencies: ["JSONSchema", "LlamaObjC"], + path: "swift/LlamaKit" + ), + .testTarget(name: "LlamaKitTests", + dependencies: ["LlamaKit", "JSONSchema", "JSONSchemaMacros"], + path: "swift/test", + linkerSettings: [ + .linkedFramework("XCTest"), + .linkedFramework("Testing")]), + .executableTarget(name: "LlamaKitMain", + dependencies: ["LlamaKit"], + path: "swift/main", + resources: [.process("Llama-3.2-3B-Instruct-Q4_0.gguf")]), ], - cxxLanguageStandard: .cxx11 + cxxLanguageStandard: .cxx17 ) diff --git a/common/common.h b/common/common.h index cb87c4479..b0fdb63ce 100644 --- a/common/common.h +++ b/common/common.h @@ -34,10 +34,10 @@ struct llama_lora_adapter_container : llama_lora_adapter_info { }; // build info -extern int LLAMA_BUILD_NUMBER; -extern char const * LLAMA_COMMIT; -extern char const * LLAMA_COMPILER; -extern char const * LLAMA_BUILD_TARGET; +static int LLAMA_BUILD_NUMBER = 0; +static char const * LLAMA_COMMIT = ""; +static char const * LLAMA_COMPILER = ""; +static char const * LLAMA_BUILD_TARGET = ""; struct llama_control_vector_load_info; diff --git a/objc/GPTParams.mm b/objc/GPTParams.mm new file mode 100644 index 000000000..6d1c2f5bd --- /dev/null +++ b/objc/GPTParams.mm @@ -0,0 +1,726 @@ +#import +#import "GPTParams_Private.hpp" +#import "../common/common.h" +#import "ggml.h" + +@implementation GGMLThreadpool { + ggml_threadpool *threadpool; +} + +- (instancetype)initWithThreadpool:(ggml_threadpool *)threadpool +{ + self = [super init]; + if (self) { + self->threadpool = threadpool; + } + return self; +} + +- (ggml_threadpool *)threadpool { + return threadpool; +} + +@end + +@implementation GGMLThreadpoolParams { + ggml_threadpool_params params; +} + +- (BOOL)getCpuMaskAtIndex:(NSUInteger)index { + abort(); +} + +- (void)setCpuMask:(BOOL)value atIndex:(NSUInteger)index { + abort(); +} + +- (instancetype)initWithParams:(ggml_threadpool_params&&)params +{ + self = [super init]; + if (self) { + self->params = params; + } + return self; +} + +- (BOOL)isEqual:(id)other { + GGMLThreadpoolParams *rhs = (GGMLThreadpoolParams *)other; + ggml_threadpool_params rhs_params = rhs->params; + return ggml_threadpool_params_match(¶ms, &rhs_params); +} + +- (GGMLThreadpool *)threadpool { + auto tp = ggml_threadpool_new(¶ms); + return [[GGMLThreadpool alloc] initWithThreadpool:tp]; +} +@end + +@implementation CPUParams { + cpu_params *params; +} + +- (instancetype)initWithParams:(cpu_params&)params; +{ + self = [super init]; + if (self) { + self->params = ¶ms; + } + return self; +} + +- (int)nThreads { + return params->n_threads; +} + +- (void)setNThreads:(int)nThreads { + params->n_threads = nThreads; +} + +- (BOOL)maskValid { + return params->mask_valid; +} + +- (void)setMaskValid:(BOOL)maskValid { + params->mask_valid = maskValid; +} + +- (GGMLSchedPriority)priority { + return GGMLSchedPriority(params->priority); +} + +- (void)setPriority:(GGMLSchedPriority)priority { + params->priority = ggml_sched_priority(priority); +} + +- (BOOL)strictCPU { + return params->strict_cpu; +} + +- (void)setStrictCPU:(BOOL)strictCPU { + params->strict_cpu = strictCPU; +} + +- (uint32_t)poll { + return params->poll; +} + +- (void)setPoll:(uint32_t)poll { + params->poll = poll; +} + +- (BOOL)getCpuMaskAtIndex:(NSUInteger)index { + return 
params->cpumask[index]; +} + +- (void)setCpuMask:(BOOL)value atIndex:(NSUInteger)index { + params->cpumask[index] = value; +} + +- (GGMLThreadpoolParams *)ggmlThreadpoolParams { + return [[GGMLThreadpoolParams alloc] initWithParams:ggml_threadpool_params_from_cpu_params(*params)]; +} + +@end + +@implementation GPTSamplerParams { + gpt_sampler_params *gpt_sampler_params; +} + +- (instancetype)initWithParams:(gpt_sampler_params&)params { + self = [super init]; + if (self) { + gpt_sampler_params = ¶ms; + } + return self; +} + +// Getters and setters for Objective-C properties, which manipulate the C++ struct + +- (uint32_t)seed { + return gpt_sampler_params->seed; +} + +- (void)setSeed:(uint32_t)seed { + gpt_sampler_params->seed = seed; +} + +- (int32_t)nPrev { + return gpt_sampler_params->n_prev; +} + +- (void)setNPrev:(int32_t)nPrev { + gpt_sampler_params->n_prev = nPrev; +} + +- (int32_t)nProbs { + return gpt_sampler_params->n_probs; +} + +- (void)setNProbs:(int32_t)nProbs { + gpt_sampler_params->n_probs = nProbs; +} + +- (int32_t)minKeep { + return gpt_sampler_params->min_keep; +} + +- (void)setMinKeep:(int32_t)minKeep { + gpt_sampler_params->min_keep = minKeep; +} + +- (int32_t)topK { + return gpt_sampler_params->top_k; +} + +- (void)setTopK:(int32_t)topK { + gpt_sampler_params->top_k = topK; +} + +- (float)topP { + return gpt_sampler_params->top_p; +} + +- (void)setTopP:(float)topP { + gpt_sampler_params->top_p = topP; +} + +- (float)minP { + return gpt_sampler_params->min_p; +} + +- (void)setMinP:(float)minP { + gpt_sampler_params->min_p = minP; +} + +- (float)tfsZ { + return gpt_sampler_params->tfs_z; +} + +- (void)setTfsZ:(float)tfsZ { + gpt_sampler_params->tfs_z = tfsZ; +} + +- (float)typP { + return gpt_sampler_params->typ_p; +} + +- (void)setTypP:(float)typP { + gpt_sampler_params->typ_p = typP; +} + +- (float)temp { + return gpt_sampler_params->temp; +} + +- (void)setTemp:(float)temp { + gpt_sampler_params->temp = temp; +} + +- (float)dynatempRange { + return gpt_sampler_params->dynatemp_range; +} + +- (void)setDynatempRange:(float)dynatempRange { + gpt_sampler_params->dynatemp_range = dynatempRange; +} + +- (float)dynatempExponent { + return gpt_sampler_params->dynatemp_exponent; +} + +- (void)setDynatempExponent:(float)dynatempExponent { + gpt_sampler_params->dynatemp_exponent = dynatempExponent; +} + +- (int32_t)penaltyLastN { + return gpt_sampler_params->penalty_last_n; +} + +- (void)setPenaltyLastN:(int32_t)penaltyLastN { + gpt_sampler_params->penalty_last_n = penaltyLastN; +} + +- (float)penaltyRepeat { + return gpt_sampler_params->penalty_repeat; +} + +- (void)setPenaltyRepeat:(float)penaltyRepeat { + gpt_sampler_params->penalty_repeat = penaltyRepeat; +} + +- (float)penaltyFreq { + return gpt_sampler_params->penalty_freq; +} + +- (void)setPenaltyFreq:(float)penaltyFreq { + gpt_sampler_params->penalty_freq = penaltyFreq; +} + +- (float)penaltyPresent { + return gpt_sampler_params->penalty_present; +} + +- (void)setPenaltyPresent:(float)penaltyPresent { + gpt_sampler_params->penalty_present = penaltyPresent; +} + +- (int32_t)mirostat { + return gpt_sampler_params->mirostat; +} + +- (void)setMirostat:(int32_t)mirostat { + gpt_sampler_params->mirostat = mirostat; +} + +- (float)mirostatTau { + return gpt_sampler_params->mirostat_tau; +} + +- (void)setMirostatTau:(float)mirostatTau { + gpt_sampler_params->mirostat_tau = mirostatTau; +} + +- (float)mirostatEta { + return gpt_sampler_params->mirostat_eta; +} + +- (void)setMirostatEta:(float)mirostatEta { + 
gpt_sampler_params->mirostat_eta = mirostatEta; +} + +- (BOOL)penalizeNl { + return gpt_sampler_params->penalize_nl; +} + +- (void)setPenalizeNl:(BOOL)penalizeNl { + gpt_sampler_params->penalize_nl = penalizeNl; +} + +- (BOOL)ignoreEos { + return gpt_sampler_params->ignore_eos; +} + +- (void)setIgnoreEos:(BOOL)ignoreEos { + gpt_sampler_params->ignore_eos = ignoreEos; +} + +- (BOOL)noPerf { + return gpt_sampler_params->no_perf; +} + +- (void)setNoPerf:(BOOL)noPerf { + gpt_sampler_params->no_perf = noPerf; +} + +// For `samplers`, convert from NSArray to std::vector +- (NSArray *)samplers { + NSMutableArray *samplersArray = [NSMutableArray array]; + for (auto sampler : gpt_sampler_params->samplers) { + [samplersArray addObject:@(sampler)]; + } + return [samplersArray copy]; +} + +- (void)setSamplers:(NSArray *)samplers { + gpt_sampler_params->samplers.clear(); + for (NSNumber *sampler in samplers) { + gpt_sampler_params->samplers.push_back(static_cast(sampler.intValue)); + } +} + +//// For `logitBias`, convert from NSArray to std::vector +//- (NSArray *)logitBias { +// NSMutableArray *logitBiasArray = [NSMutableArray array]; +// for (auto bias : gpt_sampler_params.logit_bias) { +// [logitBiasArray addObject:bias]; +// } +// return [logitBiasArray copy]; +//} +// +//- (void)setLogitBias:(NSArray *)logitBias { +// gpt_sampler_params.logit_bias.clear(); +// for (NSNumber *bias in logitBias) { +// gpt_sampler_params.logit_bias.push_back(bias.floatValue); +// } +//} + +// For `grammar`, convert between NSString and std::string +- (NSString *)grammar { + return [NSString stringWithUTF8String:gpt_sampler_params->grammar.c_str()]; +} + +- (void)setGrammar:(NSString *)grammar { + gpt_sampler_params->grammar = std::string([grammar UTF8String]); +} + +// Method to print out the parameters as a string +- (NSString *)print { + NSMutableString *output = [NSMutableString stringWithString:@"GPT Sampler Params:\n"]; + [output appendFormat:@"Seed: %u\n", self.seed]; + [output appendFormat:@"nPrev: %d\n", self.nPrev]; + [output appendFormat:@"nProbs: %d\n", self.nProbs]; + [output appendFormat:@"minKeep: %d\n", self.minKeep]; + [output appendFormat:@"topK: %d\n", self.topK]; + [output appendFormat:@"topP: %.2f\n", self.topP]; + [output appendFormat:@"minP: %.2f\n", self.minP]; + [output appendFormat:@"tfsZ: %.2f\n", self.tfsZ]; + [output appendFormat:@"typP: %.2f\n", self.typP]; + [output appendFormat:@"temp: %.2f\n", self.temp]; + [output appendFormat:@"dynatempRange: %.2f\n", self.dynatempRange]; + [output appendFormat:@"dynatempExponent: %.2f\n", self.dynatempExponent]; + [output appendFormat:@"penaltyLastN: %d\n", self.penaltyLastN]; + [output appendFormat:@"penaltyRepeat: %.2f\n", self.penaltyRepeat]; + [output appendFormat:@"penaltyFreq: %.2f\n", self.penaltyFreq]; + [output appendFormat:@"penaltyPresent: %.2f\n", self.penaltyPresent]; + [output appendFormat:@"mirostat: %d\n", self.mirostat]; + [output appendFormat:@"mirostatTau: %.2f\n", self.mirostatTau]; + [output appendFormat:@"mirostatEta: %.2f\n", self.mirostatEta]; + [output appendFormat:@"penalizeNl: %@\n", self.penalizeNl ? @"YES" : @"NO"]; + [output appendFormat:@"ignoreEos: %@\n", self.ignoreEos ? @"YES" : @"NO"]; + [output appendFormat:@"noPerf: %@\n", self.noPerf ? 
@"YES" : @"NO"]; + [output appendFormat:@"Grammar: %@\n", self.grammar]; + + // Print samplers + [output appendString:@"Samplers: "]; + for (NSNumber *sampler in self.samplers) { + [output appendFormat:@"%d, ", sampler.intValue]; + } + [output appendString:@"\n"]; + + // Print logit biases + [output appendString:@"Logit Biases: "]; + for (NSNumber *bias in self.logitBias) { + [output appendFormat:@"%.2f, ", bias.floatValue]; + } + [output appendString:@"\n"]; + + return [output copy]; +} + +- (gpt_sampler_params&)cParams { + return *gpt_sampler_params; +} + +@end + +@implementation GPTParams { + gpt_params gpt_params; +} + +- (NSArray *)antiPrompts { + auto antiprompts = [[NSMutableArray alloc] init]; + for (auto& antiprompt : gpt_params.antiprompt) { + [antiprompts addObject:[NSString stringWithCString:antiprompt.c_str() encoding:NSUTF8StringEncoding]]; + } + return antiprompts; +} + +- (gpt_params&)params { + return gpt_params; +} + +- (int32_t)nPredict { + return gpt_params.n_predict; +} + +- (void)setNPredict:(int32_t)nPredict { + gpt_params.n_predict = nPredict; +} + +- (NSInteger)nCtx { + return gpt_params.n_ctx; +} + +- (void)setNCtx:(NSInteger)nCtx { + gpt_params.n_ctx = nCtx; +} + +- (int32_t)nBatch { + return gpt_params.n_batch; +} + +- (void)setNBatch:(int32_t)nBatch { + gpt_params.n_batch = nBatch; +} + +- (int32_t)nUBatch { + return gpt_params.n_ubatch; +} + +- (void)setNUBatch:(int32_t)nUBatch { + gpt_params.n_ubatch = nUBatch; +} + +- (int32_t)nKeep { + return gpt_params.n_keep; +} + +- (void)setNKeep:(int32_t)nKeep { + gpt_params.n_keep = nKeep; +} + +- (int32_t)nDraft { + return gpt_params.n_draft; +} + +- (void)setNDraft:(int32_t)nDraft { + gpt_params.n_draft = nDraft; +} + +- (int32_t)nChunks { + return gpt_params.n_chunks; +} + +- (void)setNChunks:(int32_t)nChunks { + gpt_params.n_chunks = nChunks; +} + +- (int32_t)nParallel { + return gpt_params.n_parallel; +} + +- (void)setNParallel:(int32_t)nParallel { + gpt_params.n_parallel = nParallel; +} + +- (int32_t)nSequences { + return gpt_params.n_sequences; +} + +- (void)setNSequences:(int32_t)nSequences { + gpt_params.n_sequences = nSequences; +} + +- (float)pSplit { + return gpt_params.p_split; +} + +- (void)setPSplit:(float)pSplit { + gpt_params.p_split = pSplit; +} + +- (int32_t)nGpuLayers { + return gpt_params.n_gpu_layers; +} + +- (void)setNGpuLayers:(int32_t)nGpuLayers { + gpt_params.n_gpu_layers = nGpuLayers; +} + +- (int32_t)nGpuLayersDraft { + return gpt_params.n_gpu_layers_draft; +} + +- (void)setNGpuLayersDraft:(int32_t)nGpuLayersDraft { + gpt_params.n_gpu_layers_draft = nGpuLayersDraft; +} + +- (int32_t)mainGpu { + return gpt_params.main_gpu; +} + +- (void)setMainGpu:(int32_t)mainGpu { + gpt_params.main_gpu = mainGpu; +} + +- (int32_t)grpAttnN { + return gpt_params.grp_attn_n; +} + +- (void)setGrpAttnN:(int32_t)grpAttnN { + gpt_params.grp_attn_n = grpAttnN; +} + +- (int32_t)grpAttnW { + return gpt_params.grp_attn_w; +} + +- (void)setGrpAttnW:(int32_t)grpAttnW { + gpt_params.grp_attn_w = grpAttnW; +} + +- (int32_t)nPrint { + return gpt_params.n_print; +} + +- (void)setNPrint:(int32_t)nPrint { + gpt_params.n_print = nPrint; +} + +- (float)ropeFreqBase { + return gpt_params.rope_freq_base; +} + +- (void)setRopeFreqBase:(float)ropeFreqBase { + gpt_params.rope_freq_base = ropeFreqBase; +} + +- (float)ropeFreqScale { + return gpt_params.rope_freq_scale; +} + +- (void)setRopeFreqScale:(float)ropeFreqScale { + gpt_params.rope_freq_scale = ropeFreqScale; +} + +- (float)yarnExtFactor { + return 
gpt_params.yarn_ext_factor; +} + +- (void)setYarnExtFactor:(float)yarnExtFactor { + gpt_params.yarn_ext_factor = yarnExtFactor; +} + +- (float)yarnAttnFactor { + return gpt_params.yarn_attn_factor; +} + +- (void)setYarnAttnFactor:(float)yarnAttnFactor { + gpt_params.yarn_attn_factor = yarnAttnFactor; +} + +- (float)yarnBetaFast { + return gpt_params.yarn_beta_fast; +} + +- (void)setYarnBetaFast:(float)yarnBetaFast { + gpt_params.yarn_beta_fast = yarnBetaFast; +} + +- (float)yarnBetaSlow { + return gpt_params.yarn_beta_slow; +} + +- (void)setYarnBetaSlow:(float)yarnBetaSlow { + gpt_params.yarn_beta_slow = yarnBetaSlow; +} + +- (int32_t)yarnOrigCtx { + return gpt_params.yarn_orig_ctx; +} + +- (void)setYarnOrigCtx:(int32_t)yarnOrigCtx { + gpt_params.yarn_orig_ctx = yarnOrigCtx; +} + +- (float)defragThold { + return gpt_params.defrag_thold; +} + +- (void)setDefragThold:(float)defragThold { + gpt_params.defrag_thold = defragThold; +} + +// Assuming tensorSplit remains a fixed array in C struct, we can create a method to access specific values. +- (float)tensorSplitAtIndex:(NSUInteger)index { + if (index < 128) { + return gpt_params.tensor_split[index]; + } + return 0.0f; // Return default value if index is out of bounds +} + +- (void)setTensorSplitValue:(float)value atIndex:(NSUInteger)index { + if (index < 128) { + gpt_params.tensor_split[index] = value; + } +} + +- (BOOL)embedding { + return gpt_params.embedding; +} + +- (void)setEmbedding:(BOOL)embedding { + gpt_params.embedding = embedding; +} + +- (LlamaModelParams *)LlamaModelParams { + return nil; +} + +- (BOOL)ctxShift { + return gpt_params.ctx_shift; +} + +- (void)setCtxShift:(BOOL)ctxShift { + gpt_params.ctx_shift = ctxShift; +} + +- (CPUParams *)cpuParams { + return [[CPUParams alloc] initWithParams:gpt_params.cpuparams]; +} + +- (CPUParams *)cpuParamsBatch { + return [[CPUParams alloc] initWithParams:gpt_params.cpuparams_batch]; +} + +- (GPTSamplerParams *)samplerParams { + return [[GPTSamplerParams alloc] initWithParams:gpt_params.sparams]; +} + +- (NSString *)modelURL { + return [NSString stringWithCString:gpt_params.model_url.c_str() encoding:NSUTF8StringEncoding]; +} + +- (void)setModelURL:(NSString *)modelURL { + gpt_params.model_url = [modelURL cStringUsingEncoding:NSUTF8StringEncoding]; +} + +- (NSString *)modelPath { + return [NSString stringWithCString:gpt_params.model.c_str() encoding:NSUTF8StringEncoding]; +} + +- (void)setModelPath:(NSString *)modelPath { + gpt_params.model = [modelPath cStringUsingEncoding:NSUTF8StringEncoding]; +} + +- (NSString *)pathPromptCache { + return [[NSString alloc] initWithCString:gpt_params.path_prompt_cache.c_str() encoding:NSUTF8StringEncoding]; +} + +- (void)setPathPromptCache:(NSString *)pathPromptCache { + gpt_params.path_prompt_cache = [pathPromptCache cStringUsingEncoding:NSUTF8StringEncoding]; +} + +- (BOOL)enableChatTemplate { + return gpt_params.enable_chat_template; +} + +- (void)setEnableChatTemplate:(BOOL)enableChatTemplate { + gpt_params.enable_chat_template = enableChatTemplate; +} + +- (NSString *)chatTemplate { + return [NSString stringWithCString:gpt_params.chat_template.c_str() + encoding:NSUTF8StringEncoding]; +} + +- (void)setChatTemplate:(NSString *)chatTemplate { + gpt_params.chat_template = [chatTemplate cStringUsingEncoding:NSUTF8StringEncoding]; +} + +- (NSString *)inputPrefix { + return [NSString stringWithCString:gpt_params.input_prefix.c_str() + encoding:NSUTF8StringEncoding]; +} + +- (void)setInputPrefix:(NSString *)inputPrefix { + gpt_params.input_prefix = 
[inputPrefix cStringUsingEncoding:NSUTF8StringEncoding]; +} + +- (NSString *)inputSuffix { + return [NSString stringWithCString:gpt_params.input_suffix.c_str() + encoding:NSUTF8StringEncoding]; +} + +- (void)setInputSuffix:(NSString *)inputSuffix { + gpt_params.input_suffix = [inputSuffix cStringUsingEncoding:NSUTF8StringEncoding]; +} + + +- (LlamaContextParams *)llamaContextParams { +} + +- (LlamaModelParams *)llamaModelParams { +} + +@end diff --git a/objc/GPTSampler.mm b/objc/GPTSampler.mm new file mode 100644 index 000000000..af318c883 --- /dev/null +++ b/objc/GPTSampler.mm @@ -0,0 +1,49 @@ +#import +#import +#import +#import +#import +#import "../../common/sampling.h" + +@implementation GPTSampler { + gpt_sampler *sampler; +} + +- (instancetype)init:(LlamaModel *)model gptSamplerParams:(GPTSamplerParams *)gptSamplerParams +{ + self = [super init]; + if (self) { + self->sampler = gpt_sampler_init([model cModel], [gptSamplerParams cParams]); + } + return self; +} + +- (uint32_t)seed { + return gpt_sampler_get_seed(sampler); +} + +- (LlamaToken)sample:(LlamaContext *)context index:(NSInteger)index { + return [self sample:context index:index grammarFirst:false]; +} + +- (LlamaToken)sample:(LlamaContext *)context index:(NSInteger)index grammarFirst:(BOOL)grammarFirst { + return gpt_sampler_sample(sampler, [context cContext], index, grammarFirst); +} + +- (void)accept:(LlamaToken)token acceptGrammar:(BOOL)acceptGrammar { + gpt_sampler_accept(sampler, token, acceptGrammar); +} + +- (NSString *)previousString:(LlamaContext *)context n:(NSInteger)n { + return [[NSString alloc] initWithCString:gpt_sampler_prev_str(sampler, [context cContext], n).data() encoding:NSUTF8StringEncoding]; +} + +- (LlamaToken)last { + return gpt_sampler_last(sampler); +} + +- (void)reset { + gpt_sampler_reset(sampler); +} + +@end diff --git a/objc/LlamaBatch.mm b/objc/LlamaBatch.mm new file mode 100644 index 000000000..6d3acab41 --- /dev/null +++ b/objc/LlamaBatch.mm @@ -0,0 +1,21 @@ +#import +#import "LlamaBatch_Private.hpp" +#import "llama.h" + +@implementation LlamaBatch { + llama_batch batch; +} + +- (instancetype)initWithBatch:(llama_batch)batch { + self->batch = batch; +} + +- (NSData *)output { + return [[NSData alloc] initWithBytes:batch.logits length:batch.n_tokens]; +} + +- (llama_batch)cBatch { + return batch; +} + +@end diff --git a/objc/LlamaContext.mm b/objc/LlamaContext.mm new file mode 100644 index 000000000..0961ee366 --- /dev/null +++ b/objc/LlamaContext.mm @@ -0,0 +1,94 @@ +#import +#import "LlamaContext_Private.hpp" +#import "GPTParams_Private.hpp" +#import "LlamaModel_Private.hpp" +#import "LlamaBatch_Private.hpp" +#import "../../common/common.h" + +@implementation LlamaContext { + llama_context *ctx; +} + +- (instancetype)initWithContext:(llama_context *)context { + self = [super init]; + if (self) { + ctx = context; + } + return self; +} + +- (void)attachThreadpool:(GGMLThreadpool *)threadpool + threadpoolBatch:(GGMLThreadpool *)threadpoolBatch { + llama_attach_threadpool(ctx, [threadpool threadpool], [threadpoolBatch threadpool]); +} + + +- (NSUInteger)nCtx { + return llama_n_ctx(ctx); +} + +- (BOOL)loadStateFile:(NSString *)pathSession + tokensOut:(llama_token *)tokensOut + nTokenCpacity:(size_t)nTokenCapacity + nTokenCountOut:(size_t *)nTokenCountOut { + return llama_state_load_file(ctx, [pathSession cStringUsingEncoding:NSUTF8StringEncoding], tokensOut, nTokenCapacity, nTokenCountOut); +} + +- (LlamaModel *)model { + auto model = llama_get_model(ctx); + return [[LlamaModel alloc] 
init:std::remove_const_t(model)]; +} + +- (std::vector)tokenize:(NSString *)text +addSpecial:(BOOL)addSpecial +parseSpecial:(BOOL)parseSpecial { + return llama_tokenize(ctx, [text cStringUsingEncoding:NSUTF8StringEncoding], addSpecial, parseSpecial); +} + +- (std::string)convertTokensToString:(const std::vector&)tokens { + return string_from(ctx, tokens); +} + +- (llama_context *)cContext { + return ctx; +} + +- (int32_t)encode:(llama_batch)batch { + return llama_encode(ctx, batch); +} + +- (void)kvCacheSeqAdd:(LlamaSequenceId)sequenceId + p0:(LlamaPosition)p0 + p1:(LlamaPosition)p1 + delta:(LlamaPosition)delta { + llama_kv_cache_seq_add(ctx, sequenceId, p0, p1, delta); +} + +- (void)kvCacheSeqDiv:(LlamaSequenceId)sequenceId + p0:(LlamaPosition)p0 + p1:(LlamaPosition)p1 + delta:(LlamaPosition)delta { + llama_kv_cache_seq_div(ctx, sequenceId, p0, p1, delta); +} + +- (NSString *)tokenToPiece:(LlamaToken)token { + return [self tokenToPiece:token special:YES]; +} + +- (NSString *)tokenToPiece:(LlamaToken)token special:(BOOL)special { + return [[NSString alloc] initWithCString:llama_token_to_piece(ctx, token, special).c_str() encoding:NSUTF8StringEncoding]; +} + +- (NSInteger)decode:(LlamaBatch *)batch { + return llama_decode(ctx, [batch cBatch]); +} + +- (BOOL)saveStateFile:(NSString *)pathSession + tokens:(const LlamaToken *)tokens + nTokenCount:(size_t)nTokenCount { + return llama_state_save_file(ctx, + [pathSession cStringUsingEncoding:NSUTF8StringEncoding], + tokens, nTokenCount); +} + +@end diff --git a/objc/LlamaModel.mm b/objc/LlamaModel.mm new file mode 100644 index 000000000..1637d4d88 --- /dev/null +++ b/objc/LlamaModel.mm @@ -0,0 +1,70 @@ +#import +#import "LlamaModel_Private.hpp" +#import "LlamaContext_Private.hpp" +#import "LlamaBatch_Private.hpp" +#import "GPTParams_Private.hpp" +#import "GPTSampler.h" +#import "ggml.h" +#import "../common/common.h" + +@implementation LlamaChatMessage +@end + +@implementation LlamaModel { + llama_model *model; +} + +- (instancetype)init:(llama_model *)l_model { + self = [super init]; + if (self) { + model = l_model; + } + return self; +} + +- (LlamaContext *)context:(LlamaContextParams *)params { + return nil; +} + +- (BOOL)addBOSToken { + return llama_add_bos_token(model); +} + +- (BOOL)addEOSToken { + return llama_add_eos_token(model); +} + +- (LlamaToken)tokenBOS { + return llama_token_bos(model); +} + +- (int32_t)nCtxTrain { + return llama_n_ctx_train(model); +} + +- (NSString *)formatExample:(NSString *)tmpl { + return [[NSString alloc] initWithCString:llama_chat_format_example(model, [tmpl cStringUsingEncoding:NSUTF8StringEncoding]).c_str() + encoding:NSUTF8StringEncoding]; +} + +- (BOOL)hasEncoder { + return llama_model_has_encoder(model); +} + +- (llama_model *)cModel { + return model; +} + +- (BOOL)tokenIsEOG:(LlamaToken)token { + return llama_token_is_eog(model, token); +} + +- (LlamaToken)tokenEOT { + return llama_token_eot(model); +} + +- (LlamaToken)tokenEOS { + return llama_token_eos(model); +} + +@end diff --git a/objc/LlamaObjC.mm b/objc/LlamaObjC.mm new file mode 100644 index 000000000..7c08ef3c8 --- /dev/null +++ b/objc/LlamaObjC.mm @@ -0,0 +1,2 @@ +#import "LlamaObjC.h" + diff --git a/objc/LlamaSession.mm b/objc/LlamaSession.mm new file mode 100644 index 000000000..cdf359b93 --- /dev/null +++ b/objc/LlamaSession.mm @@ -0,0 +1,906 @@ +#import +#import "LlamaSession_Private.hpp" +#import "../../common/common.h" +#import "LlamaModel_Private.hpp" +#import "LlamaContext_Private.hpp" +#import "GPTSampler.h" +#import +#import "ggml.h" 
+#import "GPTParams_Private.hpp" +#import "LlamaBatch_Private.hpp" + +@implementation BlockingLineQueue { + // Input queue and related synchronization + NSMutableArray *inputQueue; + NSCondition *inputCondition; + + // Output queue and related synchronization + NSMutableArray *outputQueue; + NSCondition *outputCondition; + + // Log queue + NSMutableArray *log; +} + +- (instancetype)init { + if (self = [super init]) { + inputQueue = [NSMutableArray new]; + outputQueue = [NSMutableArray new]; + log = [NSMutableArray new]; + inputCondition = [[NSCondition alloc] init]; + outputCondition = [[NSCondition alloc] init]; + } + return self; +} + +- (void)addInputLine:(NSString *)line { + [inputCondition lock]; + [inputQueue addObject:line]; + [log addObject:line]; + [inputCondition signal]; // Notify that a new input line is available + [inputCondition unlock]; +} + +- (NSString *)inputLine { + [inputCondition lock]; + while ([inputQueue count] == 0) { + [inputCondition wait]; + } + NSString *line = [inputQueue objectAtIndex:0]; + [inputQueue removeObjectAtIndex:0]; + [inputCondition unlock]; + return line; +} + +- (void)addOutputLine:(NSString *)line { + [outputCondition lock]; + [outputQueue addObject:line]; + [log addObject:line]; + [outputCondition signal]; // Notify that a new output line is available + [outputCondition unlock]; +} + +- (NSString *)outputLine { + [outputCondition lock]; + while ([outputQueue count] == 0) { + [outputCondition wait]; + } + NSString *line = [outputQueue objectAtIndex:0]; + [outputQueue removeObjectAtIndex:0]; + [outputCondition unlock]; + return line; +} +@end + +@implementation LlamaSession { + std::vector embd_inp; + std::vector chat_msgs; + GPTParams *params; + GPTSampler *smpl; + BOOL isInteracting; + + bool is_antiprompt; + bool input_echo; + bool display; + bool need_to_save_session; + + int n_past; + int n_remain; + int n_consumed; + int n_session_consumed; + + std::vector input_tokens; + std::vector output_tokens;; + std::ostringstream output_ss; + std::stringstream last_output_ss; + std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode + + std::vector embd; + NSMutableString *pathSession; + NSInteger ga_i; + NSInteger ga_n; + NSInteger ga_w; + std::vector session_tokens; + // tokenized antiprompts + std::vector> antiprompt_ids; + BOOL need_insert_eot; + int n_ctx; +} + +- (NSString *)chat_add_and_format:(std::vector &) chat_msgs role:(const std::string &) role content:(const std::string &) content { + llama_chat_msg new_msg{role, content}; + auto formatted = llama_chat_format_single([self.model cModel], [params params].chat_template, chat_msgs, new_msg, role == "user"); + chat_msgs.push_back({role, content}); + os_log_debug(OS_LOG_DEFAULT, "formatted: '%s'\n", formatted.c_str()); + return [NSString stringWithCString:formatted.c_str() encoding:NSUTF8StringEncoding]; +} + +static BOOL file_is_empty(NSString *path) { + NSFileManager *manager = [NSFileManager defaultManager]; + if ([manager fileExistsAtPath:path]) { + NSDictionary *attributes = [manager attributesOfItemAtPath:path error:nil]; + unsigned long long size = [attributes fileSize]; + if (attributes && size == 0) { + return true; + } else { + return false; + } + } + return true; +} + +- (instancetype)initWithParams:(GPTParams *)params { + self = [super init]; + + self->params = params; + // model = llama_init.model; + // ctx = llama_init.context; + // + // if model == nil { + // LOG_ERR("%s: error: unable to load model\n", __func__); + // return 1; + 
// } + // + os_log_info(OS_LOG_DEFAULT, + "%s: llama threadpool init, n_threads = %d\n", + __func__, params.cpuParams.nThreads); + + if (params.embedding) { + os_log_error(OS_LOG_DEFAULT, + R"(************ + please use the 'embedding' tool for embedding calculations + ************)"); + abort(); + } + + if (params.nCtx != 0 && params.nCtx < 8) { + os_log_info(OS_LOG_DEFAULT, "minimum context size is 8, using minimum size."); + params.nCtx = 8; + } + + if (params.ropeFreqBase != 0) { + os_log_info(OS_LOG_DEFAULT, "changing RoPE frequency base to \(params.ropeFreqBase)"); + } + + if (params.ropeFreqScale != 0.0) { + os_log_info(OS_LOG_DEFAULT, "scaling RoPE frequency by \(params.ropeFreqScale)"); + } + + llama_backend_init(); + llama_numa_init(ggml_numa_strategy(params.numaStrategy)); + auto llama_init = llama_init_from_gpt_params([params params]); + + auto tpp_batch = params.cpuParamsBatch.ggmlThreadpoolParams; + auto tpp = params.cpuParams.ggmlThreadpoolParams; + + set_process_priority(ggml_sched_priority(params.cpuParams.priority)); + + GGMLThreadpool *threadpool_batch; + if (tpp != tpp_batch) { + threadpool_batch = [tpp_batch threadpool]; + if (!threadpool_batch) { + [NSException raise:@"batch threadpool create failed" + format:@"batch threadpool create failed"]; + } + + // Start the non-batch threadpool in the paused state + tpp.paused = true; + } + + GGMLThreadpool *threadpool = [tpp threadpool]; + if (!threadpool) { + [NSException raise:@"threadpool create failed" + format:@"threadpool create failed"]; + } + + self.ctx = [[LlamaContext alloc] initWithContext:llama_init.context]; + [self.ctx attachThreadpool:threadpool threadpoolBatch:threadpool_batch]; + self.model = [[LlamaModel alloc] init:llama_init.model]; + const int n_ctx_train = [self.model nCtxTrain]; + n_ctx = [self.ctx nCtx]; + // + if (n_ctx > n_ctx_train) { + os_log_info(OS_LOG_DEFAULT, "%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); + } + + // print chat template example in conversation mode + if (params.conversation) { + if (params.enableChatTemplate) { + os_log_info(OS_LOG_DEFAULT, "%s: chat template example:\n%s\n", __func__, + [[self.model formatExample:params.chatTemplate] cStringUsingEncoding:NSUTF8StringEncoding]); + } else { + os_log_info(OS_LOG_DEFAULT, "%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); + } + } + // print system information + @autoreleasepool { + NSLog(@"%s", gpt_params_get_system_info([params params]).c_str()); + } + + pathSession = [[NSMutableString alloc] initWithString:params.pathPromptCache]; + + NSFileManager *fileManager = [NSFileManager defaultManager]; + + if ([pathSession length] != 0) { + os_log_info(OS_LOG_DEFAULT, "%s: attempting to load saved session from '%s'\n", __func__, [pathSession cStringUsingEncoding:NSUTF8StringEncoding]); + if (![fileManager fileExistsAtPath:pathSession]) { + os_log_info(OS_LOG_DEFAULT, "%s: session file does not exist, will create.\n", __func__); + } else if (file_is_empty(pathSession)) { + os_log_info(OS_LOG_DEFAULT,"%s: The session file is empty. 
A new session will be initialized.\n", __func__); + } else { + // The file exists and is not empty + session_tokens.resize(n_ctx); + size_t n_token_count_out = 0; + if (![self.ctx loadStateFile:pathSession tokensOut:session_tokens.data() nTokenCpacity:session_tokens.capacity() nTokenCountOut:&n_token_count_out]) { + [NSException raise:@"SessionLoadFailure" format:@"%s: failed to load session file '%s'\n", __func__, [pathSession cStringUsingEncoding:NSUTF8StringEncoding]]; + } + session_tokens.resize(n_token_count_out); + os_log_info(OS_LOG_DEFAULT,"%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); + } + } + + BOOL addBOS = [self.model addBOSToken]; + if (![self.model hasEncoder]) { + GGML_ASSERT(![self.model addEOSToken]); + } + + os_log_debug(OS_LOG_DEFAULT, "n_ctx: %d, add_bos: %d\n", n_ctx, addBOS); + + + { + auto prompt = (params.conversation && params.enableChatTemplate && params.prompt.length > 0) + ? [self chat_add_and_format:chat_msgs role:"system" content:[params params].prompt] // format the system prompt in conversation mode + : params.prompt; + if (params.interactiveFirst || [params.prompt length] > 0 || session_tokens.empty()) { + os_log_debug(OS_LOG_DEFAULT, "tokenize the prompt\n"); + embd_inp = [self.ctx tokenize:prompt addSpecial:true parseSpecial:true]; + } else { + os_log_debug(OS_LOG_DEFAULT,"use session tokens\n"); + embd_inp = session_tokens; + } + + os_log_debug(OS_LOG_DEFAULT,"prompt: \"%s\"\n", [prompt cStringUsingEncoding:NSUTF8StringEncoding]); + os_log_debug(OS_LOG_DEFAULT,"tokens: %s\n", [self.ctx convertTokensToString:embd_inp].c_str()); + } + + // Should not run without any tokens + if (embd_inp.empty()) { + if (addBOS) { + embd_inp.push_back([self.model tokenBOS]); +// LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); + } else { + [NSException raise:@"InputEmptyError" format:@"input is empty"]; + } + } + + // Tokenize negative prompt + if (embd_inp.size() > n_ctx - 4) { + [NSException raise:@"PromptError" format:@"%s: prompt is too long (%d tokens, max %d)\n", __func__, (int)embd_inp.size(), n_ctx - 4]; + } + + // debug message about similarity of saved session, if applicable + size_t n_matching_session_tokens = 0; + if (!session_tokens.empty()) { + for (llama_token id : session_tokens) { + if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) { + break; + } + n_matching_session_tokens++; + } + if ([params.prompt length] == 0 && n_matching_session_tokens == embd_inp.size()) { +// LOG_INF("%s: using full prompt from session file\n", __func__); + } else if (n_matching_session_tokens >= embd_inp.size()) { +// LOG_INF("%s: session file has exact match for prompt!\n", __func__); + } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { +// LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", +// __func__, n_matching_session_tokens, embd_inp.size()); + } else { +// LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n", +// __func__, n_matching_session_tokens, embd_inp.size()); + } + + // remove any "future" tokens that we might have inherited from the previous session + llama_kv_cache_seq_rm([self.ctx cContext], -1, n_matching_session_tokens, -1); + } + // + // os_log_debug(OS_LOG_DEFAULT, "recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", + // embd_inp.size(), 
n_matching_session_tokens, embd_inp.size(), session_tokens.size()); + // + // if we will use the cache for the full prompt without reaching the end of the cache, force + // reevaluation of the last token to recalculate the cached logits + if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { +// os_log_debug(OS_LOG_DEFAULT, "recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1); + + session_tokens.resize(embd_inp.size() - 1); + } + + // number of tokens to keep when resetting context + if (params.nKeep < 0 || params.nKeep > (int) embd_inp.size()) { + params.nKeep = (int)embd_inp.size(); + } else { + params.nKeep += addBOS; // always keep the BOS token + } + + if (params.conversation) { + params.interactiveFirst = true; + } + + // enable interactive mode if interactive start is specified + if (params.interactiveFirst) { + params.interactive = true; + } + + if (params.verbosePrompt) { +// LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); +// LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int) embd_inp.size(); i++) { + os_log_info(OS_LOG_DEFAULT, "%6d -> '%s'\n", embd_inp[i], + [[self.ctx tokenToPiece:embd_inp[i]] cStringUsingEncoding:NSUTF8StringEncoding]); + } + + if (params.nKeep > addBOS) { +// LOG_INF("%s: static prompt based on n_keep: '", __func__); + for (int i = 0; i < params.nKeep; i++) { + os_log_debug(OS_LOG_DEFAULT, "%s", + [[self.ctx tokenToPiece:embd_inp[i]] cStringUsingEncoding:NSUTF8StringEncoding]); + } +// LOG("'\n"); + } +// LOG_INF("\n"); + } + // + // // ctrl+C handling + // { + //#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + // struct sigaction sigint_action; + // sigint_action.sa_handler = sigint_handler; + // sigemptyset (&sigint_action.sa_mask); + // sigint_action.sa_flags = 0; + // sigaction(SIGINT, &sigint_action, NULL); + //#elif defined (_WIN32) + // auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + // return (ctrl_type == CTRL_C_EVENT) ? 
(sigint_handler(SIGINT), true) : false; + // }; + // SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); + //#endif + // } + // + if (params.interactive) { + os_log_info(OS_LOG_DEFAULT, "%s: interactive mode on.\n", __func__); + + if ([params.antiPrompts count] > 0) { + for (NSString *antiprompt in params.antiPrompts) { + os_log_info(OS_LOG_DEFAULT, "Reverse prompt: '%s'\n", [antiprompt cStringUsingEncoding:NSUTF8StringEncoding]); + if (params.verbosePrompt) { + auto tmp = [_ctx tokenize:antiprompt + addSpecial:false + parseSpecial:true]; + for (int i = 0; i < (int) tmp.size(); i++) { + os_log_info(OS_LOG_DEFAULT, "%6d -> '%s'\n", tmp[i], [[self.ctx tokenToPiece:tmp[i]] cStringUsingEncoding:NSUTF8StringEncoding]); + } + } + } + } + + if (params.inputPrefixBOS) { + os_log_info(OS_LOG_DEFAULT, "Input prefix with BOS\n"); + } + + if ([params.inputPrefix length] > 0) { + os_log_info(OS_LOG_DEFAULT, "Input prefix: '%s'\n", [params.inputPrefix cStringUsingEncoding:NSUTF8StringEncoding]); + if (params.verbosePrompt) { + auto tmp = [_ctx tokenize:params.inputPrefix addSpecial:true parseSpecial:true]; + for (int i = 0; i < (int) tmp.size(); i++) { + os_log_info(OS_LOG_DEFAULT, "%6d -> '%s'\n", + tmp[i], [[self.ctx tokenToPiece:tmp[i]] cStringUsingEncoding:NSUTF8StringEncoding]); + } + } + } + + if ([params.inputSuffix length] > 0) { + os_log_info(OS_LOG_DEFAULT, "Input suffix: '%s'\n", [params.inputSuffix cStringUsingEncoding:NSUTF8StringEncoding]); + if (params.verbosePrompt) { + auto tmp = [_ctx tokenize:params.inputSuffix addSpecial:false parseSpecial:true]; + for (int i = 0; i < (int) tmp.size(); i++) { + os_log_info(OS_LOG_DEFAULT, "%6d -> '%s'\n", + tmp[i], [[self.ctx tokenToPiece:tmp[i]] cStringUsingEncoding:NSUTF8StringEncoding]); + } + } + } + } + + smpl = [[GPTSampler alloc] init:_model gptSamplerParams:[params samplerParams]]; + if (!smpl) { + [NSException raise:@"SamplingFailure" format:@"failed to initialize sampling subsystem"]; + } + + os_log_info(OS_LOG_DEFAULT, "sampler seed: %u\n", [smpl seed]); + // LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); + // LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str()); + // + // LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + // + // group-attention state + // number of grouped KV tokens so far (used only if params.grp_attn_n > 1) + + ga_n = params.grpAttnN; + ga_w = params.grpAttnW; + + if (ga_n != 1) { + GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT + GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT + //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT + //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT + os_log_info(OS_LOG_DEFAULT, "self-extend: n_ctx_train = %d, grp_attn_n = %ld, grp_attn_w = %ld\n", n_ctx_train, static_cast(ga_n), static_cast(ga_w)); + } + + if (params.interactive) { + const char * control_message; + if (params.multilineInput) { + control_message = " - To return control to the AI, end your input with '\\'.\n" + " - To return control without starting a new line, end your input with '/'.\n"; + } else { + control_message = " - Press Return to return control to the AI.\n" + " - To return control without starting a new line, end your input with '/'.\n" + " - If you want to submit another line, end your input with '\\'.\n"; + } + + 
isInteracting = params.interactiveFirst; + } + + is_antiprompt = false; + input_echo = true; + display = true; + need_to_save_session = [pathSession length] > 0 && n_matching_session_tokens < embd_inp.size(); + n_remain = params.nPredict; + + // // the first thing we will do is to output the prompt, so set color accordingly + // console::set_display(console::prompt); + // display = params.display_prompt; + // + + + + + antiprompt_ids.reserve([params.antiPrompts count]); + for (NSString *antiprompt in params.antiPrompts) { + antiprompt_ids.emplace_back([self.ctx tokenize:antiprompt addSpecial:false parseSpecial:true]); + } + + if ([self.model hasEncoder]) { + int enc_input_size = embd_inp.size(); + llama_token * enc_input_buf = embd_inp.data(); + + if ([_ctx encode:llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0)]) { + [NSException raise:@"EvalFailure" format:@"failed to eval"]; + } + + llama_token decoder_start_token_id = llama_model_decoder_start_token([self.model cModel]); + if (decoder_start_token_id == -1) { + decoder_start_token_id = [self.model tokenBOS]; + } + + embd_inp.clear(); + embd_inp.push_back(decoder_start_token_id); + } + return self; +} + +- (void)start:(BlockingLineQueue *)queue { + while ((n_remain != 0 && !is_antiprompt) || params.interactive) { + // predict + if (!embd.empty()) { + // Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via + // --prompt or --file which uses the same value. + int max_embd_size = n_ctx - 4; + + // Ensure the input doesn't exceed the context size by truncating embd if necessary. + if ((int) embd.size() > max_embd_size) { + const int skipped_tokens = (int) embd.size() - max_embd_size; + embd.resize(max_embd_size); + +// console::set_display(console::error); + os_log_error(OS_LOG_DEFAULT, "<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); +// console::set_display(console::reset); + } + + if (params.grpAttnN == 1) { + // infinite text generation via context shifting + // if we run out of context: + // - take the n_keep first tokens from the original prompt (via n_past) + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches + + if (n_past + (int) embd.size() >= [_ctx nCtx]) { + if (!params.ctxShift) { + os_log_debug(OS_LOG_DEFAULT, "\n\n%s: context full and context shift is disabled => stopping\n", __func__); + break; + } else { + if (params.nPredict == -2) { + os_log_debug(OS_LOG_DEFAULT, "\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.nPredict); + break; + } + + const int n_left = n_past - params.nKeep; + const int n_discard = n_left/2; + + os_log_debug(OS_LOG_DEFAULT, "context full, swapping: n_past = %d, n_left = %d, n_ctx = %lu, n_keep = %d, n_discard = %d\n", + n_past, n_left, static_cast([_ctx nCtx]), params.nKeep, n_discard); + + llama_kv_cache_seq_rm ([self.ctx cContext], 0, params.nKeep , params.nKeep + n_discard); + llama_kv_cache_seq_add([self.ctx cContext], 0, params.nKeep + n_discard, n_past, -n_discard); + + n_past -= n_discard; + + os_log_debug(OS_LOG_DEFAULT, "after swap: n_past = %d\n", n_past); + + os_log_debug(OS_LOG_DEFAULT, "embd: %s\n", [self.ctx convertTokensToString:embd].c_str()); + + os_log_debug(OS_LOG_DEFAULT, "clear session path\n"); + [pathSession setString:@""]; + } + } + } else { + // context extension via Self-Extend + while (n_past >= ga_i + ga_w) { + const int ib = (ga_n*ga_i)/ga_w; + const int bd = (ga_w/ga_n)*(ga_n - 1); + const int dd = (ga_w/ga_n) - ib*bd - ga_w; + + os_log_debug(OS_LOG_DEFAULT, "\n"); + os_log_debug(OS_LOG_DEFAULT, "shift: [%6ld, %6d] + %6d -> [%6ld, %6d]\n", static_cast(ga_i), n_past, ib*bd, static_cast(ga_i + ib*bd), n_past + ib*bd); + os_log_debug(OS_LOG_DEFAULT, "div: [%6ld, %6ld] / %6ld -> [%6ld, %6ld]\n", static_cast(ga_i + ib*bd), static_cast(ga_i + ib*bd + ga_w), static_cast(ga_n), static_cast((ga_i + ib*bd)/ga_n), static_cast((ga_i + ib*bd + ga_w)/ga_n)); + os_log_debug(OS_LOG_DEFAULT, "shift: [%6ld, %6d] + %6d -> [%6ld, %6d]\n", static_cast(ga_i + ib*bd + ga_w), n_past + ib*bd, dd, static_cast(ga_i + ib*bd + ga_w + dd), n_past + ib*bd + dd); + + [self.ctx kvCacheSeqAdd:0 p0:ga_i p1:n_past delta:ib*bd]; + [self.ctx kvCacheSeqDiv:0 p0:ga_i + ib*bd p1:ga_i + ib*bd + ga_w delta:ga_n]; + [self.ctx kvCacheSeqAdd:0 p0:ga_i + ib*bd + ga_w p1:n_past + ib*bd delta:dd]; + + n_past -= bd; + + ga_i += ga_w/ga_n; + + os_log_debug(OS_LOG_DEFAULT, "\nn_past_old = %d, n_past = %d, ga_i = %ld\n\n", n_past + bd, n_past, static_cast(ga_i)); + } + } + + // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) + if (n_session_consumed < (int) session_tokens.size()) { + size_t i = 0; + for ( ; i < embd.size(); i++) { + if (embd[i] != session_tokens[n_session_consumed]) { + session_tokens.resize(n_session_consumed); + break; + } + + n_past++; + n_session_consumed++; + + if (n_session_consumed >= (int) session_tokens.size()) { + ++i; + break; + } + } + if (i > 0) { + embd.erase(embd.begin(), embd.begin() + i); + } + } + + for (int i = 0; i < (int) embd.size(); i += params.nBatch) { + int n_eval = (int) embd.size() - i; + if (n_eval > params.nBatch) { + n_eval = params.nBatch; + } + + os_log_debug(OS_LOG_DEFAULT, "eval: %s\n", [self.ctx convertTokensToString:embd].c_str()); + + + if ([self.ctx decode:[[LlamaBatch alloc] initWithBatch:llama_batch_get_one(&embd[i], n_eval, n_past, 
0)] ]) { + [NSException raise:@"EvalFailure" format:@"failed to eval"]; + } + + n_past += n_eval; + + os_log_debug(OS_LOG_DEFAULT, "n_past = %d\n", n_past); + // Display total tokens alongside total time + if (params.nPrint > 0 && n_past % params.nPrint == 0) { + os_log_debug(OS_LOG_DEFAULT, "\n\033[31mTokens consumed so far = %d / %lu \033[0m\n", n_past, static_cast([self.ctx nCtx])); + } + } + + if (!embd.empty() && [pathSession length] > 0) { + session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); + n_session_consumed = session_tokens.size(); + } + } + + embd.clear(); + + if ((int) embd_inp.size() <= n_consumed && !isInteracting) { + // optionally save the session on first sample (for faster prompt loading next time) + if ([pathSession length] > 0 && need_to_save_session && !params.promptCacheRO) { + need_to_save_session = false; + [self.ctx saveStateFile:pathSession tokens:session_tokens.data() nTokenCount:session_tokens.size()]; +// llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + + os_log_debug(OS_LOG_DEFAULT, "saved session to %s\n", [pathSession cStringUsingEncoding:NSUTF8StringEncoding]); + } + + const llama_token idToken = [smpl sample:self.ctx index:-1]; + + [smpl accept:idToken acceptGrammar:true]; + + // os_log_debug(OS_LOG_DEFAULT, "last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); + + embd.push_back(idToken); + + // echo this to console + input_echo = true; + + // decrement remaining sampling budget + --n_remain; + + os_log_debug(OS_LOG_DEFAULT, "n_remain: %d\n", n_remain); + } else { + // some user input remains from prompt or interaction, forward it to processing + os_log_debug(OS_LOG_DEFAULT, "embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + while ((int) embd_inp.size() > n_consumed) { + embd.push_back(embd_inp[n_consumed]); + + // push the prompt in the sampling context in order to apply repetition penalties later + // for the prompt, we don't apply grammar rules + [smpl accept:embd_inp[n_consumed] acceptGrammar:false]; + + ++n_consumed; + if ((int) embd.size() >= params.nBatch) { + break; + } + } + } + + // display text + if (input_echo && display) { +// std::cout<< "DISPLAYING TEXT" << std::endl; + + for (auto idToken : embd) { + NSString *token_str = [self.ctx tokenToPiece:idToken special:params.special]; + + // Console/Stream Output + os_log_info(OS_LOG_DEFAULT, "%s", [token_str cStringUsingEncoding:NSUTF8StringEncoding]); + + // Record Displayed Tokens To Log + // Note: Generated tokens are created one by one hence this check + if (embd.size() > 1) { + // Incoming Requested Tokens + input_tokens.push_back(idToken); + + } else { + // Outgoing Generated Tokens + output_tokens.push_back(idToken); + output_ss << [token_str cStringUsingEncoding:NSUTF8StringEncoding]; + last_output_ss << [token_str cStringUsingEncoding:NSUTF8StringEncoding]; + } + + } + if (!last_output_ss.str().empty()) { +// queue->addOutputLine(last_output_ss.str()); + } + } + + // reset color to default if there is no pending user input + if (input_echo && (int) embd_inp.size() == n_consumed) { + if (!last_output_ss.str().empty()) { +// queue->addOutputLine(last_output_ss.str()); + } +// console::set_display(console::reset); + display = true; + } + + // if not currently processing queued inputs; + if ((int) embd_inp.size() <= n_consumed) { + // check for reverse prompt in the last n_prev tokens + if ([params.antiPrompts count] > 0) { + const int n_prev = 32; + NSString *last_output = [smpl 
previousString:self.ctx n:n_prev]; + + is_antiprompt = false; + // Check if each of the reverse prompts appears at the end of the output. + // If we're not running interactively, the reverse prompt might be tokenized with some following characters + // so we'll compensate for that by widening the search window a bit. + for (NSString *antiprompt in params.antiPrompts) { + size_t extra_padding = params.interactive ? 0 : 2; + size_t search_start_pos = [last_output length] > static_cast([antiprompt length] + extra_padding) + ? [last_output length] - static_cast([antiprompt length] + extra_padding) + : 0; + + // TODO: Check if correct + if ([last_output rangeOfString:antiprompt options:0 range:NSMakeRange(search_start_pos, last_output.length - search_start_pos)].location != NSNotFound) { + if (params.interactive) { + isInteracting = true; + } + is_antiprompt = true; + break; + } + } + + // check for reverse prompt using special tokens + llama_token last_token = [smpl last]; + for (std::vector ids : antiprompt_ids) { + if (ids.size() == 1 && last_token == ids[0]) { + if (params.interactive) { + isInteracting = true; + } + is_antiprompt = true; + break; + } + } + + if (is_antiprompt) { + os_log_debug(OS_LOG_DEFAULT, "found antiprompt: %s\n", [last_output cStringUsingEncoding:NSUTF8StringEncoding]); + } + } + + // deal with end of generation tokens in interactive mode + + if ([self.model tokenIsEOG:[smpl last]]) { + os_log_debug(OS_LOG_DEFAULT, "found an EOG token\n"); + + if (params.interactive) { + if ([[params antiPrompts] count] > 0) { + // tokenize and inject first reverse prompt + + const auto first_antiprompt = [self.ctx tokenize:params.antiPrompts[0] addSpecial:false parseSpecial:true]; + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + is_antiprompt = true; + } + + if (params.enableChatTemplate) { + [self chat_add_and_format:chat_msgs + role:"assistant" + content:assistant_ss.str()]; + } + isInteracting = true; +// LOG("\n"); + } + } + + // if current token is not EOG, we add it to current assistant message + if (params.conversation) { + const auto idToken = [smpl last]; + assistant_ss << [[self.ctx tokenToPiece:idToken special:false] cStringUsingEncoding:NSUTF8StringEncoding]; + } + + if (n_past > 0 && isInteracting) { + os_log_debug(OS_LOG_DEFAULT, "waiting for user input\n"); + + if (params.conversation) { +// osLog_("\n> "); + } + + if (params.inputPrefixBOS) { + os_log_debug(OS_LOG_DEFAULT, "adding input prefix BOS token\n"); + embd_inp.push_back([self.model tokenBOS]); + } + + std::string buffer; + if ([params.inputPrefix length] > 0 && !params.conversation) { + os_log_debug(OS_LOG_DEFAULT, "appending input prefix: '%s'\n", [params.inputPrefix cStringUsingEncoding:NSUTF8StringEncoding]); + os_log_info(OS_LOG_DEFAULT, "%s", [params.inputPrefix cStringUsingEncoding:NSUTF8StringEncoding]); + } + + // color user input only +// console::set_display(console::user_input); + display = params.displayPrompt; + + std::string line; +// bool another_line = true; + static int read_one = 0; +// if (!read_one) { +// do { +// another_line = false;// console::readline(line, params.multiline_input); +// buffer += "What is the weather in New York?";//line; +// } while (another_line); +// read_one++; +// } +// else { + if (!last_output_ss.str().empty()) { + auto str = last_output_ss.str(); + last_output_ss.str(""); + [queue addOutputLine:[NSString stringWithCString:str.c_str() encoding:NSUTF8StringEncoding]]; + } + + buffer = [[queue inputLine] 
cStringUsingEncoding:NSUTF8StringEncoding]; +// do { +// another_line = console::readline(line, params.multiline_input); +// buffer += line; +// } while (another_line); +// } + // done taking input, reset color +// console::set_display(console::reset); + display = true; + + // Add tokens to embd only if the input buffer is non-empty + // Entering a empty line lets the user pass control back + if (buffer.length() > 1) { + // append input suffix if any + if ([params.inputSuffix length] > 0 && !params.conversation) { + os_log_debug(OS_LOG_DEFAULT, "appending input suffix: '%s'\n", [params.inputSuffix cStringUsingEncoding:NSUTF8StringEncoding]); + os_log_info(OS_LOG_DEFAULT, "%s", [params.inputSuffix cStringUsingEncoding:NSUTF8StringEncoding]); + } + + os_log_debug(OS_LOG_DEFAULT, "buffer: '%s'\n", buffer.c_str()); + + const size_t original_size = embd_inp.size(); + + if (params.escapeSequences) { + string_process_escapes(buffer); + } + + bool format_chat = params.conversation && params.enableChatTemplate; + std::string user_inp = format_chat + ? [[self chat_add_and_format:chat_msgs role:"user" content:std::move(buffer)] cStringUsingEncoding:NSUTF8StringEncoding] + : std::move(buffer); + // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) + const auto line_pfx = [self.ctx tokenize:params.inputPrefix addSpecial:false parseSpecial:true]; + const auto line_inp = [self.ctx tokenize:[NSString stringWithCString:user_inp.c_str() + encoding:NSUTF8StringEncoding] + addSpecial:false + parseSpecial:format_chat]; + const auto line_sfx = [self.ctx tokenize:params.inputSuffix + addSpecial:false + parseSpecial:true]; + + os_log_debug(OS_LOG_DEFAULT, "input tokens: %s\n", [self.ctx convertTokensToString:line_inp].c_str()); + + // if user stop generation mid-way, we must add EOT to finish model's last response + if (need_insert_eot && format_chat) { + llama_token eot = [self.model tokenEOT]; + embd_inp.push_back(eot == -1 ? [self.model tokenEOS] : eot); + need_insert_eot = false; + } + + embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end()); + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); + + for (size_t i = original_size; i < embd_inp.size(); ++i) { + const llama_token token = embd_inp[i]; + output_tokens.push_back(token); + output_ss << [[self.ctx tokenToPiece:token] cStringUsingEncoding:NSUTF8StringEncoding]; + } + + // reset assistant message + assistant_ss.str(""); + + n_remain -= line_inp.size(); + os_log_debug(OS_LOG_DEFAULT, "n_remain: %d\n", n_remain); + } else { + os_log_debug(OS_LOG_DEFAULT, "empty line, passing control back\n"); + } + + input_echo = false; // do not echo this again + } + + if (n_past > 0) { + if (isInteracting) { + [smpl reset]; + } + isInteracting = false; + } + } + + // end of generation + if (!embd.empty() && [self.model tokenIsEOG:embd.back()] && !(params.interactive)) { + os_log_info(OS_LOG_DEFAULT, " [end of text]\n"); + break; + } + + // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. + // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size). 
+ if (params.interactive && n_remain <= 0 && params.nPredict >= 0) { + n_remain = params.nPredict; + isInteracting = true; + } + } +} + +@end diff --git a/objc/include/GPTParams.h b/objc/include/GPTParams.h new file mode 100644 index 000000000..a5073e4e5 --- /dev/null +++ b/objc/include/GPTParams.h @@ -0,0 +1,264 @@ +#ifndef GPTParams_h +#define GPTParams_h + +@class LlamaModelParams; +@class LlamaContextParams; +@class GGMLThreadpool; + +// Define the ggml_sched_priority enum +typedef NS_ENUM(NSInteger, GGMLSchedPriority) { + GGMLSchedPriorityNormal = 0, // Normal priority + GGMLSchedPriorityMedium = 1, // Medium priority + GGMLSchedPriorityHigh = 2, // High priority + GGMLSchedPriorityRealtime = 3 // Realtime priority +}; + +@interface GGMLThreadpoolParams : NSObject + +@property (nonatomic, assign) int nThreads; +@property (nonatomic, assign) GGMLSchedPriority priority; +@property (nonatomic, assign) uint32_t poll; +@property (nonatomic, assign) BOOL strictCPU; +@property (nonatomic, assign) BOOL paused; + +// Custom access methods for the cpumask array +- (BOOL)getCpuMaskAtIndex:(NSUInteger)index; +- (void)setCpuMask:(BOOL)value atIndex:(NSUInteger)index; +- (GGMLThreadpool *)threadpool; + +@end + +@interface GGMLThreadpool : NSObject +@end + +@interface CPUParams : NSObject + +// Properties +@property (nonatomic, assign) int nThreads; +@property (nonatomic, assign) BOOL maskValid; +@property (nonatomic, assign) GGMLSchedPriority priority; +@property (nonatomic, assign) BOOL strictCPU; +@property (nonatomic, assign) uint32_t poll; + +// Custom methods to access or manipulate the cpumask array +- (BOOL)getCpuMaskAtIndex:(NSUInteger)index; +- (void)setCpuMask:(BOOL)value atIndex:(NSUInteger)index; +- (GGMLThreadpoolParams *)ggmlThreadpoolParams; + +@end + +@interface GPTSamplerParams : NSObject + +// Properties corresponding to C++ struct fields +@property (nonatomic, assign) uint32_t seed; +@property (nonatomic, assign) int32_t nPrev; +@property (nonatomic, assign) int32_t nProbs; +@property (nonatomic, assign) int32_t minKeep; +@property (nonatomic, assign) int32_t topK; +@property (nonatomic, assign) float topP; +@property (nonatomic, assign) float minP; +@property (nonatomic, assign) float tfsZ; +@property (nonatomic, assign) float typP; +@property (nonatomic, assign) float temp; +@property (nonatomic, assign) float dynatempRange; +@property (nonatomic, assign) float dynatempExponent; +@property (nonatomic, assign) int32_t penaltyLastN; +@property (nonatomic, assign) float penaltyRepeat; +@property (nonatomic, assign) float penaltyFreq; +@property (nonatomic, assign) float penaltyPresent; +@property (nonatomic, assign) int32_t mirostat; +@property (nonatomic, assign) float mirostatTau; +@property (nonatomic, assign) float mirostatEta; +@property (nonatomic, assign) BOOL penalizeNl; +@property (nonatomic, assign) BOOL ignoreEos; +@property (nonatomic, assign) BOOL noPerf; + +// Arrays and Strings +@property (nonatomic, strong) NSArray *samplers; // Samplers mapped to NSArray of NSNumber (for enums) +@property (nonatomic, copy) NSString *grammar; // Grammar as NSString +@property (nonatomic, strong) NSArray *logitBias; // Logit biases mapped to NSArray of NSNumber + +// Method to print the parameters into a string +- (NSString *)print; + +@end + +@interface GPTParams : NSObject + +@property (nonatomic, assign) int32_t nPredict; +@property (nonatomic, assign) NSInteger nCtx; +@property (nonatomic, assign) int32_t nBatch; +@property (nonatomic, assign) int32_t nUBatch; +@property 
(nonatomic, assign) int32_t nKeep; +@property (nonatomic, assign) int32_t nDraft; +@property (nonatomic, assign) int32_t nChunks; +@property (nonatomic, assign) int32_t nParallel; +@property (nonatomic, assign) int32_t nSequences; +@property (nonatomic, assign) float pSplit; +@property (nonatomic, assign) int32_t nGpuLayers; +@property (nonatomic, assign) int32_t nGpuLayersDraft; +@property (nonatomic, assign) int32_t mainGpu; +@property (nonatomic, strong) NSMutableArray *tensorSplit; // Fixed-size array, stays the same +@property (nonatomic, assign) int32_t grpAttnN; +@property (nonatomic, assign) int32_t grpAttnW; +@property (nonatomic, assign) int32_t nPrint; +@property (nonatomic, assign) float ropeFreqBase; +@property (nonatomic, assign) float ropeFreqScale; +@property (nonatomic, assign) float yarnExtFactor; +@property (nonatomic, assign) float yarnAttnFactor; +@property (nonatomic, assign) float yarnBetaFast; +@property (nonatomic, assign) float yarnBetaSlow; +@property (nonatomic, assign) int32_t yarnOrigCtx; +@property (nonatomic, assign) float defragThold; + +// You need to replace your C++ struct "cpu_params" with an Objective-C class or struct accordingly +@property (nonatomic, strong) CPUParams *cpuParams; +@property (nonatomic, strong) CPUParams *cpuParamsBatch; +@property (nonatomic, strong) CPUParams *draftCpuParams; +@property (nonatomic, strong) CPUParams *draftCpuParamsBatch; + +// Callbacks (assuming they are blocks in Objective-C) +@property (nonatomic, copy) void (^cbEval)(void *); +@property (nonatomic, assign) void *cbEvalUserData; + +@property (nonatomic, assign) NSInteger numaStrategy; // Enumerations + +@property (nonatomic, assign) NSInteger splitMode; +@property (nonatomic, assign) NSInteger ropeScalingType; +@property (nonatomic, assign) NSInteger poolingType; +@property (nonatomic, assign) NSInteger attentionType; + +// Sampler parameters would also be converted to an Objective-C object +@property (nonatomic, strong) GPTSamplerParams *samplerParams; + +@property (nonatomic, copy) NSString *modelPath; +@property (nonatomic, copy) NSString *modelDraft; +@property (nonatomic, copy) NSString *modelAlias; +@property (nonatomic, copy) NSString *modelURL; +@property (nonatomic, copy) NSString *hfToken; +@property (nonatomic, copy) NSString *hfRepo; +@property (nonatomic, copy) NSString *hfFile; +@property (nonatomic, copy) NSString *prompt; +@property (nonatomic, copy) NSString *promptFile; +@property (nonatomic, copy) NSString *pathPromptCache; +@property (nonatomic, copy) NSString *inputPrefix; +@property (nonatomic, copy) NSString *inputSuffix; +@property (nonatomic, copy) NSString *logdir; +@property (nonatomic, copy) NSString *lookupCacheStatic; +@property (nonatomic, copy) NSString *lookupCacheDynamic; +@property (nonatomic, copy) NSString *logitsFile; +@property (nonatomic, copy) NSString *rpcServers; + +// Arrays in Objective-C are represented with `NSArray` +@property (nonatomic, strong) NSArray *inputFiles; +@property (nonatomic, strong) NSArray *antiPrompts; +@property (nonatomic, strong) NSArray *kvOverrides; + +// Boolean values (in Objective-C, use `BOOL`) +@property (nonatomic, assign) BOOL loraInitWithoutApply; +@property (nonatomic, strong) NSArray *loraAdapters; +@property (nonatomic, strong) NSArray *controlVectors; + +// Control params +@property (nonatomic, assign) int32_t verbosity; +@property (nonatomic, assign) int32_t controlVectorLayerStart; +@property (nonatomic, assign) int32_t controlVectorLayerEnd; + +// Performance and configuration 
params +@property (nonatomic, assign) int32_t pplStride; +@property (nonatomic, assign) int32_t pplOutputType; + +@property (nonatomic, assign) BOOL hellaswag; +@property (nonatomic, assign) size_t hellaswagTasks; +@property (nonatomic, assign) BOOL winogrande; +@property (nonatomic, assign) size_t winograndeTasks; +@property (nonatomic, assign) BOOL multipleChoice; +@property (nonatomic, assign) size_t multipleChoiceTasks; +@property (nonatomic, assign) BOOL klDivergence; + +@property (nonatomic, assign) BOOL usage; +@property (nonatomic, assign) BOOL useColor; +@property (nonatomic, assign) BOOL special; +@property (nonatomic, assign) BOOL interactive; +@property (nonatomic, assign) BOOL interactiveFirst; +@property (nonatomic, assign) BOOL conversation; +@property (nonatomic, assign) BOOL promptCacheAll; +@property (nonatomic, assign) BOOL promptCacheRO; + +@property (nonatomic, assign) BOOL escapeSequences; +@property (nonatomic, assign) BOOL multilineInput; +@property (nonatomic, assign) BOOL simpleIO; +@property (nonatomic, assign) BOOL continuousBatching; +@property (nonatomic, assign) BOOL flashAttention; +@property (nonatomic, assign) BOOL noPerformanceMetrics; +@property (nonatomic, assign) BOOL contextShift; + +// Server and I/O settings +@property (nonatomic, assign) int32_t port; +@property (nonatomic, assign) int32_t timeoutRead; +@property (nonatomic, assign) int32_t timeoutWrite; +@property (nonatomic, assign) int32_t httpThreads; + +@property (nonatomic, copy) NSString *hostname; +@property (nonatomic, copy) NSString *publicPath; +@property (nonatomic, copy) NSString *chatTemplate; +@property (nonatomic, copy) NSString *systemPrompt; +@property (nonatomic, assign) BOOL enableChatTemplate; + +@property (nonatomic, strong) NSArray *apiKeys; + +@property (nonatomic, copy) NSString *sslFileKey; +@property (nonatomic, copy) NSString *sslFileCert; + +@property (nonatomic, assign) BOOL endpointSlots; +@property (nonatomic, assign) BOOL endpointMetrics; +@property (nonatomic, assign) BOOL logJSON; + +@property (nonatomic, copy) NSString *slotSavePath; +@property (nonatomic, assign) float slotPromptSimilarity; + +// batched-bench params +@property (nonatomic, assign) BOOL isPPShared; +@property (nonatomic, strong) NSArray *nPP; +@property (nonatomic, strong) NSArray *nTG; +@property (nonatomic, strong) NSArray *nPL; + +// retrieval params +@property (nonatomic, strong) NSArray *contextFiles; +@property (nonatomic, assign) int32_t chunkSize; +@property (nonatomic, copy) NSString *chunkSeparator; + +// passkey params +@property (nonatomic, assign) int32_t nJunk; +@property (nonatomic, assign) int32_t iPos; + +// imatrix params +@property (nonatomic, copy) NSString *outFile; +@property (nonatomic, assign) int32_t nOutFreq; +@property (nonatomic, assign) int32_t nSaveFreq; +@property (nonatomic, assign) int32_t iChunk; +@property (nonatomic, assign) BOOL processOutput; +@property (nonatomic, assign) BOOL computePPL; + +// cvector-generator params +@property (nonatomic, assign) int nPCABatch; +@property (nonatomic, assign) int nPCAIterations; +@property (nonatomic, assign) int cvectorDimreMethod; +@property (nonatomic, copy) NSString *cvectorOutfile; +@property (nonatomic, copy) NSString *cvectorPositiveFile; +@property (nonatomic, copy) NSString *cvectorNegativeFile; + +@property (nonatomic, assign) BOOL spmInfill; +@property (nonatomic, copy) NSString *loraOutfile; +@property (nonatomic, assign) BOOL embedding; +@property (nonatomic, assign) BOOL verbosePrompt; // print prompt tokens 
before generation +@property (nonatomic, assign) BOOL batchedBenchOutputJSONL; +@property (nonatomic, assign) BOOL inputPrefixBOS; // prefix BOS to user inputs, preceding input_prefix +@property (nonatomic, assign) BOOL ctxShift; // context shift on inifinite text generation +@property (nonatomic, assign) BOOL displayPrompt; // print prompt before generation +- (LlamaModelParams *)llamaModelParams; +- (LlamaContextParams *)llamaContextParams; + +@end + +#endif /* GPTParams_h */ diff --git a/objc/include/GPTParams_Private.hpp b/objc/include/GPTParams_Private.hpp new file mode 100644 index 000000000..0ba1e5666 --- /dev/null +++ b/objc/include/GPTParams_Private.hpp @@ -0,0 +1,25 @@ +#ifndef GPTParams_Private_hpp +#define GPTParams_Private_hpp + +#import "GPTParams.h" +#import "ggml.h" +#import "../../common/common.h" + +@interface GGMLThreadpool() + +- (ggml_threadpool *)threadpool; + +@end + +@interface GPTParams() + +- (gpt_params&)params; + +@end + +@interface GPTSamplerParams() + +- (gpt_sampler_params&)cParams; + +@end +#endif /* GPTParams_Private_hpp */ diff --git a/objc/include/GPTSampler.h b/objc/include/GPTSampler.h new file mode 100644 index 000000000..317ae6cda --- /dev/null +++ b/objc/include/GPTSampler.h @@ -0,0 +1,55 @@ +#ifndef GPTSampler_h +#define GPTSampler_h + +@class LlamaModel; +@class GPTSamplerParams; +@class LlamaContext; +typedef int32_t LlamaToken; + +@interface GPTSampler : NSObject + +- (instancetype)init:(LlamaModel *)model gptSamplerParams:(GPTSamplerParams *)gptSamplerParams; +- (uint32_t)seed; + +// extended sampling implementation: +// +// - set logits +// - apply the configured sampler chain +// - check if the token fits the grammar (if any) +// - if not: resample by first applying the grammar constraints and then sampling again (slower path) +// +// if grammar_first is true, the grammar is applied before the samplers (slower) +// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar +// +- (LlamaToken)sample:(LlamaContext *)context + index:(NSInteger) index; + +// extended sampling implementation: +// +// - set logits +// - apply the configured sampler chain +// - check if the token fits the grammar (if any) +// - if not: resample by first applying the grammar constraints and then sampling again (slower path) +// +// if grammar_first is true, the grammar is applied before the samplers (slower) +// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar +// +- (LlamaToken)sample:(LlamaContext *)context + index:(NSInteger) index + grammarFirst:(BOOL)grammarFirst; + +// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar +- (void)accept:(LlamaToken)token + acceptGrammar:(BOOL)acceptGrammar; + +// get a string representation of the last accepted tokens +- (NSString *)previousString:(LlamaContext *)context n:(NSInteger)n; + +// get the last accepted token +- (LlamaToken)last; + +- (void)reset; + +@end + +#endif /* GPTSampler_h */ diff --git a/objc/include/LlamaBatch.h b/objc/include/LlamaBatch.h new file mode 100644 index 000000000..f5354ba1e --- /dev/null +++ b/objc/include/LlamaBatch.h @@ -0,0 +1,34 @@ +#ifndef LlamaBatch_h +#define LlamaBatch_h + +typedef NSInteger LlamaSequenceId; +typedef NSInteger LlamaPosition; +typedef int32_t LlamaToken; + +// Input data for llama_decode +// A llama_batch object can contain input about one or many sequences +// The provided arrays (i.e. token, embd, pos, etc.) 
must have size of n_tokens +// +// - token : the token ids of the input (used when embd is NULL) +// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL) +// - pos : the positions of the respective token in the sequence +// - seq_id : the sequence to which the respective token belongs +// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output +@interface LlamaBatch : NSObject + +@property (nonatomic, assign) NSInteger nTokens; +@property (nonatomic, assign) LlamaToken *tokens; +@property (nonatomic, assign) float *embd; +@property (nonatomic, assign) LlamaPosition *pos; +@property (nonatomic, assign) int32_t *nSeqId; +@property (nonatomic, assign) LlamaSequenceId **seqId; +@property (nonatomic, assign) NSData *output; + +// Helpers for smooth API transition (optional usage in the interface) +@property (nonatomic, assign) LlamaPosition allPos0; +@property (nonatomic, assign) LlamaPosition allPos1; +@property (nonatomic, assign) LlamaSequenceId allSeqId; + +@end + +#endif /* LlamaBatch_h */ diff --git a/objc/include/LlamaBatch_Private.hpp b/objc/include/LlamaBatch_Private.hpp new file mode 100644 index 000000000..fe7f9fbc5 --- /dev/null +++ b/objc/include/LlamaBatch_Private.hpp @@ -0,0 +1,13 @@ +#ifndef LlamaBatch_Private_hpp +#define LlamaBatch_Private_hpp +#import "LlamaBatch.h" +#import "llama.h" + +@interface LlamaBatch() + +- (instancetype)initWithBatch:(llama_batch)batch; +- (llama_batch)cBatch; + +@end + +#endif /* LlamaBatch_Private_hpp */ diff --git a/objc/include/LlamaContext.h b/objc/include/LlamaContext.h new file mode 100644 index 000000000..4568820ec --- /dev/null +++ b/objc/include/LlamaContext.h @@ -0,0 +1,57 @@ +#ifndef LlamaContext_h +#define LlamaContext_h + +@class GGMLThreadpool; +@class LlamaBatch; + +typedef NSInteger LlamaSequenceId; +typedef NSInteger LlamaPosition; +typedef int32_t LlamaToken; + +@interface LlamaContext : NSObject + +- (void)attachThreadpool:(GGMLThreadpool *)threadpool + threadpoolBatch:(GGMLThreadpool *)threadpoolBatch; + +- (NSUInteger)nCtx; + +// Positive return values does not mean a fatal error, but rather a warning. 
+// 0 - success +// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) +// < 0 - error +- (NSInteger)decode:(LlamaBatch *)batch; + +// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) +// If the KV cache is RoPEd, the KV data is updated accordingly: +// - lazily on next llama_decode() +// - explicitly with llama_kv_cache_update() +// p0 < 0 : [0, p1] +// p1 < 0 : [p0, inf) +- (void)kvCacheSeqAdd:(LlamaSequenceId)sequenceId + p0:(LlamaPosition)p0 + p1:(LlamaPosition)p1 + delta:(LlamaPosition)delta; + +// Integer division of the positions by factor of `d > 1` +// If the KV cache is RoPEd, the KV data is updated accordingly: +// - lazily on next llama_decode() +// - explicitly with llama_kv_cache_update() +// p0 < 0 : [0, p1] +// p1 < 0 : [p0, inf) +- (void)kvCacheSeqDiv:(LlamaSequenceId)sequenceId + p0:(LlamaPosition)p0 + p1:(LlamaPosition)p1 + delta:(LlamaPosition)delta; + +// converts a token into a text piece, optionally renders special/control tokens +// should work similar to Python's `tokenizer.id_to_piece` +- (NSString *)tokenToPiece:(LlamaToken)token; +- (NSString *)tokenToPiece:(LlamaToken)token special:(BOOL)special; + +- (BOOL)saveStateFile:(NSString *)pathSession + tokens:(const LlamaToken *)tokens + nTokenCount:(size_t)nTokenCount; + +@end + +#endif /* LlamaContext_h */ diff --git a/objc/include/LlamaContext_Private.hpp b/objc/include/LlamaContext_Private.hpp new file mode 100644 index 000000000..4a36e2f27 --- /dev/null +++ b/objc/include/LlamaContext_Private.hpp @@ -0,0 +1,28 @@ +#ifndef LlamaContext_Private_hpp +#define LlamaContext_Private_hpp + +#import "LlamaContext.h" +#import "../../common/common.h" + +@interface LlamaContext() + +- (instancetype)initWithContext:(llama_context *)context; + +- (std::vector<llama_token>)tokenize:(NSString *)text + addSpecial:(BOOL)addSpecial + parseSpecial:(BOOL)parseSpecial; + +- (BOOL)loadStateFile:(NSString *)pathSession + tokensOut:(llama_token *)tokensOut + nTokenCpacity:(size_t)nTokenCapacity + nTokenCountOut:(size_t *)nTokenCountOut; + +- (std::string)convertTokensToString:(const std::vector<llama_token>&)tokens; + +- (llama_context *)cContext; + +- (int32_t)encode:(llama_batch)batch; + +@end + +#endif /* LlamaContext_Private_hpp */ diff --git a/objc/include/LlamaModel.h b/objc/include/LlamaModel.h new file mode 100644 index 000000000..2d7e7fe87 --- /dev/null +++ b/objc/include/LlamaModel.h @@ -0,0 +1,35 @@ +#ifndef LlamaModel_h +#define LlamaModel_h + +@class GPTParams; +@class GGMLThreadpool; +@class LlamaContext; + +typedef int32_t LlamaToken; + +@interface LlamaChatMessage : NSObject + +@property (nonatomic, copy) NSString *role; +@property (nonatomic, copy) NSString *content; + +@end + +@interface LlamaContextParams : NSObject +@end + +@interface LlamaModel : NSObject + +- (LlamaContext *)context:(LlamaContextParams *)params; +- (LlamaToken)tokenBOS; +- (LlamaToken)tokenEOT; +- (LlamaToken)tokenEOS; +- (BOOL)tokenIsEOG:(LlamaToken)token; +- (int32_t)nCtxTrain; +- (BOOL)addBOSToken; +- (BOOL)addEOSToken; +- (BOOL)hasEncoder; +- (NSString *)formatExample:(NSString *)tmpl; + +@end + +#endif /* LlamaModel_h */ diff --git a/objc/include/LlamaModel_Private.hpp b/objc/include/LlamaModel_Private.hpp new file mode 100644 index 000000000..729b53464 --- /dev/null +++ b/objc/include/LlamaModel_Private.hpp @@ -0,0 +1,15 @@ +#ifndef LlamaModel_Private_hpp +#define LlamaModel_Private_hpp + +#import "LlamaModel.h" +#import "llama.h" + +@interface
LlamaModel() + +- (instancetype)init:(llama_model *)model; + +- (llama_model *)cModel; + +@end + +#endif /* LlamaModel_Private_hpp */ diff --git a/objc/include/LlamaObjC.h b/objc/include/LlamaObjC.h new file mode 100644 index 000000000..13eafa97c --- /dev/null +++ b/objc/include/LlamaObjC.h @@ -0,0 +1,13 @@ +#ifndef LlamaObjC_h +#define LlamaObjC_h + +#include +#include +#include +#include +#include +#include + + + +#endif /* LlamaObjC_h */ diff --git a/objc/include/LlamaSession.h b/objc/include/LlamaSession.h new file mode 100644 index 000000000..45d2c5eea --- /dev/null +++ b/objc/include/LlamaSession.h @@ -0,0 +1,27 @@ +#ifndef LlamaSession_h +#define LlamaSession_h + +@class GPTParams; +@class LlamaModel; +@class LlamaContext; + +@interface BlockingLineQueue : NSObject + +- (void)addInputLine:(NSString *)line; +- (NSString *)inputLine; +- (void)addOutputLine:(NSString *)line; +- (NSString *)outputLine; + +@end + +@interface LlamaSession : NSObject + +@property (nonatomic, strong) LlamaModel *model; +@property (nonatomic, strong) LlamaContext *ctx; + +- (instancetype)initWithParams:(GPTParams *)params; +- (void)start:(BlockingLineQueue *)queue; + +@end + +#endif /* Header_h */ diff --git a/objc/include/LlamaSession_Private.hpp b/objc/include/LlamaSession_Private.hpp new file mode 100644 index 000000000..7e3b0243f --- /dev/null +++ b/objc/include/LlamaSession_Private.hpp @@ -0,0 +1,10 @@ +#ifndef LlamaSession_Private_hpp +#define LlamaSession_Private_hpp + +#import "LlamaSession.h" + +@interface LlamaSession() + +@end + +#endif /* LlamaSession_Private_hpp */ diff --git a/objc/include/ggml-metal.h b/objc/include/ggml-metal.h new file mode 120000 index 000000000..ee773a742 --- /dev/null +++ b/objc/include/ggml-metal.h @@ -0,0 +1 @@ +../../ggml/include/ggml-metal.h \ No newline at end of file diff --git a/swift/JSONSchema/Grammar.swift b/swift/JSONSchema/Grammar.swift new file mode 100644 index 000000000..daa2e3dbe --- /dev/null +++ b/swift/JSONSchema/Grammar.swift @@ -0,0 +1,102 @@ +import Foundation +import RegexBuilder + +let SPACE_RULE = "\" \"?" + +let PRIMITIVE_RULES: [String: String] = [ + "boolean": "(\"true\" | \"false\") space", + "number": "\"-\"? ([0-9] | [1-9] [0-9]*) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space", + "integer": "\"-\"? ([0-9] | [1-9] [0-9]*) space", + "string": "\"\\\"\" ([^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]))* \"\\\"\" space", + "null": "\"null\" space", +] + +let INVALID_RULE_CHARS_RE = try! NSRegularExpression(pattern: "[^a-zA-Z0-9-]+") +let GRAMMAR_LITERAL_ESCAPE_RE = try! 
NSRegularExpression(pattern: "[\r\n\"]") +let GRAMMAR_LITERAL_ESCAPES: [String: String] = ["\r": "\\r", "\n": "\\n", "\"": "\\\""] + +public class SchemaConverter { + private var propOrder: [String] + private var rules: [String: String] = ["space": SPACE_RULE] + + public init(propOrder: [String]) { + self.propOrder = propOrder + } + + private func formatLiteral(_ literal: Any) -> String { +// let escaped = GRAMMAR_LITERAL_ESCAPES.reduce("\(literal)", { +// let regex = Regex("[\r\n\"]") + let escaped = GRAMMAR_LITERAL_ESCAPES.reduce("\(literal)") { + $0.replacingOccurrences(of: $1.key, with: $1.value) + } + + return "\\\"\(escaped)\\\"" + } + + private func addRule(name: String, rule: String) -> String { + let escName = INVALID_RULE_CHARS_RE.stringByReplacingMatches( + in: name, + options: [], + range: NSRange(location: 0, length: name.count), + withTemplate: "-" + ) + + var key = escName + if let existingRule = rules[escName], existingRule != rule { + var i = 0 + while rules["\(escName)\(i)"] != nil { + i += 1 + } + key = "\(escName)\(i)" + } + + rules[key] = rule + return key + } + + public func visit(schema: [String: Any], name: String?) -> String { + let schemaType = schema["type"] as? String + let ruleName = name ?? "root" + + if let oneOf = schema["oneOf"] as? [[String: Any]] ?? schema["anyOf"] as? [[String: Any]] { + let rule = oneOf.enumerated().map { (i, altSchema) in + visit(schema: altSchema, name: "\(name ?? "")\(name != nil ? "-" : "")\(i)") + }.joined(separator: " | ") + return addRule(name: ruleName, rule: rule) + } else if let constValue = schema["const"] { + return addRule(name: ruleName, rule: formatLiteral(constValue)) + } else if let enumValues = schema["enum"] as? [Any] { + let rule = enumValues.map { "\"\(formatLiteral($0))\"" }.joined(separator: " | ") + return addRule(name: ruleName, rule: rule) + } else if schemaType == "object", let properties = schema["properties"] as? [String: Any] { + let propPairs = properties.sorted { (kv1, kv2) in + let idx1 = propOrder.firstIndex(of: kv1.key) ?? propOrder.count + let idx2 = propOrder.firstIndex(of: kv2.key) ?? propOrder.count + return (idx1, kv1.key) < (idx2, kv2.key) + } + + var rule = "\"{\" space" + for (i, (propName, propSchema)) in propPairs.enumerated() { + let propRuleName = visit(schema: propSchema as! [String : Any], name: "\(name ?? "")\(name != nil ? "-" : "")\(propName)") + if i > 0 { + rule += " \",\" space" + } + rule += " \"\(formatLiteral(propName))\" space \":\" space \(propRuleName)" + } + rule += " \"}\" space" + + return addRule(name: ruleName, rule: rule) + } else if schemaType == "array", let items = schema["items"] { + let itemRuleName = visit(schema: items as! [String : Any], name: "\(name ?? "")\(name != nil ? "-" : "")item") + let rule = "\"[\" space (\(itemRuleName) (\",\" space \(itemRuleName))*)? \"]\" space" + return addRule(name: ruleName, rule: rule) + } else { + assert(PRIMITIVE_RULES.keys.contains(schemaType ?? ""), "Unrecognized schema: \(schema)") + return addRule(name: ruleName == "root" ? "root" : schemaType!, rule: PRIMITIVE_RULES[schemaType!]!) 
+ } + } + + public func formatGrammar() -> String { + return rules.map { (name, rule) in "\(name) ::= \(rule)" }.joined(separator: "\n") + "\n" + } +} diff --git a/swift/JSONSchema/JSONSchema.swift b/swift/JSONSchema/JSONSchema.swift new file mode 100644 index 000000000..69ffceb99 --- /dev/null +++ b/swift/JSONSchema/JSONSchema.swift @@ -0,0 +1,187 @@ +import Foundation +//import SwiftSyntaxMacros + +public struct JSONSchema : Codable { + public struct Items : Codable { + let type: String + let `enum`: [String]? + + public init(type: String, `enum`: [String]?) { + self.type = type + self.enum = `enum` + } + } + public struct Property : Codable { + let type: String + let items: Items? + let description: String? + + public init(type: String, items: Items?, description: String?) { + self.type = type + self.items = items + self.description = description + } + } + let type: String + let items: Items? + let properties: [String : Property]? + + public init(type: String, items: Items?, properties: [String : Property]?) { + self.type = type + self.items = items + self.properties = properties + } +} + + +public struct _JSONFunctionSchema: Codable { + public struct Items: Codable { + let type: String + let `enum`: [String]? + + public init(type: Any.Type, `enum`: [String]?) { + self.type = String(describing: type) + self.enum = `enum` + } + } + + public struct Property: Codable { + let type: String + let items: Items? + let `enum`: [String]? + let description: String? + + public init(type: String.Type, description: String?) { + self.type = "string" + self.description = description + self.items = nil + self.enum = nil + } + + public init(type: T.Type, description: String?) where T: RawRepresentable, + T: StringProtocol { + self.type = "string" + self.enum = Array(type.allCases.map { $0.rawValue as! String }) + self.description = description + self.items = nil + } + } + + + public struct Parameters: Codable { + public let properties: [String: Property] + public let required: [String] + public let type = "object" + + public init(properties: [String : Property], required: [String]) { + self.properties = properties + self.required = required + } + } + + let name: String + let description: String + let parameters: Parameters + + public init(name: String, description: String, parameters: Parameters) { + self.name = name + self.description = description + self.parameters = parameters + } +} + +public protocol JSONSchemaConvertible : Codable { + static var type: String { get } + static var jsonSchema: [String : Any] { get } + static func decode(from container: KeyedDecodingContainer, + forKey key: K) throws -> Self +} + +extension RawRepresentable where Self : CaseIterable, RawValue : JSONSchemaConvertible, Self: Codable { + public static var type: String { + RawValue.type + } + public static var jsonSchema: [String: Any] { + [ + "type": RawValue.type, + "enum": Self.allCases.map(\.rawValue) + ] + } +} + +extension JSONSchemaConvertible { + public static var items: JSONSchema.Items? { + nil + } + public static var properties: [JSONSchema.Property]? { + nil + } + public static var `enum`: [String]? 
{ + nil + } + public static func decode(from container: KeyedDecodingContainer, forKey key: K) throws -> Self { + return try container.decode(Self.self, forKey: key) + } +} +extension String : JSONSchemaConvertible { + public static var type: String { "string" } + public static var jsonSchema: [String: Any] { + [ + "type": "string" + ] + } +} +extension Int : JSONSchemaConvertible { + public static var type: String { "number" } + public static var jsonSchema: [String: Any] { + [ + "type": "integer" + ] + } +} +extension Double : JSONSchemaConvertible { + public static var type: String { "number" } + public static var jsonSchema: [String: Any] { + [ + "type": "number" + ] + } +} +extension Date : JSONSchemaConvertible { + public static var type: String { "string" } + + public static var jsonSchema: [String: Any] { + [ + "type": "string" + ] + } + + public static func decode(from container: KeyedDecodingContainer, forKey key: K) throws -> Self { + let value = try container.decode(String.self, forKey: key) + let detector = try? NSDataDetector(types: NSTextCheckingResult.CheckingType.date.rawValue) + let matches = detector?.matches(in: value, options: [], range: NSMakeRange(0, value.utf16.count)) + return matches!.first!.date! + // return ISO8601DateFormatter().date(from: value)! + } +} + +extension Array : JSONSchemaConvertible where Element : JSONSchemaConvertible { + public static var type: String { "array" } + public static var items: JSONSchema.Items? { + JSONSchema.Items(type: Element.type, enum: Element.enum) + } + public static var jsonSchema: [String : Any] { + [ + "type": "array", + "items": Element.jsonSchema + ] + } +} + +@attached(member, names: arbitrary) +@attached(extension, conformances: JSONSchemaConvertible, CaseIterable, names: arbitrary) +public macro JSONSchema() = #externalMacro(module: "JSONSchemaMacros", + type: "JSONSchemaMacro") + +//@attached(member, names: arbitrary) + diff --git a/swift/JSONSchemaMacros/JSONSchemaMacros.swift b/swift/JSONSchemaMacros/JSONSchemaMacros.swift new file mode 100644 index 000000000..07c166b87 --- /dev/null +++ b/swift/JSONSchemaMacros/JSONSchemaMacros.swift @@ -0,0 +1,229 @@ +import SwiftSyntaxMacros +import SwiftCompilerPlugin +import SwiftSyntax + +private struct MemberView { + let name: String + let type: String + var attributeKey: String? + var assignment: String? +} + +private func view(for member: MemberBlockItemListSyntax.Element) throws -> MemberView? { + guard let decl = member.decl.as(VariableDeclSyntax.self), + let binding = decl.bindings.compactMap({ + $0.pattern.as(IdentifierPatternSyntax.self) + }).first, + let type = decl.bindings.compactMap({ + $0.typeAnnotation?.type + }).first, + !(type.syntaxNodeType is StructDeclSyntax.Type) else { + return nil + } + var memberView = MemberView(name: "\(binding.identifier)", type: "\(type)", attributeKey: nil) + if let macroName = decl.attributes.first?.as(AttributeSyntax.self)? 
+ .arguments?.as(LabeledExprListSyntax.self)?.first?.expression.as(StringLiteralExprSyntax.self) { + memberView.attributeKey = "\(macroName.segments)" + } + if let assignment = decl.bindings.compactMap({ + $0.initializer?.value + }).first { + memberView.assignment = "\(assignment)" + } + return memberView +} + +struct JSONSchemaMacro: ExtensionMacro, MemberMacro { + static func expansion(of node: AttributeSyntax, providingMembersOf declaration: some DeclGroupSyntax, conformingTo protocols: [TypeSyntax], in context: some MacroExpansionContext) throws -> [DeclSyntax] { + let members = try declaration.memberBlock.members.compactMap(view(for:)) + if declaration is EnumDeclSyntax { + return [] + } + return [ + """ + enum CodingKeys: CodingKey { + case \(raw: members.map(\.name).joined(separator: ", ")) + } + """, + """ + init(from decoder: Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + \(raw: members.map { + """ + self.\($0.name) = try \($0.type).decode(from: container, forKey: .\($0.name)) + """ + }.joined(separator: "\n")) + } + """ + ] + } + + static func expansion(of node: SwiftSyntax.AttributeSyntax, + attachedTo declaration: some SwiftSyntax.DeclGroupSyntax, + providingExtensionsOf type: some SwiftSyntax.TypeSyntaxProtocol, + conformingTo protocols: [SwiftSyntax.TypeSyntax], + in context: some SwiftSyntaxMacros.MacroExpansionContext) throws -> [SwiftSyntax.ExtensionDeclSyntax] { + let members = try declaration.memberBlock.members.compactMap(view(for:)) + var inheritedTypes: [InheritedTypeSyntax] = [] + inheritedTypes.append(InheritedTypeSyntax(type: TypeSyntax("JSONSchemaConvertible"))) + if declaration is EnumDeclSyntax { + inheritedTypes.append(InheritedTypeSyntax(type: TypeSyntax(", CaseIterable"))) + } + let properties = members.map { + """ + "\($0.name)": \($0.type).jsonSchema + """ + } + if !(declaration is EnumDeclSyntax) { + return [ + ExtensionDeclSyntax(extendedType: type, + inheritanceClause: .init(inheritedTypes: .init(inheritedTypes)), + memberBlock: """ + { + static var type: String { + "object" + } + static var jsonSchema: [String: Any] { + [ + "type": "object", + "properties": [ + \(raw: properties.joined(separator: ",")) + ] + ] + } + } + """) + ] + } else { + return [ + ExtensionDeclSyntax(extendedType: type, + inheritanceClause: .init(inheritedTypes: .init(inheritedTypes)), + memberBlock: """ + { + public static func decode(from container: KeyedDecodingContainer, forKey key: K) throws -> Self { + if RawValue.self is Int.Type { + return Self(rawValue: Int(try container.decode(String.self, forKey: key)) as! Self.RawValue)! 
+ } else { + return try container.decode(Self.self, forKey: key) + } + } + } + """) + ] + } + } +} + +enum TestError: Error { + case message(String) +} + +struct LlamaActorMacro: ExtensionMacro, MemberMacro { + static func expansion(of node: AttributeSyntax, providingMembersOf declaration: some DeclGroupSyntax, conformingTo protocols: [TypeSyntax], in context: some MacroExpansionContext) throws -> [DeclSyntax] { + [ + """ + let session: LlamaToolSession + + public init(params: GPTParams) async throws { + self.session = try await LlamaToolSession(params: params, tools: Self.tools) + } + """ + ] + } + + static func expansion(of node: AttributeSyntax, + attachedTo declaration: some DeclGroupSyntax, + providingExtensionsOf type: some TypeSyntaxProtocol, + conformingTo protocols: [TypeSyntax], + in context: some MacroExpansionContext) throws -> [ExtensionDeclSyntax] { + var tools: [ + (name: String, + description: String, + parameters: [(name: String, + type: String, + description: String)], + callableString: String, + callableName: String) + ] = [] + for member in declaration.memberBlock.members { + let comments = member.leadingTrivia.filter { $0.isComment } + + guard let member = member.decl.as(FunctionDeclSyntax.self) else { + continue + } + let name = member.name + guard case var .docLineComment(description) = comments.first else { + throw TestError.message("Missing comment") + } + description = String(description.dropFirst(3)) + var parameters: [(name: String, type: String, description: String)] = [] + var index = 0 + for parameter in member.signature.parameterClause.parameters { + let firstName = parameter.firstName.text + let typeName = parameter.type.as(IdentifierTypeSyntax.self)!.name.text + guard case var .docLineComment(description) = comments[index + 1] else { + throw TestError.message("Missing comment for \(firstName)") + } + description = String(description.dropFirst(3)) + parameters.append((name: firstName, type: typeName, description: description)) + index += 1 + } + let callableName = context.makeUniqueName(name.text) + let callableString = """ + @dynamicCallable struct \(callableName.text): DynamicCallable { + @discardableResult + func dynamicallyCall(withKeywordArguments args: [String: Any]) async throws -> String { + \(parameters.map { + "var \($0.name): \($0.type)!" + }.joined(separator: "\n")) + for (key, value) in args { + \(parameters.map { + "if key == \"\($0.name)\" { \($0.name) = value as! \($0.type) }" + }.joined(separator: "\n")) + } + + let returnValue = try await \(name.text)(\(parameters.map { "\($0.name): \($0.name)" }.joined(separator: ","))) + let jsonValue = try JSONEncoder().encode(returnValue) + return String(data: jsonValue, encoding: .utf8)! + } + } + """ + tools.append((name: name.text, description: description, + parameters: parameters, + callableString: callableString, + callableName: callableName.text)) + } + + + return [ + .init(extendedType: type, + inheritanceClause: .init(inheritedTypes: InheritedTypeListSyntax.init(arrayLiteral: .init(type: IdentifierTypeSyntax(name: "LlamaActor")))), + memberBlock: """ + { + \(raw: tools.map { + $0.callableString + }.joined(separator: "\n")) + + static var tools: [String: (DynamicCallable, _JSONFunctionSchema)] { + [\(raw: tools.map { tool in + """ + "\(tool.name)": (\(tool.callableName)(), _JSONFunctionSchema(name: "\(tool.name)", description: "\(tool.description)", parameters: _JSONFunctionSchema.Parameters(properties: \(tool.parameters.count == 0 ? 
"[:]" : "[" + tool.parameters.map { parameter in + """ + "\(parameter.name)": _JSONFunctionSchema.Property(type: \(parameter.type).self, description: "\(parameter.description)"), + """ + }.joined() + "]"), required: []))) + """ + }.joined(separator: ","))] + } + } + """) + ] + } +} + +@main +struct JSONSchemaMacrosPlugin: CompilerPlugin { + let providingMacros: [Macro.Type] = [ + JSONSchemaMacro.self, LlamaActorMacro.self + ] +} diff --git a/swift/LlamaKit/LlamaKit.swift b/swift/LlamaKit/LlamaKit.swift new file mode 100644 index 000000000..0adf20a0b --- /dev/null +++ b/swift/LlamaKit/LlamaKit.swift @@ -0,0 +1,189 @@ +import Foundation +@_exported import JSONSchema +@_exported import LlamaObjC + +public protocol DynamicCallable: Sendable { + @discardableResult + func dynamicallyCall(withKeywordArguments args: [String: Any]) async throws -> String +} + + +struct ToolCall: Decodable { + let id: Int + let name: String + let arguments: [String: String] +} + +struct ToolResponse: Encodable { + let id: Int + let result: T +} + +// MARK: LlamaChatSession +/// Standard chat session for a given LLM. +public actor LlamaChatSession { + private let queue = BlockingLineQueue() + private let session: LlamaObjC.LlamaSession + + public init(params: GPTParams, flush: Bool = true) async throws { + session = LlamaObjC.LlamaSession(params: params) + Task.detached { [session, queue] in + session.start(queue) + } + + // flush + guard flush else { return } + _ = queue.outputLine() + } + + public func chat(message: String) async -> String { + queue.addInputLine(message) + return queue.outputLine() + } +} + +// MARK: LlamaGrammarSession +public actor LlamaSession { + private let session: LlamaChatSession + + public init(params: GPTParams) async throws { + let converter = SchemaConverter(propOrder: []) + _ = converter.visit(schema: T.jsonSchema, name: nil) + params.samplerParams.grammar = converter.formatGrammar() + session = try await LlamaChatSession(params: params) + } + + public func chat(message: String) async throws -> T { + let output = await session.chat(message: message).data(using: .utf8)! + return try JSONDecoder().decode(T.self, from: output) + } +} + +// MARK: LlamaToolSession +public actor LlamaToolSession { + private let session: LlamaChatSession + + private struct GetIpAddress: DynamicCallable { + func dynamicallyCall(withKeywordArguments args: [String : Any]) async throws -> String { + getIPAddress() + } + } + + internal static func getIPAddress() -> String { + var address: String! + + // Get list of all interfaces on the local machine: + var ifaddr: UnsafeMutablePointer? 
= nil + if getifaddrs(&ifaddr) == 0 { + // Loop through linked list of interfaces + var ptr = ifaddr + while ptr != nil { + let interface = ptr!.pointee + + // Check if the interface is IPv4 or IPv6: + let addrFamily = interface.ifa_addr.pointee.sa_family + if addrFamily == UInt8(AF_INET) || addrFamily == UInt8(AF_INET6) { + + // Convert interface name to String: + let name = String(cString: interface.ifa_name) + + // Only consider non-loopback interfaces (e.g., "en0" for Wi-Fi) + if name == "en0" { // Typically en0 is the Wi-Fi interface + // Convert the address to a readable format: + var hostname = [CChar](repeating: 0, count: Int(NI_MAXHOST)) + if getnameinfo(interface.ifa_addr, socklen_t(interface.ifa_addr.pointee.sa_len), + &hostname, socklen_t(hostname.count), + nil, socklen_t(0), NI_NUMERICHOST) == 0 { + address = String(cString: hostname) + } + } + } + + ptr = interface.ifa_next + } + + freeifaddrs(ifaddr) + } + + return address + } + + public private(set) var tools: [String: (DynamicCallable, _JSONFunctionSchema)] + + public init(params: GPTParams, + tools: [String: (DynamicCallable, _JSONFunctionSchema)]) async throws { + self.tools = tools + let ipFnSchema = _JSONFunctionSchema(name: "getIpAddress", description: "Get the IP Address for this system", parameters: _JSONFunctionSchema.Parameters(properties: [:], required: [])) + self.tools["getIpAddress"] = (GetIpAddress(), ipFnSchema) + let encoded = try JSONEncoder().encode(self.tools.values.map(\.1)) + let prompt = """ + You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows: + <tool_call> + {"name": <function-name>,"arguments": <args-dict>} + </tool_call> + + Here are the available tools: + <tools> \(String(data: encoded, encoding: .utf8)!) </tools> <|eot_id|> + """ + params.prompt = prompt + params.interactive = true + params.antiPrompts.append("<|eot_id|>"); + params.inputPrefix = "<|start_header_id|>user<|end_header_id|>"; + params.inputSuffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"; + session = try await LlamaChatSession(params: params, flush: false) + let fn = await session.chat(message: "What is my IP address?") + let toolCall = try JSONDecoder().decode(ToolCall.self, from: fn.data(using: .utf8)!) + guard let tool = self.tools[toolCall.name] else { + fatalError() + } + let resp = try await tool.0.dynamicallyCall(withKeywordArguments: toolCall.arguments) + print(resp) + + let output = await session.chat(message: """ + <tool_response> + {"id": \(toolCall.id), result: \(resp)} + </tool_response> + """) + print(output) + } + + public func chat(message: String) async throws -> String { + var nxt = await session.chat(message: message) + let fn = nxt + // try to see if the output is a function call + do { + let toolCall = try JSONDecoder().decode(ToolCall.self, from: fn.data(using: .utf8)!)
+ guard let tool = tools[toolCall.name] else { + fatalError() + } + let callable = tool.0 + let resp = try await callable.dynamicallyCall(withKeywordArguments: toolCall.arguments) + print("tool response: \(resp)") + nxt = await session.chat(message: """ + <tool_response> + {"id": \(toolCall.id), result: \(resp)} + </tool_response> + """) + print(nxt) + } catch { + print(error) + } + return nxt + } +} + +public protocol LlamaActor: Actor { + static var tools: [String: (DynamicCallable, _JSONFunctionSchema)] { get } + var session: LlamaToolSession { get } +} + +public extension LlamaActor { + func chat(_ message: String) async throws -> String { + try await session.chat(message: message) + } +} + +@attached(member, names: arbitrary) +@attached(extension, conformances: LlamaActor, names: arbitrary) +public macro llamaActor() = #externalMacro(module: "JSONSchemaMacros", + type: "LlamaActorMacro") diff --git a/swift/main/main.swift b/swift/main/main.swift new file mode 100644 index 000000000..9e58d9190 --- /dev/null +++ b/swift/main/main.swift @@ -0,0 +1,76 @@ +import LlamaKit +import WeatherKit +import CoreLocation + +@llamaActor actor MyLlama { + struct CurrentWeather: Codable { + let temperature: Double + let condition: WeatherCondition + } + + /// Get the current weather in a given location. + /// - parameter location: The city and state, e.g. San Francisco, CA + /// - parameter unit: The unit of temperature + public static func getCurrentWeather(location: String, unit: String) async throws -> CurrentWeather { + let weather = try await WeatherService().weather(for: CLGeocoder().geocodeAddressString(location)[0].location!) + var temperature = weather.currentWeather.temperature + temperature.convert(to: .fahrenheit) + return CurrentWeather(temperature: temperature.value, + condition: weather.currentWeather.condition) + } +} + +func downloadFile() async throws -> String { + let fm = FileManager.default + let tmpDir = fm.temporaryDirectory + let destinationURL = tmpDir.appending(path: "llama_groq_gguf.gguf") + + guard !fm.fileExists(atPath: destinationURL.path()) else { + return destinationURL.path() + } + print("Downloading Llama Tools, this may take a while...") + // Define the URL + guard let url = URL(string: "https://huggingface.co/bartowski/Llama-3-Groq-8B-Tool-Use-GGUF/resolve/main/Llama-3-Groq-8B-Tool-Use-Q5_K_M.gguf?download=true") else { + print("Invalid URL.") + throw URLError(.badURL) + } + + // Start the async download + let (tempURL, _) = try await URLSession.shared.download(from: url) + + // Define the destination path in the documents directory + + + // Move the downloaded file to the destination + try fm.moveItem(at: tempURL, to: destinationURL) + print("File downloaded to: \(destinationURL.path())") + return destinationURL.path() +} + +let params = GPTParams() +params.modelPath = try await downloadFile() +params.nPredict = 512 +params.nCtx = 4096 +params.cpuParams.nThreads = 8 +params.cpuParamsBatch.nThreads = 8 +params.nBatch = 1024 +params.nGpuLayers = 1024 +let llama = try await MyLlama(params: params) + +while true { + print("Enter input: ", terminator: "") + + // Read user input + if let userInput = readLine() { + if userInput.lowercased() == "exit" { + print("Exiting the loop.") + break + } else { + print("🧔🏽‍♂️: \(userInput)") + let response = try await llama.chat(userInput) + print("🤖: \(response)") + } + } else { + print("Failed to read input.") + } +} diff --git a/swift/test/LlamaKitTests.swift b/swift/test/LlamaKitTests.swift new file mode 100644 index 000000000..a0361dffe --- /dev/null +++
b/swift/test/LlamaKitTests.swift @@ -0,0 +1,140 @@ +import Foundation +import Testing +@testable import LlamaKit +import JSONSchema + +// MARK: LlamaGrammarSession Suite +@Suite("LlamaGrammarSession Suite") +struct LlamaGrammarSessionSuite { + @JSONSchema struct Trip { + let location: String + let startDate: TimeInterval + let durationInDays: Int + } + + func downloadFile() async throws -> String { + let fm = FileManager.default + let tmpDir = fm.temporaryDirectory + let destinationURL = tmpDir.appending(path: "tinyllama.gguf") + + guard !fm.fileExists(atPath: destinationURL.path()) else { + return destinationURL.path() + } + print("Downloading TinyLlama, this may take a while...") + // Define the URL + guard let url = URL(string: "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q3_K_L.gguf?download=true") else { + print("Invalid URL.") + throw URLError(.badURL) + } + + // Start the async download + let (tempURL, _) = try await URLSession.shared.download(from: url) + + // Define the destination path in the documents directory + + + // Move the downloaded file to the destination + try fm.moveItem(at: tempURL, to: destinationURL) + print("File downloaded to: \(destinationURL.path())") + return destinationURL.path() + } + + @Test func llamaGrammarSession() async throws { + let params = GPTParams() + params.modelPath = try await downloadFile() + params.nPredict = 256 + params.nCtx = 1024 + params.cpuParams.nThreads = 4 + params.cpuParamsBatch.nThreads = 4 + params.nBatch = 1024 + params.nGpuLayers = 128 + params.chatTemplate = """ + <|system|> + {system_message} + <|user|> + {prompt} + <|assistant|> + """ + params.prompt = """ + You are a travel agent. The current date epoch \(Date.now.timeIntervalSince1970). + Responses should have the following fields: + + location: the location of the trip + startDate: the start of the trip as the unix epoch since 1970 + durationInDays: the duration of the trip in days + + """ + params.interactive = true + let session = try await LlamaSession<Trip>(params: params) + await #expect(throws: Never.self) { + let trip = try await session.chat(message: "Please create a trip for me to New York City that starts two weeks from now. The duration of the trip MUST be 3 days long.") + #expect(trip.location.contains("New York")) + // TODO: Testing the other fields is difficult considering model size + // TODO: so for now, we are just asserting the grammar works + } + } +} + +import WeatherKit +import CoreLocation + +@llamaActor actor MyLlama { + struct CurrentWeather: Codable { + let temperature: Double + let condition: WeatherCondition + } + + /// Get the current weather in a given location. + /// - parameter location: The city and state, e.g. San Francisco, CA + /// - parameter unit: The unit of temperature + public static func getCurrentWeather(location: String, unit: String) async throws -> CurrentWeather { + let weather = try await WeatherService().weather(for: CLGeocoder().geocodeAddressString(location)[0].location!)
+ var temperature = weather.currentWeather.temperature + temperature.convert(to: .fahrenheit) + return CurrentWeather(temperature: temperature.value, + condition: weather.currentWeather.condition) + } +} + +func downloadFile() async throws -> String { + let fm = FileManager.default + let tmpDir = fm.temporaryDirectory + let destinationURL = tmpDir.appending(path: "llama_groq_gguf.gguf") + + guard !fm.fileExists(atPath: destinationURL.path()) else { + return destinationURL.path() + } + print("Downloading Llama Tools, this may take a while...") + // Define the URL + guard let url = URL(string: "https://huggingface.co/bartowski/Llama-3-Groq-8B-Tool-Use-GGUF/resolve/main/Llama-3-Groq-8B-Tool-Use-Q5_K_M.gguf?download=true") else { + print("Invalid URL.") + throw URLError(.badURL) + } + + // Start the async download + let (tempURL, _) = try await URLSession.shared.download(from: url) + + // Define the destination path in the documents directory + + + // Move the downloaded file to the destination + try fm.moveItem(at: tempURL, to: destinationURL) + print("File downloaded to: \(destinationURL.path())") + return destinationURL.path() +} + +@Test func llamaToolSession() async throws { + let params = GPTParams() + params.modelPath = try await downloadFile() + params.nPredict = 512 + params.nCtx = 4096 + params.cpuParams.nThreads = 8 + params.cpuParamsBatch.nThreads = 8 + params.nBatch = 1024 + params.nGpuLayers = 1024 + let llama = try await MyLlama(params: params) + let currentWeather = try await MyLlama.getCurrentWeather(location: "San Francisco, CA", unit: "fahrenheit") + let output = try await llama.chat("What's the weather (in fahrenheit) in San Francisco, CA?") + #expect(output.contains(String(format: "%.2f", currentWeather.temperature))) + // #expect(output.contains(currentWeather.condition.rawValue)) +}
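Usage note (not part of the diff above): the snippet below is a minimal sketch of driving the plain LlamaChatSession introduced in swift/LlamaKit/LlamaKit.swift, without grammar constraints or tool calling. It only uses the GPTParams properties and LlamaChatSession API added by this patch; the model path and parameter values are placeholders chosen for illustration, mirroring the settings used in swift/main/main.swift and the tests.

// Minimal LlamaChatSession sketch (assumed file: a top-level main.swift in an app or tool target)
import LlamaKit

let chatParams = GPTParams()
chatParams.modelPath = "/path/to/model.gguf" // placeholder: any local GGUF chat model
chatParams.nCtx = 2048
chatParams.nPredict = 256
chatParams.cpuParams.nThreads = 4
chatParams.nBatch = 512
chatParams.interactive = true // the session loop reads input lines from the blocking queue

// The initializer spins up the underlying LlamaObjC.LlamaSession and flushes the first output line.
let chatSession = try await LlamaChatSession(params: chatParams)

// Each chat(message:) call enqueues one input line and returns the next output line.
let reply = await chatSession.chat(message: "Hello! Summarize what you can do in one sentence.")
print(reply)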