Init LlamaObjC Commit
parent 6026da52d6, commit 56f9d4b52a
29 changed files with 3472 additions and 32 deletions
Package.resolved  (new file, 14 lines)

@@ -0,0 +1,14 @@
{
  "pins" : [
    {
      "identity" : "swift-syntax",
      "kind" : "remoteSourceControl",
      "location" : "https://github.com/apple/swift-syntax.git",
      "state" : {
        "branch" : "main",
        "revision" : "2c271e5ce55124ae534c2eff6e74f745e4db4f68"
      }
    }
  ],
  "version" : 2
}
Package.swift  (114 changes)

@@ -1,21 +1,28 @@
-// swift-tools-version:5.5
+// swift-tools-version:5.9
+import CompilerPluginSupport
 import PackageDescription

-var sources = [
+var cppSources = [
     "src/llama.cpp",
     "src/llama-vocab.cpp",
     "src/llama-grammar.cpp",
     "src/llama-sampling.cpp",
     "src/unicode.cpp",
     "src/unicode-data.cpp",
-    "ggml/src/ggml.c",
-    "ggml/src/ggml-alloc.c",
-    "ggml/src/ggml-backend.c",
-    "ggml/src/ggml-quants.c",
-    "ggml/src/ggml-aarch64.c",
     "common/sampling.cpp",
     "common/common.cpp",
     "common/json-schema-to-grammar.cpp",
     "common/log.cpp",
     "common/console.cpp"
 ]
+
+var ggmlSources = [
+    "src/ggml.c",
+    "src/ggml-alloc.c",
+    "src/ggml-backend.c",
+    "src/ggml-quants.c",
+    "src/ggml-aarch64.c"
+]
 var resources: [Resource] = []
 var linkerSettings: [LinkerSetting] = []
 var cSettings: [CSetting] = [

@@ -24,13 +31,13 @@ var cSettings: [CSetting] = [
     // NOTE: NEW_LAPACK will required iOS version 16.4+
     // We should consider add this in the future when we drop support for iOS 14
     // (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
-    // .define("ACCELERATE_NEW_LAPACK"),
-    // .define("ACCELERATE_LAPACK_ILP64")
+    .define("ACCELERATE_NEW_LAPACK"),
+    .define("ACCELERATE_LAPACK_ILP64")
 ]

 #if canImport(Darwin)
-sources.append("ggml/src/ggml-metal.m")
-resources.append(.process("ggml/src/ggml-metal.metal"))
+ggmlSources.append("src/ggml-metal.m")
+resources.append(.process("src/ggml-metal.metal"))
 linkerSettings.append(.linkedFramework("Accelerate"))
 cSettings.append(
     contentsOf: [

@@ -47,33 +54,84 @@ cSettings.append(
 let package = Package(
     name: "llama",
     platforms: [
-        .macOS(.v12),
+        .macOS(.v13),
         .iOS(.v14),
         .watchOS(.v4),
         .tvOS(.v14)
     ],
     products: [
         .library(name: "llama", targets: ["llama"]),
+        .executable(name: "LlamaKitMain", targets: ["LlamaKitMain"])
+    ],
+    dependencies: [
+        .package(url: "https://github.com/apple/swift-syntax.git", branch: "main")
     ],
     targets: [
+        .target(name: "llama_cpp",
+                path: ".",
+                exclude: [
+                    "cmake",
+                    "examples",
+                    "scripts",
+                    "models",
+                    "tests",
+                    "CMakeLists.txt",
+                    "Makefile"
+                ],
+                sources: cppSources,
+                publicHeadersPath: "spm-headers"),
         .target(
             name: "llama",
-            path: ".",
-            exclude: [
-                "cmake",
-                "examples",
-                "scripts",
-                "models",
-                "tests",
-                "CMakeLists.txt",
-                "Makefile"
-            ],
-            sources: sources,
+            dependencies: ["llama_cpp"],
+            path: "ggml",
+            sources: ggmlSources,
             resources: resources,
             publicHeadersPath: "spm-headers",
             cSettings: cSettings,
-            linkerSettings: linkerSettings
-        )
+            linkerSettings: linkerSettings),
+        .target(name: "LlamaObjC",
+                dependencies: ["llama"],
+                path: "objc",
+                sources: [
+                    "GPTParams.mm",
+                    "GPTSampler.mm",
+                    "LlamaBatch.mm",
+                    "LlamaObjC.mm",
+                    "LlamaModel.mm",
+                    "LlamaContext.mm",
+                    "LlamaSession.mm",
+                ],
+                publicHeadersPath: "include",
+                cSettings: cSettings,
+                linkerSettings: linkerSettings),
+        .macro(
+            name: "JSONSchemaMacros",
+            dependencies: [
+                .product(name: "SwiftSyntax", package: "swift-syntax"),
+                .product(name: "SwiftSyntaxMacros", package: "swift-syntax"),
+                .product(name: "SwiftCompilerPlugin", package: "swift-syntax"),
+            ],
+            path: "swift/JSONSchemaMacros"
+        ),
+        .target(
+            name: "JSONSchema",
+            dependencies: ["JSONSchemaMacros"],
+            path: "swift/JSONSchema"
+        ),
+        .target(
+            name: "LlamaKit",
+            dependencies: ["JSONSchema", "LlamaObjC"],
+            path: "swift/LlamaKit"
+        ),
+        .testTarget(name: "LlamaKitTests",
+                    dependencies: ["LlamaKit", "JSONSchema", "JSONSchemaMacros"],
+                    path: "swift/test",
+                    linkerSettings: [
+                        .linkedFramework("XCTest"),
+                        .linkedFramework("Testing")]),
+        .executableTarget(name: "LlamaKitMain",
+                          dependencies: ["LlamaKit"],
+                          path: "swift/main",
+                          resources: [.process("Llama-3.2-3B-Instruct-Q4_0.gguf")]),
     ],
-    cxxLanguageStandard: .cxx11
+    cxxLanguageStandard: .cxx17
 )
(file name not captured in the extraction; modified header carrying the build-info globals)

@@ -34,10 +34,10 @@ struct llama_lora_adapter_container : llama_lora_adapter_info {
 };

 // build info
-extern int LLAMA_BUILD_NUMBER;
-extern char const * LLAMA_COMMIT;
-extern char const * LLAMA_COMPILER;
-extern char const * LLAMA_BUILD_TARGET;
+static int LLAMA_BUILD_NUMBER = 0;
+static char const * LLAMA_COMMIT = "";
+static char const * LLAMA_COMPILER = "";
+static char const * LLAMA_BUILD_TARGET = "";

 struct llama_control_vector_load_info;
objc/GPTParams.mm  (new file, 726 lines)

@@ -0,0 +1,726 @@
#import <Foundation/Foundation.h>
#import "GPTParams_Private.hpp"
#import "../common/common.h"
#import "ggml.h"

@implementation GGMLThreadpool {
    ggml_threadpool *threadpool;
}

- (instancetype)initWithThreadpool:(ggml_threadpool *)threadpool
{
    self = [super init];
    if (self) {
        self->threadpool = threadpool;
    }
    return self;
}

- (ggml_threadpool *)threadpool {
    return threadpool;
}

@end

@implementation GGMLThreadpoolParams {
    ggml_threadpool_params params;
}

- (BOOL)getCpuMaskAtIndex:(NSUInteger)index {
    abort();
}

- (void)setCpuMask:(BOOL)value atIndex:(NSUInteger)index {
    abort();
}

- (instancetype)initWithParams:(ggml_threadpool_params&&)params
{
    self = [super init];
    if (self) {
        self->params = params;
    }
    return self;
}

- (BOOL)isEqual:(id)other {
    GGMLThreadpoolParams *rhs = (GGMLThreadpoolParams *)other;
    ggml_threadpool_params rhs_params = rhs->params;
    return ggml_threadpool_params_match(&params, &rhs_params);
}

- (GGMLThreadpool *)threadpool {
    auto tp = ggml_threadpool_new(&params);
    return [[GGMLThreadpool alloc] initWithThreadpool:tp];
}
@end

@implementation CPUParams {
    cpu_params *params;
}

- (instancetype)initWithParams:(cpu_params&)params;
{
    self = [super init];
    if (self) {
        self->params = &params;
    }
    return self;
}

- (int)nThreads {
    return params->n_threads;
}

- (void)setNThreads:(int)nThreads {
    params->n_threads = nThreads;
}

- (BOOL)maskValid {
    return params->mask_valid;
}

- (void)setMaskValid:(BOOL)maskValid {
    params->mask_valid = maskValid;
}

- (GGMLSchedPriority)priority {
    return GGMLSchedPriority(params->priority);
}

- (void)setPriority:(GGMLSchedPriority)priority {
    params->priority = ggml_sched_priority(priority);
}

- (BOOL)strictCPU {
    return params->strict_cpu;
}

- (void)setStrictCPU:(BOOL)strictCPU {
    params->strict_cpu = strictCPU;
}

- (uint32_t)poll {
    return params->poll;
}

- (void)setPoll:(uint32_t)poll {
    params->poll = poll;
}

- (BOOL)getCpuMaskAtIndex:(NSUInteger)index {
    return params->cpumask[index];
}

- (void)setCpuMask:(BOOL)value atIndex:(NSUInteger)index {
    params->cpumask[index] = value;
}

- (GGMLThreadpoolParams *)ggmlThreadpoolParams {
    return [[GGMLThreadpoolParams alloc] initWithParams:ggml_threadpool_params_from_cpu_params(*params)];
}

@end

@implementation GPTSamplerParams {
    gpt_sampler_params *gpt_sampler_params;
}

- (instancetype)initWithParams:(gpt_sampler_params&)params {
    self = [super init];
    if (self) {
        gpt_sampler_params = &params;
    }
    return self;
}

// Getters and setters for Objective-C properties, which manipulate the C++ struct

- (uint32_t)seed {
    return gpt_sampler_params->seed;
}

- (void)setSeed:(uint32_t)seed {
    gpt_sampler_params->seed = seed;
}

- (int32_t)nPrev {
    return gpt_sampler_params->n_prev;
}

- (void)setNPrev:(int32_t)nPrev {
    gpt_sampler_params->n_prev = nPrev;
}

- (int32_t)nProbs {
    return gpt_sampler_params->n_probs;
}

- (void)setNProbs:(int32_t)nProbs {
    gpt_sampler_params->n_probs = nProbs;
}

- (int32_t)minKeep {
    return gpt_sampler_params->min_keep;
}

- (void)setMinKeep:(int32_t)minKeep {
    gpt_sampler_params->min_keep = minKeep;
}

- (int32_t)topK {
    return gpt_sampler_params->top_k;
}

- (void)setTopK:(int32_t)topK {
    gpt_sampler_params->top_k = topK;
}

- (float)topP {
    return gpt_sampler_params->top_p;
}

- (void)setTopP:(float)topP {
    gpt_sampler_params->top_p = topP;
}

- (float)minP {
    return gpt_sampler_params->min_p;
}

- (void)setMinP:(float)minP {
    gpt_sampler_params->min_p = minP;
}

- (float)tfsZ {
    return gpt_sampler_params->tfs_z;
}

- (void)setTfsZ:(float)tfsZ {
    gpt_sampler_params->tfs_z = tfsZ;
}

- (float)typP {
    return gpt_sampler_params->typ_p;
}

- (void)setTypP:(float)typP {
    gpt_sampler_params->typ_p = typP;
}

- (float)temp {
    return gpt_sampler_params->temp;
}

- (void)setTemp:(float)temp {
    gpt_sampler_params->temp = temp;
}

- (float)dynatempRange {
    return gpt_sampler_params->dynatemp_range;
}

- (void)setDynatempRange:(float)dynatempRange {
    gpt_sampler_params->dynatemp_range = dynatempRange;
}

- (float)dynatempExponent {
    return gpt_sampler_params->dynatemp_exponent;
}

- (void)setDynatempExponent:(float)dynatempExponent {
    gpt_sampler_params->dynatemp_exponent = dynatempExponent;
}

- (int32_t)penaltyLastN {
    return gpt_sampler_params->penalty_last_n;
}

- (void)setPenaltyLastN:(int32_t)penaltyLastN {
    gpt_sampler_params->penalty_last_n = penaltyLastN;
}

- (float)penaltyRepeat {
    return gpt_sampler_params->penalty_repeat;
}

- (void)setPenaltyRepeat:(float)penaltyRepeat {
    gpt_sampler_params->penalty_repeat = penaltyRepeat;
}

- (float)penaltyFreq {
    return gpt_sampler_params->penalty_freq;
}

- (void)setPenaltyFreq:(float)penaltyFreq {
    gpt_sampler_params->penalty_freq = penaltyFreq;
}

- (float)penaltyPresent {
    return gpt_sampler_params->penalty_present;
}

- (void)setPenaltyPresent:(float)penaltyPresent {
    gpt_sampler_params->penalty_present = penaltyPresent;
}

- (int32_t)mirostat {
    return gpt_sampler_params->mirostat;
}

- (void)setMirostat:(int32_t)mirostat {
    gpt_sampler_params->mirostat = mirostat;
}

- (float)mirostatTau {
    return gpt_sampler_params->mirostat_tau;
}

- (void)setMirostatTau:(float)mirostatTau {
    gpt_sampler_params->mirostat_tau = mirostatTau;
}

- (float)mirostatEta {
    return gpt_sampler_params->mirostat_eta;
}

- (void)setMirostatEta:(float)mirostatEta {
    gpt_sampler_params->mirostat_eta = mirostatEta;
}

- (BOOL)penalizeNl {
    return gpt_sampler_params->penalize_nl;
}

- (void)setPenalizeNl:(BOOL)penalizeNl {
    gpt_sampler_params->penalize_nl = penalizeNl;
}

- (BOOL)ignoreEos {
    return gpt_sampler_params->ignore_eos;
}

- (void)setIgnoreEos:(BOOL)ignoreEos {
    gpt_sampler_params->ignore_eos = ignoreEos;
}

- (BOOL)noPerf {
    return gpt_sampler_params->no_perf;
}

- (void)setNoPerf:(BOOL)noPerf {
    gpt_sampler_params->no_perf = noPerf;
}

// For `samplers`, convert from NSArray<NSNumber *> to std::vector
- (NSArray<NSNumber *> *)samplers {
    NSMutableArray<NSNumber *> *samplersArray = [NSMutableArray array];
    for (auto sampler : gpt_sampler_params->samplers) {
        [samplersArray addObject:@(sampler)];
    }
    return [samplersArray copy];
}

- (void)setSamplers:(NSArray<NSNumber *> *)samplers {
    gpt_sampler_params->samplers.clear();
    for (NSNumber *sampler in samplers) {
        gpt_sampler_params->samplers.push_back(static_cast<gpt_sampler_type>(sampler.intValue));
    }
}

//// For `logitBias`, convert from NSArray<NSNumber *> to std::vector
//- (NSArray<NSNumber *> *)logitBias {
//    NSMutableArray<llama_logit_bias *> *logitBiasArray = [NSMutableArray array];
//    for (auto bias : gpt_sampler_params.logit_bias) {
//        [logitBiasArray addObject:bias];
//    }
//    return [logitBiasArray copy];
//}
//
//- (void)setLogitBias:(NSArray<NSNumber *> *)logitBias {
//    gpt_sampler_params.logit_bias.clear();
//    for (NSNumber *bias in logitBias) {
//        gpt_sampler_params.logit_bias.push_back(bias.floatValue);
//    }
//}

// For `grammar`, convert between NSString and std::string
- (NSString *)grammar {
    return [NSString stringWithUTF8String:gpt_sampler_params->grammar.c_str()];
}

- (void)setGrammar:(NSString *)grammar {
    gpt_sampler_params->grammar = std::string([grammar UTF8String]);
}

// Method to print out the parameters as a string
- (NSString *)print {
    NSMutableString *output = [NSMutableString stringWithString:@"GPT Sampler Params:\n"];
    [output appendFormat:@"Seed: %u\n", self.seed];
    [output appendFormat:@"nPrev: %d\n", self.nPrev];
    [output appendFormat:@"nProbs: %d\n", self.nProbs];
    [output appendFormat:@"minKeep: %d\n", self.minKeep];
    [output appendFormat:@"topK: %d\n", self.topK];
    [output appendFormat:@"topP: %.2f\n", self.topP];
    [output appendFormat:@"minP: %.2f\n", self.minP];
    [output appendFormat:@"tfsZ: %.2f\n", self.tfsZ];
    [output appendFormat:@"typP: %.2f\n", self.typP];
    [output appendFormat:@"temp: %.2f\n", self.temp];
    [output appendFormat:@"dynatempRange: %.2f\n", self.dynatempRange];
    [output appendFormat:@"dynatempExponent: %.2f\n", self.dynatempExponent];
    [output appendFormat:@"penaltyLastN: %d\n", self.penaltyLastN];
    [output appendFormat:@"penaltyRepeat: %.2f\n", self.penaltyRepeat];
    [output appendFormat:@"penaltyFreq: %.2f\n", self.penaltyFreq];
    [output appendFormat:@"penaltyPresent: %.2f\n", self.penaltyPresent];
    [output appendFormat:@"mirostat: %d\n", self.mirostat];
    [output appendFormat:@"mirostatTau: %.2f\n", self.mirostatTau];
    [output appendFormat:@"mirostatEta: %.2f\n", self.mirostatEta];
    [output appendFormat:@"penalizeNl: %@\n", self.penalizeNl ? @"YES" : @"NO"];
    [output appendFormat:@"ignoreEos: %@\n", self.ignoreEos ? @"YES" : @"NO"];
    [output appendFormat:@"noPerf: %@\n", self.noPerf ? @"YES" : @"NO"];
    [output appendFormat:@"Grammar: %@\n", self.grammar];

    // Print samplers
    [output appendString:@"Samplers: "];
    for (NSNumber *sampler in self.samplers) {
        [output appendFormat:@"%d, ", sampler.intValue];
    }
    [output appendString:@"\n"];

    // Print logit biases
    [output appendString:@"Logit Biases: "];
    for (NSNumber *bias in self.logitBias) {
        [output appendFormat:@"%.2f, ", bias.floatValue];
    }
    [output appendString:@"\n"];

    return [output copy];
}

- (gpt_sampler_params&)cParams {
    return *gpt_sampler_params;
}

@end

@implementation GPTParams {
    gpt_params gpt_params;
}

- (NSArray<NSString *> *)antiPrompts {
    auto antiprompts = [[NSMutableArray alloc] init];
    for (auto& antiprompt : gpt_params.antiprompt) {
        [antiprompts addObject:[NSString stringWithCString:antiprompt.c_str() encoding:NSUTF8StringEncoding]];
    }
    return antiprompts;
}

- (gpt_params&)params {
    return gpt_params;
}

- (int32_t)nPredict {
    return gpt_params.n_predict;
}

- (void)setNPredict:(int32_t)nPredict {
    gpt_params.n_predict = nPredict;
}

- (NSInteger)nCtx {
    return gpt_params.n_ctx;
}

- (void)setNCtx:(NSInteger)nCtx {
    gpt_params.n_ctx = nCtx;
}

- (int32_t)nBatch {
    return gpt_params.n_batch;
}

- (void)setNBatch:(int32_t)nBatch {
    gpt_params.n_batch = nBatch;
}

- (int32_t)nUBatch {
    return gpt_params.n_ubatch;
}

- (void)setNUBatch:(int32_t)nUBatch {
    gpt_params.n_ubatch = nUBatch;
}

- (int32_t)nKeep {
    return gpt_params.n_keep;
}

- (void)setNKeep:(int32_t)nKeep {
    gpt_params.n_keep = nKeep;
}

- (int32_t)nDraft {
    return gpt_params.n_draft;
}

- (void)setNDraft:(int32_t)nDraft {
    gpt_params.n_draft = nDraft;
}

- (int32_t)nChunks {
    return gpt_params.n_chunks;
}

- (void)setNChunks:(int32_t)nChunks {
    gpt_params.n_chunks = nChunks;
}

- (int32_t)nParallel {
    return gpt_params.n_parallel;
}

- (void)setNParallel:(int32_t)nParallel {
    gpt_params.n_parallel = nParallel;
}

- (int32_t)nSequences {
    return gpt_params.n_sequences;
}

- (void)setNSequences:(int32_t)nSequences {
    gpt_params.n_sequences = nSequences;
}

- (float)pSplit {
    return gpt_params.p_split;
}

- (void)setPSplit:(float)pSplit {
    gpt_params.p_split = pSplit;
}

- (int32_t)nGpuLayers {
    return gpt_params.n_gpu_layers;
}

- (void)setNGpuLayers:(int32_t)nGpuLayers {
    gpt_params.n_gpu_layers = nGpuLayers;
}

- (int32_t)nGpuLayersDraft {
    return gpt_params.n_gpu_layers_draft;
}

- (void)setNGpuLayersDraft:(int32_t)nGpuLayersDraft {
    gpt_params.n_gpu_layers_draft = nGpuLayersDraft;
}

- (int32_t)mainGpu {
    return gpt_params.main_gpu;
}

- (void)setMainGpu:(int32_t)mainGpu {
    gpt_params.main_gpu = mainGpu;
}

- (int32_t)grpAttnN {
    return gpt_params.grp_attn_n;
}

- (void)setGrpAttnN:(int32_t)grpAttnN {
    gpt_params.grp_attn_n = grpAttnN;
}

- (int32_t)grpAttnW {
    return gpt_params.grp_attn_w;
}

- (void)setGrpAttnW:(int32_t)grpAttnW {
    gpt_params.grp_attn_w = grpAttnW;
}

- (int32_t)nPrint {
    return gpt_params.n_print;
}

- (void)setNPrint:(int32_t)nPrint {
    gpt_params.n_print = nPrint;
}

- (float)ropeFreqBase {
    return gpt_params.rope_freq_base;
}

- (void)setRopeFreqBase:(float)ropeFreqBase {
    gpt_params.rope_freq_base = ropeFreqBase;
}

- (float)ropeFreqScale {
    return gpt_params.rope_freq_scale;
}

- (void)setRopeFreqScale:(float)ropeFreqScale {
    gpt_params.rope_freq_scale = ropeFreqScale;
}

- (float)yarnExtFactor {
    return gpt_params.yarn_ext_factor;
}

- (void)setYarnExtFactor:(float)yarnExtFactor {
    gpt_params.yarn_ext_factor = yarnExtFactor;
}

- (float)yarnAttnFactor {
    return gpt_params.yarn_attn_factor;
}

- (void)setYarnAttnFactor:(float)yarnAttnFactor {
    gpt_params.yarn_attn_factor = yarnAttnFactor;
}

- (float)yarnBetaFast {
    return gpt_params.yarn_beta_fast;
}

- (void)setYarnBetaFast:(float)yarnBetaFast {
    gpt_params.yarn_beta_fast = yarnBetaFast;
}

- (float)yarnBetaSlow {
    return gpt_params.yarn_beta_slow;
}

- (void)setYarnBetaSlow:(float)yarnBetaSlow {
    gpt_params.yarn_beta_slow = yarnBetaSlow;
}

- (int32_t)yarnOrigCtx {
    return gpt_params.yarn_orig_ctx;
}

- (void)setYarnOrigCtx:(int32_t)yarnOrigCtx {
    gpt_params.yarn_orig_ctx = yarnOrigCtx;
}

- (float)defragThold {
    return gpt_params.defrag_thold;
}

- (void)setDefragThold:(float)defragThold {
    gpt_params.defrag_thold = defragThold;
}

// Assuming tensorSplit remains a fixed array in C struct, we can create a method to access specific values.
- (float)tensorSplitAtIndex:(NSUInteger)index {
    if (index < 128) {
        return gpt_params.tensor_split[index];
    }
    return 0.0f; // Return default value if index is out of bounds
}

- (void)setTensorSplitValue:(float)value atIndex:(NSUInteger)index {
    if (index < 128) {
        gpt_params.tensor_split[index] = value;
    }
}

- (BOOL)embedding {
    return gpt_params.embedding;
}

- (void)setEmbedding:(BOOL)embedding {
    gpt_params.embedding = embedding;
}

- (LlamaModelParams *)LlamaModelParams {
    return nil;
}

- (BOOL)ctxShift {
    return gpt_params.ctx_shift;
}

- (void)setCtxShift:(BOOL)ctxShift {
    gpt_params.ctx_shift = ctxShift;
}

- (CPUParams *)cpuParams {
    return [[CPUParams alloc] initWithParams:gpt_params.cpuparams];
}

- (CPUParams *)cpuParamsBatch {
    return [[CPUParams alloc] initWithParams:gpt_params.cpuparams_batch];
}

- (GPTSamplerParams *)samplerParams {
    return [[GPTSamplerParams alloc] initWithParams:gpt_params.sparams];
}

- (NSString *)modelURL {
    return [NSString stringWithCString:gpt_params.model_url.c_str() encoding:NSUTF8StringEncoding];
}

- (void)setModelURL:(NSString *)modelURL {
    gpt_params.model_url = [modelURL cStringUsingEncoding:NSUTF8StringEncoding];
}

- (NSString *)modelPath {
    return [NSString stringWithCString:gpt_params.model.c_str() encoding:NSUTF8StringEncoding];
}

- (void)setModelPath:(NSString *)modelPath {
    gpt_params.model = [modelPath cStringUsingEncoding:NSUTF8StringEncoding];
}

- (NSString *)pathPromptCache {
    return [[NSString alloc] initWithCString:gpt_params.path_prompt_cache.c_str() encoding:NSUTF8StringEncoding];
}

- (void)setPathPromptCache:(NSString *)pathPromptCache {
    gpt_params.path_prompt_cache = [pathPromptCache cStringUsingEncoding:NSUTF8StringEncoding];
}

- (BOOL)enableChatTemplate {
    return gpt_params.enable_chat_template;
}

- (void)setEnableChatTemplate:(BOOL)enableChatTemplate {
    gpt_params.enable_chat_template = enableChatTemplate;
}

- (NSString *)chatTemplate {
    return [NSString stringWithCString:gpt_params.chat_template.c_str()
                              encoding:NSUTF8StringEncoding];
}

- (void)setChatTemplate:(NSString *)chatTemplate {
    gpt_params.chat_template = [chatTemplate cStringUsingEncoding:NSUTF8StringEncoding];
}

- (NSString *)inputPrefix {
    return [NSString stringWithCString:gpt_params.input_prefix.c_str()
                              encoding:NSUTF8StringEncoding];
}

- (void)setInputPrefix:(NSString *)inputPrefix {
    gpt_params.input_prefix = [inputPrefix cStringUsingEncoding:NSUTF8StringEncoding];
}

- (NSString *)inputSuffix {
    return [NSString stringWithCString:gpt_params.input_suffix.c_str()
                              encoding:NSUTF8StringEncoding];
}

- (void)setInputSuffix:(NSString *)inputSuffix {
    gpt_params.input_suffix = [inputSuffix cStringUsingEncoding:NSUTF8StringEncoding];
}


- (LlamaContextParams *)llamaContextParams {
}

- (LlamaModelParams *)llamaModelParams {
}

@end
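The classes above are thin property bridges: each Objective-C accessor reads or writes the corresponding field of an underlying C++ `gpt_params` / `gpt_sampler_params` struct, so mutations through the wrappers flow straight into the llama.cpp common layer. A rough usage sketch (not part of the commit; the property names come from the implementations above, while the path and values are illustrative):

    GPTParams *params = [[GPTParams alloc] init];
    params.modelPath = @"models/Llama-3.2-3B-Instruct-Q4_0.gguf"; // illustrative path
    params.nCtx      = 4096;            // -> gpt_params.n_ctx
    params.nPredict  = 256;             // -> gpt_params.n_predict
    params.samplerParams.temp = 0.8f;   // writes through to gpt_params.sparams.temp
    params.cpuParams.nThreads = 8;      // writes through to cpuparams.n_threads

Note that `samplerParams` and `cpuParams` return fresh wrapper objects on each call, but both hold pointers into the same `gpt_params` instance, so the write-through still works.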
objc/GPTSampler.mm  (new file, 49 lines)

@@ -0,0 +1,49 @@
#import <Foundation/Foundation.h>
#import <GPTSampler.h>
#import <GPTParams_Private.hpp>
#import <LlamaModel_Private.hpp>
#import <LlamaContext_Private.hpp>
#import "../../common/sampling.h"

@implementation GPTSampler {
    gpt_sampler *sampler;
}

- (instancetype)init:(LlamaModel *)model gptSamplerParams:(GPTSamplerParams *)gptSamplerParams
{
    self = [super init];
    if (self) {
        self->sampler = gpt_sampler_init([model cModel], [gptSamplerParams cParams]);
    }
    return self;
}

- (uint32_t)seed {
    return gpt_sampler_get_seed(sampler);
}

- (LlamaToken)sample:(LlamaContext *)context index:(NSInteger)index {
    return [self sample:context index:index grammarFirst:false];
}

- (LlamaToken)sample:(LlamaContext *)context index:(NSInteger)index grammarFirst:(BOOL)grammarFirst {
    return gpt_sampler_sample(sampler, [context cContext], index, grammarFirst);
}

- (void)accept:(LlamaToken)token acceptGrammar:(BOOL)acceptGrammar {
    gpt_sampler_accept(sampler, token, acceptGrammar);
}

- (NSString *)previousString:(LlamaContext *)context n:(NSInteger)n {
    return [[NSString alloc] initWithCString:gpt_sampler_prev_str(sampler, [context cContext], n).data() encoding:NSUTF8StringEncoding];
}

- (LlamaToken)last {
    return gpt_sampler_last(sampler);
}

- (void)reset {
    gpt_sampler_reset(sampler);
}

@end
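GPTSampler wraps the `gpt_sampler` chain from common/sampling.h: `sample:index:` draws the next token from the context's current logits, and `accept:acceptGrammar:` pushes it back into the sampler history. A minimal generation-loop sketch (assumes `model`, `ctx`, and `params` objects created as in the earlier sections; this is an illustration, not part of the commit):

    GPTSampler *smpl = [[GPTSampler alloc] init:model gptSamplerParams:[params samplerParams]];
    int nRemain = 64;                                  // sampling budget (illustrative)
    while (nRemain-- > 0) {
        LlamaToken tok = [smpl sample:ctx index:-1];   // sample from last logits
        [smpl accept:tok acceptGrammar:true];          // update sampler state
        if ([model tokenIsEOG:tok]) break;             // stop at end-of-generation
        NSLog(@"%@", [ctx tokenToPiece:tok]);          // detokenize for display
        // ...the token would then be fed back through -decode: on the context
    }

This is the same shape as the loop in LlamaSession.mm further down.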
objc/LlamaBatch.mm  (new file, 21 lines)

@@ -0,0 +1,21 @@
#import <Foundation/Foundation.h>
#import "LlamaBatch_Private.hpp"
#import "llama.h"

@implementation LlamaBatch {
    llama_batch batch;
}

- (instancetype)initWithBatch:(llama_batch)batch {
    self->batch = batch;
}

- (NSData *)output {
    return [[NSData alloc] initWithBytes:batch.logits length:batch.n_tokens];
}

- (llama_batch)cBatch {
    return batch;
}

@end
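As committed, `-initWithBatch:` assigns the ivar but never calls `[super init]` and never returns a value, so the behavior at the call site is undefined. A conventional form of the initializer would read (sketch only, not the committed code):

    - (instancetype)initWithBatch:(llama_batch)batch {
        self = [super init];      // missing in the committed version
        if (self) {
            self->batch = batch;
        }
        return self;              // also missing in the committed version
    }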
objc/LlamaContext.mm  (new file, 94 lines)

@@ -0,0 +1,94 @@
#import <Foundation/Foundation.h>
#import "LlamaContext_Private.hpp"
#import "GPTParams_Private.hpp"
#import "LlamaModel_Private.hpp"
#import "LlamaBatch_Private.hpp"
#import "../../common/common.h"

@implementation LlamaContext {
    llama_context *ctx;
}

- (instancetype)initWithContext:(llama_context *)context {
    self = [super init];
    if (self) {
        ctx = context;
    }
    return self;
}

- (void)attachThreadpool:(GGMLThreadpool *)threadpool
         threadpoolBatch:(GGMLThreadpool *)threadpoolBatch {
    llama_attach_threadpool(ctx, [threadpool threadpool], [threadpoolBatch threadpool]);
}


- (NSUInteger)nCtx {
    return llama_n_ctx(ctx);
}

- (BOOL)loadStateFile:(NSString *)pathSession
            tokensOut:(llama_token *)tokensOut
        nTokenCpacity:(size_t)nTokenCapacity
       nTokenCountOut:(size_t *)nTokenCountOut {
    return llama_state_load_file(ctx, [pathSession cStringUsingEncoding:NSUTF8StringEncoding], tokensOut, nTokenCapacity, nTokenCountOut);
}

- (LlamaModel *)model {
    auto model = llama_get_model(ctx);
    return [[LlamaModel alloc] init:std::remove_const_t<llama_model *>(model)];
}

- (std::vector<llama_token>)tokenize:(NSString *)text
                          addSpecial:(BOOL)addSpecial
                        parseSpecial:(BOOL)parseSpecial {
    return llama_tokenize(ctx, [text cStringUsingEncoding:NSUTF8StringEncoding], addSpecial, parseSpecial);
}

- (std::string)convertTokensToString:(const std::vector<llama_token>&)tokens {
    return string_from(ctx, tokens);
}

- (llama_context *)cContext {
    return ctx;
}

- (int32_t)encode:(llama_batch)batch {
    return llama_encode(ctx, batch);
}

- (void)kvCacheSeqAdd:(LlamaSequenceId)sequenceId
                   p0:(LlamaPosition)p0
                   p1:(LlamaPosition)p1
                delta:(LlamaPosition)delta {
    llama_kv_cache_seq_add(ctx, sequenceId, p0, p1, delta);
}

- (void)kvCacheSeqDiv:(LlamaSequenceId)sequenceId
                   p0:(LlamaPosition)p0
                   p1:(LlamaPosition)p1
                delta:(LlamaPosition)delta {
    llama_kv_cache_seq_div(ctx, sequenceId, p0, p1, delta);
}

- (NSString *)tokenToPiece:(LlamaToken)token {
    return [self tokenToPiece:token special:YES];
}

- (NSString *)tokenToPiece:(LlamaToken)token special:(BOOL)special {
    return [[NSString alloc] initWithCString:llama_token_to_piece(ctx, token, special).c_str() encoding:NSUTF8StringEncoding];
}

- (NSInteger)decode:(LlamaBatch *)batch {
    return llama_decode(ctx, [batch cBatch]);
}

- (BOOL)saveStateFile:(NSString *)pathSession
               tokens:(const LlamaToken *)tokens
          nTokenCount:(size_t)nTokenCount {
    return llama_state_save_file(ctx,
                                 [pathSession cStringUsingEncoding:NSUTF8StringEncoding],
                                 tokens, nTokenCount);
}

@end
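LlamaContext forwards directly to the C API (`llama_state_load_file`, `llama_decode`, and so on) and exposes C++ types such as `std::vector<llama_token>` in its selectors, which is why callers must themselves be Objective-C++. A hedged sketch of the session-cache round trip these wrappers enable (path and buffer size are illustrative; note that the `nTokenCpacity:` spelling is part of the committed selector):

    std::vector<llama_token> toks(512);
    size_t nLoaded = 0;
    if ([ctx loadStateFile:@"/tmp/session.bin"      // illustrative path
                 tokensOut:toks.data()
             nTokenCpacity:toks.size()
            nTokenCountOut:&nLoaded]) {
        toks.resize(nLoaded);                       // tokens restored from disk
    }
    // ...generate...
    [ctx saveStateFile:@"/tmp/session.bin" tokens:toks.data() nTokenCount:toks.size()];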
objc/LlamaModel.mm  (new file, 70 lines)

@@ -0,0 +1,70 @@
#import <Foundation/Foundation.h>
#import "LlamaModel_Private.hpp"
#import "LlamaContext_Private.hpp"
#import "LlamaBatch_Private.hpp"
#import "GPTParams_Private.hpp"
#import "GPTSampler.h"
#import "ggml.h"
#import "../common/common.h"

@implementation LlamaChatMessage
@end

@implementation LlamaModel {
    llama_model *model;
}

- (instancetype)init:(llama_model *)l_model {
    self = [super init];
    if (self) {
        model = l_model;
    }
    return self;
}

- (LlamaContext *)context:(LlamaContextParams *)params {
    return nil;
}

- (BOOL)addBOSToken {
    return llama_add_bos_token(model);
}

- (BOOL)addEOSToken {
    return llama_add_eos_token(model);
}

- (LlamaToken)tokenBOS {
    return llama_token_bos(model);
}

- (int32_t)nCtxTrain {
    return llama_n_ctx_train(model);
}

- (NSString *)formatExample:(NSString *)tmpl {
    return [[NSString alloc] initWithCString:llama_chat_format_example(model, [tmpl cStringUsingEncoding:NSUTF8StringEncoding]).c_str()
                                    encoding:NSUTF8StringEncoding];
}

- (BOOL)hasEncoder {
    return llama_model_has_encoder(model);
}

- (llama_model *)cModel {
    return model;
}

- (BOOL)tokenIsEOG:(LlamaToken)token {
    return llama_token_is_eog(model, token);
}

- (LlamaToken)tokenEOT {
    return llama_token_eot(model);
}

- (LlamaToken)tokenEOS {
    return llama_token_eos(model);
}

@end
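LlamaModel is a one-to-one bridge to the model-level `llama_*` queries, and the session code below leans on a few of them. A condensed illustration (assumes `model` is a loaded LlamaModel; not part of the commit):

    if ([model addBOSToken]) {
        LlamaToken bos = [model tokenBOS];   // vocab expects prompts to start with BOS;
        // LlamaSession.mm below seeds an empty prompt with exactly this token
    }
    int32_t nCtxTrain = [model nCtxTrain];   // compared against the runtime n_ctx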
objc/LlamaObjC.mm  (new file, 2 lines)

@@ -0,0 +1,2 @@
#import "LlamaObjC.h"

objc/LlamaSession.mm  (new file, 906 lines)

@@ -0,0 +1,906 @@
#import <Foundation/Foundation.h>
#import "LlamaSession_Private.hpp"
#import "../../common/common.h"
#import "LlamaModel_Private.hpp"
#import "LlamaContext_Private.hpp"
#import "GPTSampler.h"
#import <OSLog/OSLog.h>
#import "ggml.h"
#import "GPTParams_Private.hpp"
#import "LlamaBatch_Private.hpp"

@implementation BlockingLineQueue {
    // Input queue and related synchronization
    NSMutableArray<NSString *> *inputQueue;
    NSCondition *inputCondition;

    // Output queue and related synchronization
    NSMutableArray<NSString *> *outputQueue;
    NSCondition *outputCondition;

    // Log queue
    NSMutableArray<NSString *> *log;
}

- (instancetype)init {
    if (self = [super init]) {
        inputQueue = [NSMutableArray new];
        outputQueue = [NSMutableArray new];
        log = [NSMutableArray new];
        inputCondition = [[NSCondition alloc] init];
        outputCondition = [[NSCondition alloc] init];
    }
    return self;
}

- (void)addInputLine:(NSString *)line {
    [inputCondition lock];
    [inputQueue addObject:line];
    [log addObject:line];
    [inputCondition signal]; // Notify that a new input line is available
    [inputCondition unlock];
}

- (NSString *)inputLine {
    [inputCondition lock];
    while ([inputQueue count] == 0) {
        [inputCondition wait];
    }
    NSString *line = [inputQueue objectAtIndex:0];
    [inputQueue removeObjectAtIndex:0];
    [inputCondition unlock];
    return line;
}

- (void)addOutputLine:(NSString *)line {
    [outputCondition lock];
    [outputQueue addObject:line];
    [log addObject:line];
    [outputCondition signal]; // Notify that a new output line is available
    [outputCondition unlock];
}

- (NSString *)outputLine {
    [outputCondition lock];
    while ([outputQueue count] == 0) {
        [outputCondition wait];
    }
    NSString *line = [outputQueue objectAtIndex:0];
    [outputQueue removeObjectAtIndex:0];
    [outputCondition unlock];
    return line;
}
@end

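// NOTE (illustrative sketch, not part of the committed file): BlockingLineQueue
// above is the handoff between a driver thread and the inference loop in
// -start: below. A typical pairing, with `session` assumed to be a configured
// LlamaSession:
//
//     BlockingLineQueue *queue = [[BlockingLineQueue alloc] init];
//     dispatch_async(dispatch_get_global_queue(QOS_CLASS_USER_INITIATED, 0), ^{
//         [session start:queue];              // blocks on -inputLine between turns
//     });
//     [queue addInputLine:@"Hello!"];         // producer side
//     NSString *reply = [queue outputLine];   // blocks until the model answers
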
@implementation LlamaSession {
    std::vector<llama_token> embd_inp;
    std::vector<llama_chat_msg> chat_msgs;
    GPTParams *params;
    GPTSampler *smpl;
    BOOL isInteracting;

    bool is_antiprompt;
    bool input_echo;
    bool display;
    bool need_to_save_session;

    int n_past;
    int n_remain;
    int n_consumed;
    int n_session_consumed;

    std::vector<int> input_tokens;
    std::vector<int> output_tokens;;
    std::ostringstream output_ss;
    std::stringstream last_output_ss;
    std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode

    std::vector<llama_token> embd;
    NSMutableString *pathSession;
    NSInteger ga_i;
    NSInteger ga_n;
    NSInteger ga_w;
    std::vector<llama_token> session_tokens;
    // tokenized antiprompts
    std::vector<std::vector<llama_token>> antiprompt_ids;
    BOOL need_insert_eot;
    int n_ctx;
}

- (NSString *)chat_add_and_format:(std::vector<llama_chat_msg> &) chat_msgs role:(const std::string &) role content:(const std::string &) content {
    llama_chat_msg new_msg{role, content};
    auto formatted = llama_chat_format_single([self.model cModel], [params params].chat_template, chat_msgs, new_msg, role == "user");
    chat_msgs.push_back({role, content});
    os_log_debug(OS_LOG_DEFAULT, "formatted: '%s'\n", formatted.c_str());
    return [NSString stringWithCString:formatted.c_str() encoding:NSUTF8StringEncoding];
}

static BOOL file_is_empty(NSString *path) {
    NSFileManager *manager = [NSFileManager defaultManager];
    if ([manager fileExistsAtPath:path]) {
        NSDictionary *attributes = [manager attributesOfItemAtPath:path error:nil];
        unsigned long long size = [attributes fileSize];
        if (attributes && size == 0) {
            return true;
        } else {
            return false;
        }
    }
    return true;
}

- (instancetype)initWithParams:(GPTParams *)params {
    self = [super init];

    self->params = params;
//    model = llama_init.model;
//    ctx = llama_init.context;
//
//    if model == nil {
//        LOG_ERR("%s: error: unable to load model\n", __func__);
//        return 1;
//    }
//
    os_log_info(OS_LOG_DEFAULT,
                "%s: llama threadpool init, n_threads = %d\n",
                __func__, params.cpuParams.nThreads);

    if (params.embedding) {
        os_log_error(OS_LOG_DEFAULT,
                     R"(************
please use the 'embedding' tool for embedding calculations
************)");
        abort();
    }

    if (params.nCtx != 0 && params.nCtx < 8) {
        os_log_info(OS_LOG_DEFAULT, "minimum context size is 8, using minimum size.");
        params.nCtx = 8;
    }

    if (params.ropeFreqBase != 0) {
        os_log_info(OS_LOG_DEFAULT, "changing RoPE frequency base to \(params.ropeFreqBase)");
    }

    if (params.ropeFreqScale != 0.0) {
        os_log_info(OS_LOG_DEFAULT, "scaling RoPE frequency by \(params.ropeFreqScale)");
    }

    llama_backend_init();
    llama_numa_init(ggml_numa_strategy(params.numaStrategy));
    auto llama_init = llama_init_from_gpt_params([params params]);

    auto tpp_batch = params.cpuParamsBatch.ggmlThreadpoolParams;
    auto tpp = params.cpuParams.ggmlThreadpoolParams;

    set_process_priority(ggml_sched_priority(params.cpuParams.priority));

    GGMLThreadpool *threadpool_batch;
    if (tpp != tpp_batch) {
        threadpool_batch = [tpp_batch threadpool];
        if (!threadpool_batch) {
            [NSException raise:@"batch threadpool create failed"
                        format:@"batch threadpool create failed"];
        }

        // Start the non-batch threadpool in the paused state
        tpp.paused = true;
    }

    GGMLThreadpool *threadpool = [tpp threadpool];
    if (!threadpool) {
        [NSException raise:@"threadpool create failed"
                    format:@"threadpool create failed"];
    }

    self.ctx = [[LlamaContext alloc] initWithContext:llama_init.context];
    [self.ctx attachThreadpool:threadpool threadpoolBatch:threadpool_batch];
    self.model = [[LlamaModel alloc] init:llama_init.model];
    const int n_ctx_train = [self.model nCtxTrain];
    n_ctx = [self.ctx nCtx];
//
    if (n_ctx > n_ctx_train) {
        os_log_info(OS_LOG_DEFAULT, "%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
    }

    // print chat template example in conversation mode
    if (params.conversation) {
        if (params.enableChatTemplate) {
            os_log_info(OS_LOG_DEFAULT, "%s: chat template example:\n%s\n", __func__,
                        [[self.model formatExample:params.chatTemplate] cStringUsingEncoding:NSUTF8StringEncoding]);
        } else {
            os_log_info(OS_LOG_DEFAULT, "%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
        }
    }
    // print system information
    @autoreleasepool {
        NSLog(@"%s", gpt_params_get_system_info([params params]).c_str());
    }

    pathSession = [[NSMutableString alloc] initWithString:params.pathPromptCache];

    NSFileManager *fileManager = [NSFileManager defaultManager];

    if ([pathSession length] != 0) {
        os_log_info(OS_LOG_DEFAULT, "%s: attempting to load saved session from '%s'\n", __func__, [pathSession cStringUsingEncoding:NSUTF8StringEncoding]);
        if (![fileManager fileExistsAtPath:pathSession]) {
            os_log_info(OS_LOG_DEFAULT, "%s: session file does not exist, will create.\n", __func__);
        } else if (file_is_empty(pathSession)) {
            os_log_info(OS_LOG_DEFAULT, "%s: The session file is empty. A new session will be initialized.\n", __func__);
        } else {
            // The file exists and is not empty
            session_tokens.resize(n_ctx);
            size_t n_token_count_out = 0;
            if (![self.ctx loadStateFile:pathSession tokensOut:session_tokens.data() nTokenCpacity:session_tokens.capacity() nTokenCountOut:&n_token_count_out]) {
                [NSException raise:@"SessionLoadFailure" format:@"%s: failed to load session file '%s'\n", __func__, [pathSession cStringUsingEncoding:NSUTF8StringEncoding]];
            }
            session_tokens.resize(n_token_count_out);
            os_log_info(OS_LOG_DEFAULT, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
        }
    }

    BOOL addBOS = [self.model addBOSToken];
    if (![self.model hasEncoder]) {
        GGML_ASSERT(![self.model addEOSToken]);
    }

    os_log_debug(OS_LOG_DEFAULT, "n_ctx: %d, add_bos: %d\n", n_ctx, addBOS);


    {
        auto prompt = (params.conversation && params.enableChatTemplate && params.prompt.length > 0)
            ? [self chat_add_and_format:chat_msgs role:"system" content:[params params].prompt] // format the system prompt in conversation mode
            : params.prompt;
        if (params.interactiveFirst || [params.prompt length] > 0 || session_tokens.empty()) {
            os_log_debug(OS_LOG_DEFAULT, "tokenize the prompt\n");
            embd_inp = [self.ctx tokenize:prompt addSpecial:true parseSpecial:true];
        } else {
            os_log_debug(OS_LOG_DEFAULT, "use session tokens\n");
            embd_inp = session_tokens;
        }

        os_log_debug(OS_LOG_DEFAULT, "prompt: \"%s\"\n", [prompt cStringUsingEncoding:NSUTF8StringEncoding]);
        os_log_debug(OS_LOG_DEFAULT, "tokens: %s\n", [self.ctx convertTokensToString:embd_inp].c_str());
    }

    // Should not run without any tokens
    if (embd_inp.empty()) {
        if (addBOS) {
            embd_inp.push_back([self.model tokenBOS]);
            // LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
        } else {
            [NSException raise:@"InputEmptyError" format:@"input is empty"];
        }
    }

    // Tokenize negative prompt
    if (embd_inp.size() > n_ctx - 4) {
        [NSException raise:@"PromptError" format:@"%s: prompt is too long (%d tokens, max %d)\n", __func__, (int)embd_inp.size(), n_ctx - 4];
    }

    // debug message about similarity of saved session, if applicable
    size_t n_matching_session_tokens = 0;
    if (!session_tokens.empty()) {
        for (llama_token id : session_tokens) {
            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
                break;
            }
            n_matching_session_tokens++;
        }
        if ([params.prompt length] == 0 && n_matching_session_tokens == embd_inp.size()) {
            // LOG_INF("%s: using full prompt from session file\n", __func__);
        } else if (n_matching_session_tokens >= embd_inp.size()) {
            // LOG_INF("%s: session file has exact match for prompt!\n", __func__);
        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
            // LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
            //         __func__, n_matching_session_tokens, embd_inp.size());
        } else {
            // LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
            //         __func__, n_matching_session_tokens, embd_inp.size());
        }

        // remove any "future" tokens that we might have inherited from the previous session
        llama_kv_cache_seq_rm([self.ctx cContext], -1, n_matching_session_tokens, -1);
    }
//
//    os_log_debug(OS_LOG_DEFAULT, "recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
//                 embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
//
    // if we will use the cache for the full prompt without reaching the end of the cache, force
    // reevaluation of the last token to recalculate the cached logits
    if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
        // os_log_debug(OS_LOG_DEFAULT, "recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);

        session_tokens.resize(embd_inp.size() - 1);
    }

    // number of tokens to keep when resetting context
    if (params.nKeep < 0 || params.nKeep > (int) embd_inp.size()) {
        params.nKeep = (int)embd_inp.size();
    } else {
        params.nKeep += addBOS; // always keep the BOS token
    }

    if (params.conversation) {
        params.interactiveFirst = true;
    }

    // enable interactive mode if interactive start is specified
    if (params.interactiveFirst) {
        params.interactive = true;
    }

    if (params.verbosePrompt) {
        // LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        // LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
            os_log_info(OS_LOG_DEFAULT, "%6d -> '%s'\n", embd_inp[i],
                        [[self.ctx tokenToPiece:embd_inp[i]] cStringUsingEncoding:NSUTF8StringEncoding]);
        }

        if (params.nKeep > addBOS) {
            // LOG_INF("%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.nKeep; i++) {
                os_log_debug(OS_LOG_DEFAULT, "%s",
                             [[self.ctx tokenToPiece:embd_inp[i]] cStringUsingEncoding:NSUTF8StringEncoding]);
            }
            // LOG("'\n");
        }
        // LOG_INF("\n");
    }
//
//    // ctrl+C handling
//    {
//#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
//        struct sigaction sigint_action;
//        sigint_action.sa_handler = sigint_handler;
//        sigemptyset (&sigint_action.sa_mask);
//        sigint_action.sa_flags = 0;
//        sigaction(SIGINT, &sigint_action, NULL);
//#elif defined (_WIN32)
//        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
//            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
//        };
//        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
//#endif
//    }
//
    if (params.interactive) {
        os_log_info(OS_LOG_DEFAULT, "%s: interactive mode on.\n", __func__);

        if ([params.antiPrompts count] > 0) {
            for (NSString *antiprompt in params.antiPrompts) {
                os_log_info(OS_LOG_DEFAULT, "Reverse prompt: '%s'\n", [antiprompt cStringUsingEncoding:NSUTF8StringEncoding]);
                if (params.verbosePrompt) {
                    auto tmp = [_ctx tokenize:antiprompt
                                   addSpecial:false
                                 parseSpecial:true];
                    for (int i = 0; i < (int) tmp.size(); i++) {
                        os_log_info(OS_LOG_DEFAULT, "%6d -> '%s'\n", tmp[i], [[self.ctx tokenToPiece:tmp[i]] cStringUsingEncoding:NSUTF8StringEncoding]);
                    }
                }
            }
        }

        if (params.inputPrefixBOS) {
            os_log_info(OS_LOG_DEFAULT, "Input prefix with BOS\n");
        }

        if ([params.inputPrefix length] > 0) {
            os_log_info(OS_LOG_DEFAULT, "Input prefix: '%s'\n", [params.inputPrefix cStringUsingEncoding:NSUTF8StringEncoding]);
            if (params.verbosePrompt) {
                auto tmp = [_ctx tokenize:params.inputPrefix addSpecial:true parseSpecial:true];
                for (int i = 0; i < (int) tmp.size(); i++) {
                    os_log_info(OS_LOG_DEFAULT, "%6d -> '%s'\n",
                                tmp[i], [[self.ctx tokenToPiece:tmp[i]] cStringUsingEncoding:NSUTF8StringEncoding]);
                }
            }
        }

        if ([params.inputSuffix length] > 0) {
            os_log_info(OS_LOG_DEFAULT, "Input suffix: '%s'\n", [params.inputSuffix cStringUsingEncoding:NSUTF8StringEncoding]);
            if (params.verbosePrompt) {
                auto tmp = [_ctx tokenize:params.inputSuffix addSpecial:false parseSpecial:true];
                for (int i = 0; i < (int) tmp.size(); i++) {
                    os_log_info(OS_LOG_DEFAULT, "%6d -> '%s'\n",
                                tmp[i], [[self.ctx tokenToPiece:tmp[i]] cStringUsingEncoding:NSUTF8StringEncoding]);
                }
            }
        }
    }

    smpl = [[GPTSampler alloc] init:_model gptSamplerParams:[params samplerParams]];
    if (!smpl) {
        [NSException raise:@"SamplingFailure" format:@"failed to initialize sampling subsystem"];
    }

    os_log_info(OS_LOG_DEFAULT, "sampler seed: %u\n", [smpl seed]);
    // LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
    // LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
    //
    // LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    //
    // group-attention state
    // number of grouped KV tokens so far (used only if params.grp_attn_n > 1)

    ga_n = params.grpAttnN;
    ga_w = params.grpAttnW;

    if (ga_n != 1) {
        GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive");                         // NOLINT
        GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
        //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
        //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
        os_log_info(OS_LOG_DEFAULT, "self-extend: n_ctx_train = %d, grp_attn_n = %ld, grp_attn_w = %ld\n", n_ctx_train, static_cast<long>(ga_n), static_cast<long>(ga_w));
    }

    if (params.interactive) {
        const char * control_message;
        if (params.multilineInput) {
            control_message = " - To return control to the AI, end your input with '\\'.\n"
                              " - To return control without starting a new line, end your input with '/'.\n";
        } else {
            control_message = " - Press Return to return control to the AI.\n"
                              " - To return control without starting a new line, end your input with '/'.\n"
                              " - If you want to submit another line, end your input with '\\'.\n";
        }

        isInteracting = params.interactiveFirst;
    }

    is_antiprompt = false;
    input_echo = true;
    display = true;
    need_to_save_session = [pathSession length] > 0 && n_matching_session_tokens < embd_inp.size();
    n_remain = params.nPredict;

//    // the first thing we will do is to output the prompt, so set color accordingly
//    console::set_display(console::prompt);
//    display = params.display_prompt;
//

    antiprompt_ids.reserve([params.antiPrompts count]);
    for (NSString *antiprompt in params.antiPrompts) {
        antiprompt_ids.emplace_back([self.ctx tokenize:antiprompt addSpecial:false parseSpecial:true]);
    }

    if ([self.model hasEncoder]) {
        int enc_input_size = embd_inp.size();
        llama_token * enc_input_buf = embd_inp.data();

        if ([_ctx encode:llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0)]) {
            [NSException raise:@"EvalFailure" format:@"failed to eval"];
        }

        llama_token decoder_start_token_id = llama_model_decoder_start_token([self.model cModel]);
        if (decoder_start_token_id == -1) {
            decoder_start_token_id = [self.model tokenBOS];
        }

        embd_inp.clear();
        embd_inp.push_back(decoder_start_token_id);
    }
    return self;
}

- (void)start:(BlockingLineQueue *)queue {
    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
        if (!embd.empty()) {
            // Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via
            // --prompt or --file which uses the same value.
            int max_embd_size = n_ctx - 4;

            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
            if ((int) embd.size() > max_embd_size) {
                const int skipped_tokens = (int) embd.size() - max_embd_size;
                embd.resize(max_embd_size);

                // console::set_display(console::error);
                os_log_error(OS_LOG_DEFAULT, "<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                // console::set_display(console::reset);
            }

            if (params.grpAttnN == 1) {
                // infinite text generation via context shifting
                // if we run out of context:
                // - take the n_keep first tokens from the original prompt (via n_past)
                // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches

                if (n_past + (int) embd.size() >= [_ctx nCtx]) {
                    if (!params.ctxShift) {
                        os_log_debug(OS_LOG_DEFAULT, "\n\n%s: context full and context shift is disabled => stopping\n", __func__);
                        break;
                    } else {
                        if (params.nPredict == -2) {
                            os_log_debug(OS_LOG_DEFAULT, "\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.nPredict);
                            break;
                        }

                        const int n_left    = n_past - params.nKeep;
                        const int n_discard = n_left/2;

                        os_log_debug(OS_LOG_DEFAULT, "context full, swapping: n_past = %d, n_left = %d, n_ctx = %lu, n_keep = %d, n_discard = %d\n",
                                     n_past, n_left, static_cast<unsigned long>([_ctx nCtx]), params.nKeep, n_discard);

                        llama_kv_cache_seq_rm ([self.ctx cContext], 0, params.nKeep            , params.nKeep + n_discard);
                        llama_kv_cache_seq_add([self.ctx cContext], 0, params.nKeep + n_discard, n_past, -n_discard);

                        n_past -= n_discard;

                        os_log_debug(OS_LOG_DEFAULT, "after swap: n_past = %d\n", n_past);

                        os_log_debug(OS_LOG_DEFAULT, "embd: %s\n", [self.ctx convertTokensToString:embd].c_str());

                        os_log_debug(OS_LOG_DEFAULT, "clear session path\n");
                        [pathSession setString:@""];
                    }
                }
            } else {
                // context extension via Self-Extend
                while (n_past >= ga_i + ga_w) {
                    const int ib = (ga_n*ga_i)/ga_w;
                    const int bd = (ga_w/ga_n)*(ga_n - 1);
                    const int dd = (ga_w/ga_n) - ib*bd - ga_w;

                    os_log_debug(OS_LOG_DEFAULT, "\n");
                    os_log_debug(OS_LOG_DEFAULT, "shift: [%6ld, %6d] + %6d -> [%6ld, %6d]\n", static_cast<long>(ga_i), n_past, ib*bd, static_cast<long>(ga_i + ib*bd), n_past + ib*bd);
                    os_log_debug(OS_LOG_DEFAULT, "div:   [%6ld, %6ld] / %6ld -> [%6ld, %6ld]\n", static_cast<long>(ga_i + ib*bd), static_cast<long>(ga_i + ib*bd + ga_w), static_cast<long>(ga_n), static_cast<long>((ga_i + ib*bd)/ga_n), static_cast<long>((ga_i + ib*bd + ga_w)/ga_n));
                    os_log_debug(OS_LOG_DEFAULT, "shift: [%6ld, %6d] + %6d -> [%6ld, %6d]\n", static_cast<long>(ga_i + ib*bd + ga_w), n_past + ib*bd, dd, static_cast<long>(ga_i + ib*bd + ga_w + dd), n_past + ib*bd + dd);

                    [self.ctx kvCacheSeqAdd:0 p0:ga_i p1:n_past delta:ib*bd];
                    [self.ctx kvCacheSeqDiv:0 p0:ga_i + ib*bd p1:ga_i + ib*bd + ga_w delta:ga_n];
                    [self.ctx kvCacheSeqAdd:0 p0:ga_i + ib*bd + ga_w p1:n_past + ib*bd delta:dd];

                    n_past -= bd;

                    ga_i += ga_w/ga_n;

                    os_log_debug(OS_LOG_DEFAULT, "\nn_past_old = %d, n_past = %d, ga_i = %ld\n\n", n_past + bd, n_past, static_cast<long>(ga_i));
                }
            }

            // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
            if (n_session_consumed < (int) session_tokens.size()) {
                size_t i = 0;
                for ( ; i < embd.size(); i++) {
                    if (embd[i] != session_tokens[n_session_consumed]) {
                        session_tokens.resize(n_session_consumed);
                        break;
                    }

                    n_past++;
                    n_session_consumed++;

                    if (n_session_consumed >= (int) session_tokens.size()) {
                        ++i;
                        break;
                    }
                }
                if (i > 0) {
                    embd.erase(embd.begin(), embd.begin() + i);
                }
            }

            for (int i = 0; i < (int) embd.size(); i += params.nBatch) {
                int n_eval = (int) embd.size() - i;
                if (n_eval > params.nBatch) {
                    n_eval = params.nBatch;
                }

                os_log_debug(OS_LOG_DEFAULT, "eval: %s\n", [self.ctx convertTokensToString:embd].c_str());

                if ([self.ctx decode:[[LlamaBatch alloc] initWithBatch:llama_batch_get_one(&embd[i], n_eval, n_past, 0)]]) {
                    [NSException raise:@"EvalFailure" format:@"failed to eval"];
                }

                n_past += n_eval;

                os_log_debug(OS_LOG_DEFAULT, "n_past = %d\n", n_past);
                // Display total tokens alongside total time
                if (params.nPrint > 0 && n_past % params.nPrint == 0) {
                    os_log_debug(OS_LOG_DEFAULT, "\n\033[31mTokens consumed so far = %d / %lu \033[0m\n", n_past, static_cast<unsigned long>([self.ctx nCtx]));
                }
            }

            if (!embd.empty() && [pathSession length] > 0) {
                session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
                n_session_consumed = session_tokens.size();
            }
        }

        embd.clear();

        if ((int) embd_inp.size() <= n_consumed && !isInteracting) {
            // optionally save the session on first sample (for faster prompt loading next time)
            if ([pathSession length] > 0 && need_to_save_session && !params.promptCacheRO) {
                need_to_save_session = false;
                [self.ctx saveStateFile:pathSession tokens:session_tokens.data() nTokenCount:session_tokens.size()];
                // llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());

                os_log_debug(OS_LOG_DEFAULT, "saved session to %s\n", [pathSession cStringUsingEncoding:NSUTF8StringEncoding]);
            }

            const llama_token idToken = [smpl sample:self.ctx index:-1];

            [smpl accept:idToken acceptGrammar:true];

            // os_log_debug(OS_LOG_DEFAULT, "last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());

            embd.push_back(idToken);

            // echo this to console
            input_echo = true;

            // decrement remaining sampling budget
            --n_remain;

            os_log_debug(OS_LOG_DEFAULT, "n_remain: %d\n", n_remain);
        } else {
|
||||
// some user input remains from prompt or interaction, forward it to processing
|
||||
os_log_debug(OS_LOG_DEFAULT, "embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
|
||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||
// for the prompt, we don't apply grammar rules
|
||||
[smpl accept:embd_inp[n_consumed] acceptGrammar:false];
|
||||
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.nBatch) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// display text
|
||||
if (input_echo && display) {
|
||||
// std::cout<< "DISPLAYING TEXT" << std::endl;
|
||||
|
||||
for (auto idToken : embd) {
|
||||
NSString *token_str = [self.ctx tokenToPiece:idToken special:params.special];
|
||||
|
||||
// Console/Stream Output
|
||||
os_log_info(OS_LOG_DEFAULT, "%s", [token_str cStringUsingEncoding:NSUTF8StringEncoding]);
|
||||
|
||||
// Record Displayed Tokens To Log
|
||||
// Note: Generated tokens are created one by one hence this check
|
||||
if (embd.size() > 1) {
|
||||
// Incoming Requested Tokens
|
||||
input_tokens.push_back(idToken);
|
||||
|
||||
} else {
|
||||
// Outgoing Generated Tokens
|
||||
output_tokens.push_back(idToken);
|
||||
output_ss << [token_str cStringUsingEncoding:NSUTF8StringEncoding];
|
||||
last_output_ss << [token_str cStringUsingEncoding:NSUTF8StringEncoding];
|
||||
}
|
||||
|
||||
}
|
||||
if (!last_output_ss.str().empty()) {
|
||||
// queue->addOutputLine(last_output_ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
// reset color to default if there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
if (!last_output_ss.str().empty()) {
|
||||
// queue->addOutputLine(last_output_ss.str());
|
||||
}
|
||||
// console::set_display(console::reset);
|
||||
display = true;
|
||||
}
|
||||
|
||||
// if not currently processing queued inputs;
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
// check for reverse prompt in the last n_prev tokens
|
||||
if ([params.antiPrompts count] > 0) {
|
||||
const int n_prev = 32;
|
||||
NSString *last_output = [smpl previousString:self.ctx n:n_prev];
|
||||
|
||||
is_antiprompt = false;
|
||||
// Check if each of the reverse prompts appears at the end of the output.
|
||||
// If we're not running interactively, the reverse prompt might be tokenized with some following characters
|
||||
// so we'll compensate for that by widening the search window a bit.
|
||||
for (NSString *antiprompt in params.antiPrompts) {
|
||||
size_t extra_padding = params.interactive ? 0 : 2;
|
||||
size_t search_start_pos = [last_output length] > static_cast<size_t>([antiprompt length] + extra_padding)
|
||||
? [last_output length] - static_cast<size_t>([antiprompt length] + extra_padding)
|
||||
: 0;
|
||||
|
||||
// TODO: Check if correct
|
||||
if ([last_output rangeOfString:antiprompt options:0 range:NSMakeRange(search_start_pos, last_output.length - search_start_pos)].location != NSNotFound) {
|
||||
if (params.interactive) {
|
||||
isInteracting = true;
|
||||
}
|
||||
is_antiprompt = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// check for reverse prompt using special tokens
|
||||
llama_token last_token = [smpl last];
|
||||
for (std::vector<llama_token> ids : antiprompt_ids) {
|
||||
if (ids.size() == 1 && last_token == ids[0]) {
|
||||
if (params.interactive) {
|
||||
isInteracting = true;
|
||||
}
|
||||
is_antiprompt = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_antiprompt) {
|
||||
os_log_debug(OS_LOG_DEFAULT, "found antiprompt: %s\n", [last_output cStringUsingEncoding:NSUTF8StringEncoding]);
|
||||
}
|
||||
}
|
||||
|
||||
// deal with end of generation tokens in interactive mode
|
||||
|
||||
if ([self.model tokenIsEOG:[smpl last]]) {
|
||||
os_log_debug(OS_LOG_DEFAULT, "found an EOG token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
if ([[params antiPrompts] count] > 0) {
|
||||
// tokenize and inject first reverse prompt
|
||||
|
||||
const auto first_antiprompt = [self.ctx tokenize:params.antiPrompts[0] addSpecial:false parseSpecial:true];
|
||||
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
|
||||
is_antiprompt = true;
|
||||
}
|
||||
|
||||
if (params.enableChatTemplate) {
|
||||
[self chat_add_and_format:chat_msgs
|
||||
role:"assistant"
|
||||
content:assistant_ss.str()];
|
||||
}
|
||||
isInteracting = true;
|
||||
// LOG("\n");
|
||||
}
|
||||
}
|
||||
|
||||
// if current token is not EOG, we add it to current assistant message
|
||||
if (params.conversation) {
|
||||
const auto idToken = [smpl last];
|
||||
assistant_ss << [[self.ctx tokenToPiece:idToken special:false] cStringUsingEncoding:NSUTF8StringEncoding];
|
||||
}
|
||||
|
||||
if (n_past > 0 && isInteracting) {
|
||||
os_log_debug(OS_LOG_DEFAULT, "waiting for user input\n");
|
||||
|
||||
if (params.conversation) {
|
||||
// osLog_("\n> ");
|
||||
}
|
||||
|
||||
if (params.inputPrefixBOS) {
|
||||
os_log_debug(OS_LOG_DEFAULT, "adding input prefix BOS token\n");
|
||||
embd_inp.push_back([self.model tokenBOS]);
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
if ([params.inputPrefix length] > 0 && !params.conversation) {
|
||||
os_log_debug(OS_LOG_DEFAULT, "appending input prefix: '%s'\n", [params.inputPrefix cStringUsingEncoding:NSUTF8StringEncoding]);
|
||||
os_log_info(OS_LOG_DEFAULT, "%s", [params.inputPrefix cStringUsingEncoding:NSUTF8StringEncoding]);
|
||||
}
|
||||
|
||||
// color user input only
|
||||
// console::set_display(console::user_input);
|
||||
display = params.displayPrompt;
|
||||
|
||||
std::string line;
|
||||
// bool another_line = true;
|
||||
static int read_one = 0;
|
||||
// if (!read_one) {
|
||||
// do {
|
||||
// another_line = false;// console::readline(line, params.multiline_input);
|
||||
// buffer += "What is the weather in New York?";//line;
|
||||
// } while (another_line);
|
||||
// read_one++;
|
||||
// }
|
||||
// else {
|
||||
if (!last_output_ss.str().empty()) {
|
||||
auto str = last_output_ss.str();
|
||||
last_output_ss.str("");
|
||||
[queue addOutputLine:[NSString stringWithCString:str.c_str() encoding:NSUTF8StringEncoding]];
|
||||
}
|
||||
|
||||
buffer = [[queue inputLine] cStringUsingEncoding:NSUTF8StringEncoding];
|
||||
// do {
|
||||
// another_line = console::readline(line, params.multiline_input);
|
||||
// buffer += line;
|
||||
// } while (another_line);
|
||||
// }
|
||||
// done taking input, reset color
|
||||
// console::set_display(console::reset);
|
||||
display = true;
|
||||
|
||||
// Add tokens to embd only if the input buffer is non-empty
|
||||
// Entering a empty line lets the user pass control back
|
||||
if (buffer.length() > 1) {
|
||||
// append input suffix if any
|
||||
if ([params.inputSuffix length] > 0 && !params.conversation) {
|
||||
os_log_debug(OS_LOG_DEFAULT, "appending input suffix: '%s'\n", [params.inputSuffix cStringUsingEncoding:NSUTF8StringEncoding]);
|
||||
os_log_info(OS_LOG_DEFAULT, "%s", [params.inputSuffix cStringUsingEncoding:NSUTF8StringEncoding]);
|
||||
}
|
||||
|
||||
os_log_debug(OS_LOG_DEFAULT, "buffer: '%s'\n", buffer.c_str());
|
||||
|
||||
const size_t original_size = embd_inp.size();
|
||||
|
||||
if (params.escapeSequences) {
|
||||
string_process_escapes(buffer);
|
||||
}
|
||||
|
||||
bool format_chat = params.conversation && params.enableChatTemplate;
|
||||
std::string user_inp = format_chat
|
||||
? [[self chat_add_and_format:chat_msgs role:"user" content:std::move(buffer)] cStringUsingEncoding:NSUTF8StringEncoding]
|
||||
: std::move(buffer);
|
||||
// TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
|
||||
const auto line_pfx = [self.ctx tokenize:params.inputPrefix addSpecial:false parseSpecial:true];
|
||||
const auto line_inp = [self.ctx tokenize:[NSString stringWithCString:user_inp.c_str()
|
||||
encoding:NSUTF8StringEncoding]
|
||||
addSpecial:false
|
||||
parseSpecial:format_chat];
|
||||
const auto line_sfx = [self.ctx tokenize:params.inputSuffix
|
||||
addSpecial:false
|
||||
parseSpecial:true];
|
||||
|
||||
os_log_debug(OS_LOG_DEFAULT, "input tokens: %s\n", [self.ctx convertTokensToString:line_inp].c_str());
|
||||
|
||||
// if user stop generation mid-way, we must add EOT to finish model's last response
|
||||
if (need_insert_eot && format_chat) {
|
||||
llama_token eot = [self.model tokenEOT];
|
||||
embd_inp.push_back(eot == -1 ? [self.model tokenEOS] : eot);
|
||||
need_insert_eot = false;
|
||||
}
|
||||
|
||||
embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
|
||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||
embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
|
||||
|
||||
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
||||
const llama_token token = embd_inp[i];
|
||||
output_tokens.push_back(token);
|
||||
output_ss << [[self.ctx tokenToPiece:token] cStringUsingEncoding:NSUTF8StringEncoding];
|
||||
}
|
||||
|
||||
// reset assistant message
|
||||
assistant_ss.str("");
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
os_log_debug(OS_LOG_DEFAULT, "n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
os_log_debug(OS_LOG_DEFAULT, "empty line, passing control back\n");
|
||||
}
|
||||
|
||||
input_echo = false; // do not echo this again
|
||||
}
|
||||
|
||||
if (n_past > 0) {
|
||||
if (isInteracting) {
|
||||
[smpl reset];
|
||||
}
|
||||
isInteracting = false;
|
||||
}
|
||||
}
|
||||
|
||||
// end of generation
|
||||
if (!embd.empty() && [self.model tokenIsEOG:embd.back()] && !(params.interactive)) {
|
||||
os_log_info(OS_LOG_DEFAULT, " [end of text]\n");
|
||||
break;
|
||||
}
|
||||
|
||||
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
|
||||
// We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
|
||||
if (params.interactive && n_remain <= 0 && params.nPredict >= 0) {
|
||||
n_remain = params.nPredict;
|
||||
isInteracting = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@end
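The start: loop above is deliberately blocking: it pulls user turns from the BlockingLineQueue with inputLine and publishes finished responses with addOutputLine:, so the caller owns the threading. A minimal Swift sketch of driving it, assuming the module imports as LlamaObjC and using a hypothetical model path:

import Foundation
import LlamaObjC // assumed module name

let params = GPTParams()
params.modelPath = "/path/to/model.gguf" // hypothetical path
params.interactive = true

let session = LlamaSession(params: params)
let queue = BlockingLineQueue()

// start(_:) blocks its thread, so run the generation loop in the background
Thread.detachNewThread {
    session.start(queue)
}

queue.addInputLine("Hello!")
print(queue.outputLine()) // blocks until the model finishes the reply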
264
objc/include/GPTParams.h
Normal file
@@ -0,0 +1,264 @@
#ifndef GPTParams_h
#define GPTParams_h

#import <Foundation/Foundation.h>

@class LlamaModelParams;
@class LlamaContextParams;
@class GGMLThreadpool;

// Define the ggml_sched_priority enum
typedef NS_ENUM(NSInteger, GGMLSchedPriority) {
    GGMLSchedPriorityNormal   = 0, // Normal priority
    GGMLSchedPriorityMedium   = 1, // Medium priority
    GGMLSchedPriorityHigh     = 2, // High priority
    GGMLSchedPriorityRealtime = 3  // Realtime priority
};

@interface GGMLThreadpoolParams : NSObject

@property (nonatomic, assign) int nThreads;
@property (nonatomic, assign) GGMLSchedPriority priority;
@property (nonatomic, assign) uint32_t poll;
@property (nonatomic, assign) BOOL strictCPU;
@property (nonatomic, assign) BOOL paused;

// Custom access methods for the cpumask array
- (BOOL)getCpuMaskAtIndex:(NSUInteger)index;
- (void)setCpuMask:(BOOL)value atIndex:(NSUInteger)index;
- (GGMLThreadpool *)threadpool;

@end

@interface GGMLThreadpool : NSObject
@end

@interface CPUParams : NSObject

// Properties
@property (nonatomic, assign) int nThreads;
@property (nonatomic, assign) BOOL maskValid;
@property (nonatomic, assign) GGMLSchedPriority priority;
@property (nonatomic, assign) BOOL strictCPU;
@property (nonatomic, assign) uint32_t poll;

// Custom methods to access or manipulate the cpumask array
- (BOOL)getCpuMaskAtIndex:(NSUInteger)index;
- (void)setCpuMask:(BOOL)value atIndex:(NSUInteger)index;
- (GGMLThreadpoolParams *)ggmlThreadpoolParams;

@end

@interface GPTSamplerParams : NSObject

// Properties corresponding to C++ struct fields
@property (nonatomic, assign) uint32_t seed;
@property (nonatomic, assign) int32_t nPrev;
@property (nonatomic, assign) int32_t nProbs;
@property (nonatomic, assign) int32_t minKeep;
@property (nonatomic, assign) int32_t topK;
@property (nonatomic, assign) float topP;
@property (nonatomic, assign) float minP;
@property (nonatomic, assign) float tfsZ;
@property (nonatomic, assign) float typP;
@property (nonatomic, assign) float temp;
@property (nonatomic, assign) float dynatempRange;
@property (nonatomic, assign) float dynatempExponent;
@property (nonatomic, assign) int32_t penaltyLastN;
@property (nonatomic, assign) float penaltyRepeat;
@property (nonatomic, assign) float penaltyFreq;
@property (nonatomic, assign) float penaltyPresent;
@property (nonatomic, assign) int32_t mirostat;
@property (nonatomic, assign) float mirostatTau;
@property (nonatomic, assign) float mirostatEta;
@property (nonatomic, assign) BOOL penalizeNl;
@property (nonatomic, assign) BOOL ignoreEos;
@property (nonatomic, assign) BOOL noPerf;

// Arrays and Strings
@property (nonatomic, strong) NSArray<NSNumber *> *samplers;  // Samplers mapped to NSArray of NSNumber (for enums)
@property (nonatomic, copy) NSString *grammar;                // Grammar as NSString
@property (nonatomic, strong) NSArray<NSNumber *> *logitBias; // Logit biases mapped to NSArray of NSNumber

// Method to print the parameters into a string
- (NSString *)print;

@end

@interface GPTParams : NSObject

@property (nonatomic, assign) int32_t nPredict;
@property (nonatomic, assign) NSInteger nCtx;
@property (nonatomic, assign) int32_t nBatch;
@property (nonatomic, assign) int32_t nUBatch;
@property (nonatomic, assign) int32_t nKeep;
@property (nonatomic, assign) int32_t nDraft;
@property (nonatomic, assign) int32_t nChunks;
@property (nonatomic, assign) int32_t nParallel;
@property (nonatomic, assign) int32_t nSequences;
@property (nonatomic, assign) float pSplit;
@property (nonatomic, assign) int32_t nGpuLayers;
@property (nonatomic, assign) int32_t nGpuLayersDraft;
@property (nonatomic, assign) int32_t mainGpu;
@property (nonatomic, strong) NSMutableArray<NSNumber *> *tensorSplit; // Fixed-size array, stays the same
@property (nonatomic, assign) int32_t grpAttnN;
@property (nonatomic, assign) int32_t grpAttnW;
@property (nonatomic, assign) int32_t nPrint;
@property (nonatomic, assign) float ropeFreqBase;
@property (nonatomic, assign) float ropeFreqScale;
@property (nonatomic, assign) float yarnExtFactor;
@property (nonatomic, assign) float yarnAttnFactor;
@property (nonatomic, assign) float yarnBetaFast;
@property (nonatomic, assign) float yarnBetaSlow;
@property (nonatomic, assign) int32_t yarnOrigCtx;
@property (nonatomic, assign) float defragThold;

// The C++ struct "cpu_params" is mirrored by the CPUParams Objective-C class
@property (nonatomic, strong) CPUParams *cpuParams;
@property (nonatomic, strong) CPUParams *cpuParamsBatch;
@property (nonatomic, strong) CPUParams *draftCpuParams;
@property (nonatomic, strong) CPUParams *draftCpuParamsBatch;

// Callbacks (blocks in Objective-C)
@property (nonatomic, copy) void (^cbEval)(void *);
@property (nonatomic, assign) void *cbEvalUserData;

@property (nonatomic, assign) NSInteger numaStrategy; // Enumerations

@property (nonatomic, assign) NSInteger splitMode;
@property (nonatomic, assign) NSInteger ropeScalingType;
@property (nonatomic, assign) NSInteger poolingType;
@property (nonatomic, assign) NSInteger attentionType;

// Sampler parameters, converted to an Objective-C object
@property (nonatomic, strong) GPTSamplerParams *samplerParams;

@property (nonatomic, copy) NSString *modelPath;
@property (nonatomic, copy) NSString *modelDraft;
@property (nonatomic, copy) NSString *modelAlias;
@property (nonatomic, copy) NSString *modelURL;
@property (nonatomic, copy) NSString *hfToken;
@property (nonatomic, copy) NSString *hfRepo;
@property (nonatomic, copy) NSString *hfFile;
@property (nonatomic, copy) NSString *prompt;
@property (nonatomic, copy) NSString *promptFile;
@property (nonatomic, copy) NSString *pathPromptCache;
@property (nonatomic, copy) NSString *inputPrefix;
@property (nonatomic, copy) NSString *inputSuffix;
@property (nonatomic, copy) NSString *logdir;
@property (nonatomic, copy) NSString *lookupCacheStatic;
@property (nonatomic, copy) NSString *lookupCacheDynamic;
@property (nonatomic, copy) NSString *logitsFile;
@property (nonatomic, copy) NSString *rpcServers;

// Arrays in Objective-C are represented with `NSArray`
@property (nonatomic, strong) NSArray<NSString *> *inputFiles;
@property (nonatomic, strong) NSArray<NSString *> *antiPrompts;
@property (nonatomic, strong) NSArray *kvOverrides;

// Boolean values (in Objective-C, use `BOOL`)
@property (nonatomic, assign) BOOL loraInitWithoutApply;
@property (nonatomic, strong) NSArray *loraAdapters;
@property (nonatomic, strong) NSArray *controlVectors;

// Control params
@property (nonatomic, assign) int32_t verbosity;
@property (nonatomic, assign) int32_t controlVectorLayerStart;
@property (nonatomic, assign) int32_t controlVectorLayerEnd;

// Performance and configuration params
@property (nonatomic, assign) int32_t pplStride;
@property (nonatomic, assign) int32_t pplOutputType;

@property (nonatomic, assign) BOOL hellaswag;
@property (nonatomic, assign) size_t hellaswagTasks;
@property (nonatomic, assign) BOOL winogrande;
@property (nonatomic, assign) size_t winograndeTasks;
@property (nonatomic, assign) BOOL multipleChoice;
@property (nonatomic, assign) size_t multipleChoiceTasks;
@property (nonatomic, assign) BOOL klDivergence;

@property (nonatomic, assign) BOOL usage;
@property (nonatomic, assign) BOOL useColor;
@property (nonatomic, assign) BOOL special;
@property (nonatomic, assign) BOOL interactive;
@property (nonatomic, assign) BOOL interactiveFirst;
@property (nonatomic, assign) BOOL conversation;
@property (nonatomic, assign) BOOL promptCacheAll;
@property (nonatomic, assign) BOOL promptCacheRO;

@property (nonatomic, assign) BOOL escapeSequences;
@property (nonatomic, assign) BOOL multilineInput;
@property (nonatomic, assign) BOOL simpleIO;
@property (nonatomic, assign) BOOL continuousBatching;
@property (nonatomic, assign) BOOL flashAttention;
@property (nonatomic, assign) BOOL noPerformanceMetrics;
@property (nonatomic, assign) BOOL contextShift;

// Server and I/O settings
@property (nonatomic, assign) int32_t port;
@property (nonatomic, assign) int32_t timeoutRead;
@property (nonatomic, assign) int32_t timeoutWrite;
@property (nonatomic, assign) int32_t httpThreads;

@property (nonatomic, copy) NSString *hostname;
@property (nonatomic, copy) NSString *publicPath;
@property (nonatomic, copy) NSString *chatTemplate;
@property (nonatomic, copy) NSString *systemPrompt;
@property (nonatomic, assign) BOOL enableChatTemplate;

@property (nonatomic, strong) NSArray<NSString *> *apiKeys;

@property (nonatomic, copy) NSString *sslFileKey;
@property (nonatomic, copy) NSString *sslFileCert;

@property (nonatomic, assign) BOOL endpointSlots;
@property (nonatomic, assign) BOOL endpointMetrics;
@property (nonatomic, assign) BOOL logJSON;

@property (nonatomic, copy) NSString *slotSavePath;
@property (nonatomic, assign) float slotPromptSimilarity;

// batched-bench params
@property (nonatomic, assign) BOOL isPPShared;
@property (nonatomic, strong) NSArray<NSNumber *> *nPP;
@property (nonatomic, strong) NSArray<NSNumber *> *nTG;
@property (nonatomic, strong) NSArray<NSNumber *> *nPL;

// retrieval params
@property (nonatomic, strong) NSArray<NSString *> *contextFiles;
@property (nonatomic, assign) int32_t chunkSize;
@property (nonatomic, copy) NSString *chunkSeparator;

// passkey params
@property (nonatomic, assign) int32_t nJunk;
@property (nonatomic, assign) int32_t iPos;

// imatrix params
@property (nonatomic, copy) NSString *outFile;
@property (nonatomic, assign) int32_t nOutFreq;
@property (nonatomic, assign) int32_t nSaveFreq;
@property (nonatomic, assign) int32_t iChunk;
@property (nonatomic, assign) BOOL processOutput;
@property (nonatomic, assign) BOOL computePPL;

// cvector-generator params
@property (nonatomic, assign) int nPCABatch;
@property (nonatomic, assign) int nPCAIterations;
@property (nonatomic, assign) int cvectorDimreMethod;
@property (nonatomic, copy) NSString *cvectorOutfile;
@property (nonatomic, copy) NSString *cvectorPositiveFile;
@property (nonatomic, copy) NSString *cvectorNegativeFile;

@property (nonatomic, assign) BOOL spmInfill;
@property (nonatomic, copy) NSString *loraOutfile;
@property (nonatomic, assign) BOOL embedding;
@property (nonatomic, assign) BOOL verbosePrompt;          // print prompt tokens before generation
@property (nonatomic, assign) BOOL batchedBenchOutputJSONL;
@property (nonatomic, assign) BOOL inputPrefixBOS;         // prefix BOS to user inputs, preceding input_prefix
@property (nonatomic, assign) BOOL ctxShift;               // context shift on infinite text generation
@property (nonatomic, assign) BOOL displayPrompt;          // print prompt before generation

- (LlamaModelParams *)llamaModelParams;
- (LlamaContextParams *)llamaContextParams;

@end

#endif /* GPTParams_h */
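GPTParams mirrors llama.cpp's gpt_params field for field, so configuring a session reads much like setting CLI flags. A short Swift sketch with illustrative values (only the property names come from the header above):

let params = GPTParams()
params.modelPath = "/path/to/model.gguf" // hypothetical path
params.nCtx = 4096                 // context window
params.nBatch = 512                // tokens decoded per llama_decode call
params.nPredict = -1               // -1: generate until EOG or an antiprompt
params.ctxShift = true             // shift the KV cache instead of stopping when full
params.antiPrompts = ["User:"]     // reverse prompts that hand control back
params.samplerParams.temp = 0.7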
25
objc/include/GPTParams_Private.hpp
Normal file
@@ -0,0 +1,25 @@
#ifndef GPTParams_Private_hpp
#define GPTParams_Private_hpp

#import "GPTParams.h"
#import "ggml.h"
#import "../../common/common.h"

@interface GGMLThreadpool()

- (ggml_threadpool *)threadpool;

@end

@interface GPTParams()

- (gpt_params&)params;

@end

@interface GPTSamplerParams()

- (gpt_sampler_params&)cParams;

@end
#endif /* GPTParams_Private_hpp */
55
objc/include/GPTSampler.h
Normal file
@@ -0,0 +1,55 @@
#ifndef GPTSampler_h
#define GPTSampler_h

#import <Foundation/Foundation.h>

@class LlamaModel;
@class GPTSamplerParams;
@class LlamaContext;
typedef int32_t LlamaToken;

@interface GPTSampler : NSObject

- (instancetype)init:(LlamaModel *)model gptSamplerParams:(GPTSamplerParams *)gptSamplerParams;
- (uint32_t)seed;

// extended sampling implementation:
//
// - set logits
// - apply the configured sampler chain
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
- (LlamaToken)sample:(LlamaContext *)context
               index:(NSInteger)index;

// same as above, but:
//
// if grammarFirst is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
- (LlamaToken)sample:(LlamaContext *)context
               index:(NSInteger)index
        grammarFirst:(BOOL)grammarFirst;

// if acceptGrammar is true, the token is accepted both by the sampling chain and the grammar
- (void)accept:(LlamaToken)token
 acceptGrammar:(BOOL)acceptGrammar;

// get a string representation of the last accepted tokens
- (NSString *)previousString:(LlamaContext *)context n:(NSInteger)n;

// get the last accepted token
- (LlamaToken)last;

- (void)reset;

@end

#endif /* GPTSampler_h */
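The split between sample: and accept:acceptGrammar: is the heart of this API: sampling proposes a token from the current logits, and accepting advances the repetition penalties and the grammar with it. A hedged Swift sketch of one generation step, assuming `model`, `ctx`, and `params` already exist and that the Objective-C initializer imports as shown:

let smpl = GPTSampler(model, gptSamplerParams: params.samplerParams)

// propose a token from the logits at the last decoded position
let token = smpl.sample(ctx, index: -1)

// feed it back so penalties and the grammar (if any) advance
smpl.accept(token, acceptGrammar: true)

if model.tokenIsEOG(token) {
    // end of generation
}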
34
objc/include/LlamaBatch.h
Normal file
@@ -0,0 +1,34 @@
#ifndef LlamaBatch_h
#define LlamaBatch_h

#import <Foundation/Foundation.h>

typedef NSInteger LlamaSequenceId;
typedef NSInteger LlamaPosition;
typedef int32_t LlamaToken;

// Input data for llama_decode
// A llama_batch object can contain input about one or many sequences
// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
//
// - token  : the token ids of the input (used when embd is NULL)
// - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
// - pos    : the positions of the respective token in the sequence
// - seq_id : the sequence to which the respective token belongs
// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
@interface LlamaBatch : NSObject

@property (nonatomic, assign) NSInteger nTokens;
@property (nonatomic, assign) LlamaToken *tokens;
@property (nonatomic, assign) float *embd;
@property (nonatomic, assign) LlamaPosition *pos;
@property (nonatomic, assign) int32_t *nSeqId;
@property (nonatomic, assign) LlamaSequenceId **seqId;
@property (nonatomic, assign) NSData *output;

// Helpers for smooth API transition (optional usage in the interface)
@property (nonatomic, assign) LlamaPosition allPos0;
@property (nonatomic, assign) LlamaPosition allPos1;
@property (nonatomic, assign) LlamaSequenceId allSeqId;

@end

#endif /* LlamaBatch_h */
13
objc/include/LlamaBatch_Private.hpp
Normal file
@@ -0,0 +1,13 @@
#ifndef LlamaBatch_Private_hpp
#define LlamaBatch_Private_hpp
#import "LlamaBatch.h"
#import "llama.h"

@interface LlamaBatch()

- (instancetype)initWithBatch:(llama_batch)batch;
- (llama_batch)cBatch;

@end

#endif /* LlamaBatch_Private_hpp */
57
objc/include/LlamaContext.h
Normal file
@@ -0,0 +1,57 @@
#ifndef LlamaContext_h
#define LlamaContext_h

#import <Foundation/Foundation.h>

@class GGMLThreadpool;
@class LlamaBatch;

typedef NSInteger LlamaSequenceId;
typedef NSInteger LlamaPosition;
typedef int32_t LlamaToken;

@interface LlamaContext : NSObject

- (void)attachThreadpool:(GGMLThreadpool *)threadpool
         threadpoolBatch:(GGMLThreadpool *)threadpoolBatch;

- (NSUInteger)nCtx;

// A positive return value does not mean a fatal error, but rather a warning.
//   0 - success
//   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
// < 0 - error
- (NSInteger)decode:(LlamaBatch *)batch;

// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
// If the KV cache is RoPEd, the KV data is updated accordingly:
//   - lazily on next llama_decode()
//   - explicitly with llama_kv_cache_update()
// p0 < 0 : [0,  p1]
// p1 < 0 : [p0, inf)
- (void)kvCacheSeqAdd:(LlamaSequenceId)sequenceId
                   p0:(LlamaPosition)p0
                   p1:(LlamaPosition)p1
                delta:(LlamaPosition)delta;

// Integer division of the positions by factor of `d > 1`
// If the KV cache is RoPEd, the KV data is updated accordingly:
//   - lazily on next llama_decode()
//   - explicitly with llama_kv_cache_update()
// p0 < 0 : [0,  p1]
// p1 < 0 : [p0, inf)
- (void)kvCacheSeqDiv:(LlamaSequenceId)sequenceId
                   p0:(LlamaPosition)p0
                   p1:(LlamaPosition)p1
                delta:(LlamaPosition)delta;

// converts a token into a piece of text, optionally rendering special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
- (NSString *)tokenToPiece:(LlamaToken)token;
- (NSString *)tokenToPiece:(LlamaToken)token special:(BOOL)special;

- (BOOL)saveStateFile:(NSString *)pathSession
               tokens:(const LlamaToken *)tokens
          nTokenCount:(size_t)nTokenCount;

@end

#endif /* LlamaContext_h */
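kvCacheSeqAdd is exactly what the context-shift path in start: leans on: after discarding tokens, it slides the remaining cache entries left so positions stay contiguous. A small worked Swift sketch of that arithmetic, assuming `ctx` is a LlamaContext holding a single sequence 0:

var nPast = 8
let nKeep = 2

let nLeft    = nPast - nKeep   // 6 shiftable entries
let nDiscard = nLeft / 2       // drop 3 of them

// positions (nKeep + nDiscard)..<nPast shift left by nDiscard:
// entries at 5, 6, 7 end up at 2, 3, 4
ctx.kvCacheSeqAdd(0, p0: nKeep + nDiscard, p1: nPast, delta: -nDiscard)
nPast -= nDiscard              // nPast is now 5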
28
objc/include/LlamaContext_Private.hpp
Normal file
@@ -0,0 +1,28 @@
#ifndef LlamaContext_Private_hpp
#define LlamaContext_Private_hpp

#import "LlamaContext.h"
#import "../../common/common.h"

@interface LlamaContext()

- (instancetype)initWithContext:(llama_context *)context;

- (std::vector<llama_token>)tokenize:(NSString *)text
                          addSpecial:(BOOL)addSpecial
                        parseSpecial:(BOOL)parseSpecial;

- (BOOL)loadStateFile:(NSString *)pathSession
            tokensOut:(llama_token *)tokensOut
       nTokenCpacity:(size_t)nTokenCapacity
       nTokenCountOut:(size_t *)nTokenCountOut;

- (std::string)convertTokensToString:(const std::vector<llama_token>&)tokens;

- (llama_context *)cContext;

- (int32_t)encode:(llama_batch)batch;

@end

#endif /* LlamaContext_Private_hpp */
35
objc/include/LlamaModel.h
Normal file
@@ -0,0 +1,35 @@
#ifndef LlamaModel_h
#define LlamaModel_h

#import <Foundation/Foundation.h>

@class GPTParams;
@class GGMLThreadpool;
@class LlamaContext;

typedef int32_t LlamaToken;

@interface LlamaChatMessage : NSObject

@property (nonatomic, copy) NSString *role;
@property (nonatomic, copy) NSString *content;

@end

@interface LlamaContextParams : NSObject
@end

@interface LlamaModel : NSObject

- (LlamaContext *)context:(LlamaContextParams *)params;
- (LlamaToken)tokenBOS;
- (LlamaToken)tokenEOT;
- (LlamaToken)tokenEOS;
- (BOOL)tokenIsEOG:(LlamaToken)token;
- (int32_t)nCtxTrain;
- (BOOL)addBOSToken;
- (BOOL)addEOSToken;
- (BOOL)hasEncoder;
- (NSString *)formatExample:(NSString *)tmpl;

@end

#endif /* LlamaModel_h */
15
objc/include/LlamaModel_Private.hpp
Normal file
@@ -0,0 +1,15 @@
#ifndef LlamaModel_Private_hpp
#define LlamaModel_Private_hpp

#import "LlamaModel.h"
#import "llama.h"

@interface LlamaModel()

- (instancetype)init:(llama_model *)model;

- (llama_model *)cModel;

@end

#endif /* LlamaModel_Private_hpp */
13
objc/include/LlamaObjC.h
Normal file
@@ -0,0 +1,13 @@
#ifndef LlamaObjC_h
#define LlamaObjC_h

#include <Foundation/Foundation.h>
#include <llama.h>
#include <LlamaModel.h>
#include <LlamaContext.h>
#include <LlamaSession.h>
#include <GPTParams.h>

#endif /* LlamaObjC_h */
27
objc/include/LlamaSession.h
Normal file
@@ -0,0 +1,27 @@
#ifndef LlamaSession_h
#define LlamaSession_h

#import <Foundation/Foundation.h>

@class GPTParams;
@class LlamaModel;
@class LlamaContext;

@interface BlockingLineQueue : NSObject

- (void)addInputLine:(NSString *)line;
- (NSString *)inputLine;
- (void)addOutputLine:(NSString *)line;
- (NSString *)outputLine;

@end

@interface LlamaSession : NSObject

@property (nonatomic, strong) LlamaModel *model;
@property (nonatomic, strong) LlamaContext *ctx;

- (instancetype)initWithParams:(GPTParams *)params;
- (void)start:(BlockingLineQueue *)queue;

@end

#endif /* LlamaSession_h */
10
objc/include/LlamaSession_Private.hpp
Normal file
@@ -0,0 +1,10 @@
#ifndef LlamaSession_Private_hpp
#define LlamaSession_Private_hpp

#import "LlamaSession.h"

@interface LlamaSession()

@end

#endif /* LlamaSession_Private_hpp */
1
objc/include/ggml-metal.h
Symbolic link
@@ -0,0 +1 @@
../../ggml/include/ggml-metal.h
102
swift/JSONSchema/Grammar.swift
Normal file
@@ -0,0 +1,102 @@
import Foundation
import RegexBuilder

let SPACE_RULE = "\" \"?"

let PRIMITIVE_RULES: [String: String] = [
    "boolean": "(\"true\" | \"false\") space",
    "number": "\"-\"? ([0-9] | [1-9] [0-9]*) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space",
    "integer": "\"-\"? ([0-9] | [1-9] [0-9]*) space",
    "string": "\"\\\"\" ([^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]))* \"\\\"\" space",
    "null": "\"null\" space",
]

let INVALID_RULE_CHARS_RE = try! NSRegularExpression(pattern: "[^a-zA-Z0-9-]+")
let GRAMMAR_LITERAL_ESCAPE_RE = try! NSRegularExpression(pattern: "[\r\n\"]")
let GRAMMAR_LITERAL_ESCAPES: [String: String] = ["\r": "\\r", "\n": "\\n", "\"": "\\\""]

public class SchemaConverter {
    private var propOrder: [String]
    private var rules: [String: String] = ["space": SPACE_RULE]

    public init(propOrder: [String]) {
        self.propOrder = propOrder
    }

    private func formatLiteral(_ literal: Any) -> String {
        let escaped = GRAMMAR_LITERAL_ESCAPES.reduce("\(literal)") {
            $0.replacingOccurrences(of: $1.key, with: $1.value)
        }

        return "\\\"\(escaped)\\\""
    }

    private func addRule(name: String, rule: String) -> String {
        let escName = INVALID_RULE_CHARS_RE.stringByReplacingMatches(
            in: name,
            options: [],
            range: NSRange(location: 0, length: name.count),
            withTemplate: "-"
        )

        var key = escName
        if let existingRule = rules[escName], existingRule != rule {
            var i = 0
            while rules["\(escName)\(i)"] != nil {
                i += 1
            }
            key = "\(escName)\(i)"
        }

        rules[key] = rule
        return key
    }

    public func visit(schema: [String: Any], name: String?) -> String {
        let schemaType = schema["type"] as? String
        let ruleName = name ?? "root"

        if let oneOf = schema["oneOf"] as? [[String: Any]] ?? schema["anyOf"] as? [[String: Any]] {
            let rule = oneOf.enumerated().map { (i, altSchema) in
                visit(schema: altSchema, name: "\(name ?? "")\(name != nil ? "-" : "")\(i)")
            }.joined(separator: " | ")
            return addRule(name: ruleName, rule: rule)
        } else if let constValue = schema["const"] {
            return addRule(name: ruleName, rule: formatLiteral(constValue))
        } else if let enumValues = schema["enum"] as? [Any] {
            let rule = enumValues.map { "\"\(formatLiteral($0))\"" }.joined(separator: " | ")
            return addRule(name: ruleName, rule: rule)
        } else if schemaType == "object", let properties = schema["properties"] as? [String: Any] {
            let propPairs = properties.sorted { (kv1, kv2) in
                let idx1 = propOrder.firstIndex(of: kv1.key) ?? propOrder.count
                let idx2 = propOrder.firstIndex(of: kv2.key) ?? propOrder.count
                return (idx1, kv1.key) < (idx2, kv2.key)
            }

            var rule = "\"{\" space"
            for (i, (propName, propSchema)) in propPairs.enumerated() {
                let propRuleName = visit(schema: propSchema as! [String : Any], name: "\(name ?? "")\(name != nil ? "-" : "")\(propName)")
                if i > 0 {
                    rule += " \",\" space"
                }
                rule += " \"\(formatLiteral(propName))\" space \":\" space \(propRuleName)"
            }
            rule += " \"}\" space"

            return addRule(name: ruleName, rule: rule)
        } else if schemaType == "array", let items = schema["items"] {
            let itemRuleName = visit(schema: items as! [String : Any], name: "\(name ?? "")\(name != nil ? "-" : "")item")
            let rule = "\"[\" space (\(itemRuleName) (\",\" space \(itemRuleName))*)? \"]\" space"
            return addRule(name: ruleName, rule: rule)
        } else {
            assert(PRIMITIVE_RULES.keys.contains(schemaType ?? ""), "Unrecognized schema: \(schema)")
            return addRule(name: ruleName == "root" ? "root" : schemaType!, rule: PRIMITIVE_RULES[schemaType!]!)
        }
    }

    public func formatGrammar() -> String {
        return rules.map { (name, rule) in "\(name) ::= \(rule)" }.joined(separator: "\n") + "\n"
    }
}
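SchemaConverter accumulates one GBNF rule per schema node in `rules`, and formatGrammar joins them into a single grammar string. A quick usage sketch with an illustrative schema:

let converter = SchemaConverter(propOrder: ["name", "age"])
let schema: [String: Any] = [
    "type": "object",
    "properties": [
        "name": ["type": "string"],
        "age":  ["type": "integer"]
    ]
]
_ = converter.visit(schema: schema, name: nil)

// Emits rules along the lines of:
//   root ::= "{" space ... "}" space
//   string ::= ..., integer ::= ..., space ::= " "?
print(converter.formatGrammar())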
187
swift/JSONSchema/JSONSchema.swift
Normal file
@@ -0,0 +1,187 @@
import Foundation

public struct JSONSchema : Codable {
    public struct Items : Codable {
        let type: String
        let `enum`: [String]?

        public init(type: String, `enum`: [String]?) {
            self.type = type
            self.enum = `enum`
        }
    }
    public struct Property : Codable {
        let type: String
        let items: Items?
        let description: String?

        public init(type: String, items: Items?, description: String?) {
            self.type = type
            self.items = items
            self.description = description
        }
    }
    let type: String
    let items: Items?
    let properties: [String : Property]?

    public init(type: String, items: Items?, properties: [String : Property]?) {
        self.type = type
        self.items = items
        self.properties = properties
    }
}

public struct _JSONFunctionSchema: Codable {
    public struct Items: Codable {
        let type: String
        let `enum`: [String]?

        public init(type: Any.Type, `enum`: [String]?) {
            self.type = String(describing: type)
            self.enum = `enum`
        }
    }

    public struct Property: Codable {
        let type: String
        let items: Items?
        let `enum`: [String]?
        let description: String?

        public init(type: String.Type, description: String?) {
            self.type = "string"
            self.description = description
            self.items = nil
            self.enum = nil
        }

        public init<T: CaseIterable>(type: T.Type, description: String?) where T: RawRepresentable,
            T: StringProtocol {
            self.type = "string"
            self.enum = Array(type.allCases.map { $0.rawValue as! String })
            self.description = description
            self.items = nil
        }
    }

    public struct Parameters: Codable {
        public let properties: [String: Property]
        public let required: [String]
        public let type = "object"

        public init(properties: [String : Property], required: [String]) {
            self.properties = properties
            self.required = required
        }
    }

    let name: String
    let description: String
    let parameters: Parameters

    public init(name: String, description: String, parameters: Parameters) {
        self.name = name
        self.description = description
        self.parameters = parameters
    }
}

public protocol JSONSchemaConvertible : Codable {
    static var type: String { get }
    static var jsonSchema: [String : Any] { get }
    static func decode<K: CodingKey>(from container: KeyedDecodingContainer<K>,
                                     forKey key: K) throws -> Self
}

extension RawRepresentable where Self : CaseIterable, RawValue : JSONSchemaConvertible, Self: Codable {
    public static var type: String {
        RawValue.type
    }
    public static var jsonSchema: [String: Any] {
        [
            "type": RawValue.type,
            "enum": Self.allCases.map(\.rawValue)
        ]
    }
}

extension JSONSchemaConvertible {
    public static var items: JSONSchema.Items? {
        nil
    }
    public static var properties: [JSONSchema.Property]? {
        nil
    }
    public static var `enum`: [String]? {
        nil
    }
    public static func decode<K: CodingKey>(from container: KeyedDecodingContainer<K>, forKey key: K) throws -> Self {
        return try container.decode(Self.self, forKey: key)
    }
}
extension String : JSONSchemaConvertible {
    public static var type: String { "string" }
    public static var jsonSchema: [String: Any] {
        [
            "type": "string"
        ]
    }
}
extension Int : JSONSchemaConvertible {
    public static var type: String { "number" }
    public static var jsonSchema: [String: Any] {
        [
            "type": "integer"
        ]
    }
}
extension Double : JSONSchemaConvertible {
    public static var type: String { "number" }
    public static var jsonSchema: [String: Any] {
        [
            "type": "number"
        ]
    }
}
extension Date : JSONSchemaConvertible {
    public static var type: String { "string" }

    public static var jsonSchema: [String: Any] {
        [
            "type": "string"
        ]
    }

    public static func decode<K: CodingKey>(from container: KeyedDecodingContainer<K>, forKey key: K) throws -> Self {
        let value = try container.decode(String.self, forKey: key)
        let detector = try? NSDataDetector(types: NSTextCheckingResult.CheckingType.date.rawValue)
        let matches = detector?.matches(in: value, options: [], range: NSMakeRange(0, value.utf16.count))
        return matches!.first!.date!
        // return ISO8601DateFormatter().date(from: value)!
    }
}

extension Array : JSONSchemaConvertible where Element : JSONSchemaConvertible {
    public static var type: String { "array" }
    public static var items: JSONSchema.Items? {
        JSONSchema.Items(type: Element.type, enum: Element.enum)
    }
    public static var jsonSchema: [String : Any] {
        [
            "type": "array",
            "items": Element.jsonSchema
        ]
    }
}

@attached(member, names: arbitrary)
@attached(extension, conformances: JSONSchemaConvertible, CaseIterable, names: arbitrary)
public macro JSONSchema() = #externalMacro(module: "JSONSchemaMacros",
                                           type: "JSONSchemaMacro")
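With the macro attached, a plain struct gains both the decoding members and the JSONSchemaConvertible extension, which is what LlamaSession<T> later feeds through SchemaConverter. A hedged sketch of the intended call site (field names are illustrative):

@JSONSchema struct WeatherReport {
    let city: String
    let temperature: Int
}

// WeatherReport.jsonSchema is now ["type": "object", "properties": ...],
// so LlamaSession<WeatherReport> can constrain generation to valid JSON.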
229
swift/JSONSchemaMacros/JSONSchemaMacros.swift
Normal file
@@ -0,0 +1,229 @@
import SwiftSyntaxMacros
import SwiftCompilerPlugin
import SwiftSyntax

private struct MemberView {
    let name: String
    let type: String
    var attributeKey: String?
    var assignment: String?
}

private func view(for member: MemberBlockItemListSyntax.Element) throws -> MemberView? {
    guard let decl = member.decl.as(VariableDeclSyntax.self),
          let binding = decl.bindings.compactMap({
              $0.pattern.as(IdentifierPatternSyntax.self)
          }).first,
          let type = decl.bindings.compactMap({
              $0.typeAnnotation?.type
          }).first,
          !(type.syntaxNodeType is StructDeclSyntax.Type) else {
        return nil
    }
    var memberView = MemberView(name: "\(binding.identifier)", type: "\(type)", attributeKey: nil)
    if let macroName = decl.attributes.first?.as(AttributeSyntax.self)?
        .arguments?.as(LabeledExprListSyntax.self)?.first?.expression.as(StringLiteralExprSyntax.self) {
        memberView.attributeKey = "\(macroName.segments)"
    }
    if let assignment = decl.bindings.compactMap({
        $0.initializer?.value
    }).first {
        memberView.assignment = "\(assignment)"
    }
    return memberView
}

struct JSONSchemaMacro: ExtensionMacro, MemberMacro {
    static func expansion(of node: AttributeSyntax, providingMembersOf declaration: some DeclGroupSyntax, conformingTo protocols: [TypeSyntax], in context: some MacroExpansionContext) throws -> [DeclSyntax] {
        let members = try declaration.memberBlock.members.compactMap(view(for:))
        if declaration is EnumDeclSyntax {
            return []
        }
        return [
            """
            enum CodingKeys: CodingKey {
                case \(raw: members.map(\.name).joined(separator: ", "))
            }
            """,
            """
            init(from decoder: Decoder) throws {
                let container = try decoder.container(keyedBy: CodingKeys.self)
                \(raw: members.map {
                    """
                    self.\($0.name) = try \($0.type).decode(from: container, forKey: .\($0.name))
                    """
                }.joined(separator: "\n"))
            }
            """
        ]
    }

    static func expansion(of node: SwiftSyntax.AttributeSyntax,
                          attachedTo declaration: some SwiftSyntax.DeclGroupSyntax,
                          providingExtensionsOf type: some SwiftSyntax.TypeSyntaxProtocol,
                          conformingTo protocols: [SwiftSyntax.TypeSyntax],
                          in context: some SwiftSyntaxMacros.MacroExpansionContext) throws -> [SwiftSyntax.ExtensionDeclSyntax] {
        let members = try declaration.memberBlock.members.compactMap(view(for:))
        var inheritedTypes: [InheritedTypeSyntax] = []
        inheritedTypes.append(InheritedTypeSyntax(type: TypeSyntax("JSONSchemaConvertible")))
        if declaration is EnumDeclSyntax {
            inheritedTypes.append(InheritedTypeSyntax(type: TypeSyntax(", CaseIterable")))
        }
        let properties = members.map {
            """
            "\($0.name)": \($0.type).jsonSchema
            """
        }
        if !(declaration is EnumDeclSyntax) {
            return [
                ExtensionDeclSyntax(extendedType: type,
                                    inheritanceClause: .init(inheritedTypes: .init(inheritedTypes)),
                                    memberBlock: """
                    {
                        static var type: String {
                            "object"
                        }
                        static var jsonSchema: [String: Any] {
                            [
                                "type": "object",
                                "properties": [
                                    \(raw: properties.joined(separator: ","))
                                ]
                            ]
                        }
                    }
                    """)
            ]
        } else {
            return [
                ExtensionDeclSyntax(extendedType: type,
                                    inheritanceClause: .init(inheritedTypes: .init(inheritedTypes)),
                                    memberBlock: """
                    {
                        public static func decode<K: CodingKey>(from container: KeyedDecodingContainer<K>, forKey key: K) throws -> Self {
                            if RawValue.self is Int.Type {
                                return Self(rawValue: Int(try container.decode(String.self, forKey: key)) as! Self.RawValue)!
                            } else {
                                return try container.decode(Self.self, forKey: key)
                            }
                        }
                    }
                    """)
            ]
        }
    }
}

enum TestError: Error {
    case message(String)
}

struct LlamaActorMacro: ExtensionMacro, MemberMacro {
    static func expansion(of node: AttributeSyntax, providingMembersOf declaration: some DeclGroupSyntax, conformingTo protocols: [TypeSyntax], in context: some MacroExpansionContext) throws -> [DeclSyntax] {
        [
            """
            let session: LlamaToolSession

            public init(params: GPTParams) async throws {
                self.session = try await LlamaToolSession(params: params, tools: Self.tools)
            }
            """
        ]
    }

    static func expansion(of node: AttributeSyntax,
                          attachedTo declaration: some DeclGroupSyntax,
                          providingExtensionsOf type: some TypeSyntaxProtocol,
                          conformingTo protocols: [TypeSyntax],
                          in context: some MacroExpansionContext) throws -> [ExtensionDeclSyntax] {
        var tools: [
            (name: String,
             description: String,
             parameters: [(name: String,
                           type: String,
                           description: String)],
             callableString: String,
             callableName: String)
        ] = []
        for member in declaration.memberBlock.members {
            let comments = member.leadingTrivia.filter { $0.isComment }

            guard let member = member.decl.as(FunctionDeclSyntax.self) else {
                continue
            }
            let name = member.name
            guard case var .docLineComment(description) = comments.first else {
                throw TestError.message("Missing comment")
            }
            description = String(description.dropFirst(3))
            var parameters: [(name: String, type: String, description: String)] = []
            var index = 0
            for parameter in member.signature.parameterClause.parameters {
                let firstName = parameter.firstName.text
                let typeName = parameter.type.as(IdentifierTypeSyntax.self)!.name.text
                guard case var .docLineComment(description) = comments[index + 1] else {
                    throw TestError.message("Missing comment for \(firstName)")
                }
                description = String(description.dropFirst(3))
                parameters.append((name: firstName, type: typeName, description: description))
                index += 1
            }
            let callableName = context.makeUniqueName(name.text)
            let callableString = """
            @dynamicCallable struct \(callableName.text): DynamicCallable {
                @discardableResult
                func dynamicallyCall(withKeywordArguments args: [String: Any]) async throws -> String {
                    \(parameters.map {
                        "var \($0.name): \($0.type)!"
                    }.joined(separator: "\n"))
                    for (key, value) in args {
                        \(parameters.map {
                            "if key == \"\($0.name)\" { \($0.name) = value as! \($0.type) }"
                        }.joined(separator: "\n"))
                    }

                    let returnValue = try await \(name.text)(\(parameters.map { "\($0.name): \($0.name)" }.joined(separator: ",")))
                    let jsonValue = try JSONEncoder().encode(returnValue)
                    return String(data: jsonValue, encoding: .utf8)!
                }
            }
            """
            tools.append((name: name.text, description: description,
                          parameters: parameters,
                          callableString: callableString,
                          callableName: callableName.text))
        }

        return [
            .init(extendedType: type,
                  inheritanceClause: .init(inheritedTypes: InheritedTypeListSyntax.init(arrayLiteral: .init(type: IdentifierTypeSyntax(name: "LlamaActor")))),
                  memberBlock: """
                {
                    \(raw: tools.map {
                        $0.callableString
                    }.joined(separator: "\n"))

                    static var tools: [String: (DynamicCallable, _JSONFunctionSchema)] {
                        [\(raw: tools.map { tool in
                            """
                            "\(tool.name)": (\(tool.callableName)(), _JSONFunctionSchema(name: "\(tool.name)", description: "\(tool.description)", parameters: _JSONFunctionSchema.Parameters(properties: \(tool.parameters.count == 0 ? "[:]" : "[" + tool.parameters.map { parameter in
                                """
                                "\(parameter.name)": _JSONFunctionSchema.Property(type: \(parameter.type).self, description: "\(parameter.description)"),
                                """
                            }.joined() + "]"), required: [])))
                            """
                        }.joined(separator: ","))]
                    }
                }
                """)
        ]
    }
}

@main
struct JSONSchemaMacrosPlugin: CompilerPlugin {
    let providingMacros: [Macro.Type] = [
        JSONSchemaMacro.self, LlamaActorMacro.self
    ]
}
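LlamaActorMacro turns each doc-commented method into a generated DynamicCallable wrapper plus a `tools` entry, reading the first doc line as the function description and one further doc line per parameter. A sketch of the kind of declaration it expects; the @llamaActor attribute name is assumed here, since only the macro type, not its attribute declaration, appears in this file:

@llamaActor actor MyTools {
    /// Get the current temperature in a city
    /// the city to query
    public func getTemperature(city: String) async throws -> String {
        "24C in \(city)" // hypothetical implementation
    }
}

// The expansion adds the wrapper struct, a `tools` dictionary of
// _JSONFunctionSchema entries, and an initializer that builds a
// LlamaToolSession over those tools.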
189
swift/LlamaKit/LlamaKit.swift
Normal file
@@ -0,0 +1,189 @@
import Foundation
@_exported import JSONSchema
@_exported import LlamaObjC

public protocol DynamicCallable: Sendable {
    @discardableResult
    func dynamicallyCall(withKeywordArguments args: [String: Any]) async throws -> String
}

struct ToolCall: Decodable {
    let id: Int
    let name: String
    let arguments: [String: String]
}

struct ToolResponse<T: Encodable>: Encodable {
    let id: Int
    let result: T
}
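For orientation, ToolCall is the shape the sessions below decode out of the model's <tool_call> payload. A hypothetical payload and decode (values illustrative):

// Hypothetical model output emitted between <tool_call> tags:
let payload = #"{"id": 0, "name": "getIpAddress", "arguments": {}}"#
let call = try JSONDecoder().decode(ToolCall.self, from: payload.data(using: .utf8)!)
// call.name == "getIpAddress", call.arguments == [:]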
// MARK: LlamaChatSession
/// Standard chat session for a given LLM.
public actor LlamaChatSession {
    private let queue = BlockingLineQueue()
    private let session: LlamaObjC.LlamaSession

    public init(params: GPTParams, flush: Bool = true) async throws {
        session = LlamaObjC.LlamaSession(params: params)
        Task.detached { [session, queue] in
            session.start(queue)
        }

        // Flush the model's initial output so the first chat() call
        // returns a reply to the user's message rather than startup output.
        guard flush else { return }
        _ = queue.outputLine()
    }

    public func chat(message: String) async -> String {
        queue.addInputLine(message)
        return queue.outputLine()
    }
}
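A minimal usage sketch for LlamaChatSession (illustrative; assumes a GGUF model on disk, with GPTParams fields as exercised in main.swift below):

let params = GPTParams()
params.modelPath = "/path/to/model.gguf" // assumed local model path
let session = try await LlamaChatSession(params: params)
let reply = await session.chat(message: "Hello!")
print(reply)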

// MARK: LlamaGrammarSession
/// Chat session whose output is constrained, via a grammar generated from
/// T's JSON schema, so that every response decodes cleanly into T.
public actor LlamaSession<T: JSONSchemaConvertible> {
    private let session: LlamaChatSession

    public init(params: GPTParams) async throws {
        let converter = SchemaConverter(propOrder: [])
        _ = converter.visit(schema: T.jsonSchema, name: nil)
        params.samplerParams.grammar = converter.formatGrammar()
        session = try await LlamaChatSession(params: params)
    }

    public func chat(message: String) async throws -> T {
        let output = await session.chat(message: message).data(using: .utf8)!
        return try JSONDecoder().decode(T.self, from: output)
    }
}
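A usage sketch for the grammar-constrained session (illustrative; mirrors the Trip test in LlamaKitTests.swift below):

@JSONSchema struct Trip {
    let location: String
    let startDate: TimeInterval
    let durationInDays: Int
}

// Output is constrained by the generated grammar, so decoding into Trip succeeds:
let session = try await LlamaSession<Trip>(params: params)
let trip = try await session.chat(message: "Plan a 3-day trip to New York City.")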

// MARK: LlamaToolSession
public actor LlamaToolSession {
    private let session: LlamaChatSession

    private struct GetIpAddress: DynamicCallable {
        func dynamicallyCall(withKeywordArguments args: [String: Any]) async throws -> String {
            getIPAddress()
        }
    }

    internal static func getIPAddress() -> String {
        var address: String!

        // Get list of all interfaces on the local machine:
        var ifaddr: UnsafeMutablePointer<ifaddrs>? = nil
        if getifaddrs(&ifaddr) == 0 {
            // Loop through linked list of interfaces
            var ptr = ifaddr
            while ptr != nil {
                let interface = ptr!.pointee

                // Check if the interface is IPv4 or IPv6:
                let addrFamily = interface.ifa_addr.pointee.sa_family
                if addrFamily == UInt8(AF_INET) || addrFamily == UInt8(AF_INET6) {

                    // Convert interface name to String:
                    let name = String(cString: interface.ifa_name)

                    // Only consider non-loopback interfaces (e.g., "en0" for Wi-Fi)
                    if name == "en0" { // Typically en0 is the Wi-Fi interface
                        // Convert the address to a readable format:
                        var hostname = [CChar](repeating: 0, count: Int(NI_MAXHOST))
                        if getnameinfo(interface.ifa_addr, socklen_t(interface.ifa_addr.pointee.sa_len),
                                       &hostname, socklen_t(hostname.count),
                                       nil, socklen_t(0), NI_NUMERICHOST) == 0 {
                            address = String(cString: hostname)
                        }
                    }
                }

                ptr = interface.ifa_next
            }

            freeifaddrs(ifaddr)
        }

        return address
    }

    public private(set) var tools: [String: (DynamicCallable, _JSONFunctionSchema)]

    public init(params: GPTParams,
                tools: [String: (DynamicCallable, _JSONFunctionSchema)]) async throws {
        self.tools = tools
        let ipFnSchema = _JSONFunctionSchema(name: "getIpAddress",
                                             description: "Get the IP Address for this system",
                                             parameters: _JSONFunctionSchema.Parameters(properties: [:], required: []))
        self.tools["getIpAddress"] = (GetIpAddress(), ipFnSchema)
        let encoded = try JSONEncoder().encode(self.tools.values.map(\.1))
        let prompt = """
        You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
        <tool_call>
        {"name": <function-name>,"arguments": <args-dict>}
        </tool_call>

        Here are the available tools:
        <tools> \(String(data: encoded, encoding: .utf8)!) </tools><|eot_id|>
        """
        params.prompt = prompt
        params.interactive = true
        params.antiPrompts.append("<|eot_id|>")
        params.inputPrefix = "<|start_header_id|>user<|end_header_id|>"
        params.inputSuffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
        session = try await LlamaChatSession(params: params, flush: false)

        // Prime the session: ask a question that forces one tool call,
        // invoke the tool, and feed its result back to the model.
        let fn = await session.chat(message: "What is my IP address?")
        let toolCall = try JSONDecoder().decode(ToolCall.self, from: fn.data(using: .utf8)!)
        guard let tool = self.tools[toolCall.name] else {
            fatalError()
        }
        let resp = try await tool.0.dynamicallyCall(withKeywordArguments: toolCall.arguments)
        print(resp)

        let output = await session.chat(message: """
        <tool_response>
        {"id": \(toolCall.id), "result": \(resp)}
        </tool_response>
        """)
        print(output)
    }

    public func chat(message: String) async throws -> String {
        var nxt = await session.chat(message: message)
        let fn = nxt
        // Try to see if the output is a function call; if so, invoke the tool
        // and hand its result back to the model for the final answer.
        do {
            let toolCall = try JSONDecoder().decode(ToolCall.self, from: fn.data(using: .utf8)!)
            guard let tool = tools[toolCall.name] else {
                fatalError()
            }
            let callable = tool.0
            let resp = try await callable.dynamicallyCall(withKeywordArguments: toolCall.arguments)
            print("tool response: \(resp)")
            nxt = await session.chat(message: """
            <tool_response>
            {"id": \(toolCall.id), "result": \(resp)}
            </tool_response>
            """)
            print(nxt)
        } catch {
            print(error)
        }
        return nxt
    }
}
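A registration sketch for a custom tool (illustrative; Echo and its schema are hypothetical, and a real tool would also declare its parameters in the schema):

struct Echo: DynamicCallable {
    func dynamicallyCall(withKeywordArguments args: [String: Any]) async throws -> String {
        // Echo back whatever the model passed as "text":
        String(describing: args["text"] ?? "")
    }
}

let echoSchema = _JSONFunctionSchema(name: "echo",
                                     description: "Echo the given text back",
                                     parameters: _JSONFunctionSchema.Parameters(properties: [:], required: []))
let toolSession = try await LlamaToolSession(params: params, tools: ["echo": (Echo(), echoSchema)])
let answer = try await toolSession.chat(message: "Use the echo tool on the word hello.")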

public protocol LlamaActor: Actor {
    static var tools: [String: (DynamicCallable, _JSONFunctionSchema)] { get }
    var session: LlamaToolSession { get }
}

public extension LlamaActor {
    func chat(_ message: String) async throws -> String {
        try await session.chat(message: message)
    }
}

@attached(member, names: arbitrary)
@attached(extension, conformances: LlamaActor, names: arbitrary)
public macro llamaActor() = #externalMacro(module: "JSONSchemaMacros",
                                           type: "LlamaActorMacro")
76
swift/main/main.swift
Normal file
@@ -0,0 +1,76 @@
import LlamaKit
import WeatherKit
import CoreLocation

@llamaActor actor MyLlama {
    struct CurrentWeather: Codable {
        let temperature: Double
        let condition: WeatherCondition
    }

    /// Get the current weather in a given location.
    /// - parameter location: The city and state, e.g. San Francisco, CA
    /// - parameter unit: The unit of temperature
    public static func getCurrentWeather(location: String, unit: String) async throws -> CurrentWeather {
        let weather = try await WeatherService().weather(for: CLGeocoder().geocodeAddressString(location)[0].location!)
        var temperature = weather.currentWeather.temperature
        temperature.convert(to: .fahrenheit)
        return CurrentWeather(temperature: temperature.value,
                              condition: weather.currentWeather.condition)
    }
}

func downloadFile() async throws -> String {
    let fm = FileManager.default
    let tmpDir = fm.temporaryDirectory
    let destinationURL = tmpDir.appending(path: "llama_groq_gguf.gguf")

    guard !fm.fileExists(atPath: destinationURL.path()) else {
        return destinationURL.path()
    }
    print("Downloading Llama Tools, this may take a while...")
    // Define the URL
    guard let url = URL(string: "https://huggingface.co/bartowski/Llama-3-Groq-8B-Tool-Use-GGUF/resolve/main/Llama-3-Groq-8B-Tool-Use-Q5_K_M.gguf?download=true") else {
        print("Invalid URL.")
        throw URLError(.badURL)
    }

    // Start the async download
    let (tempURL, _) = try await URLSession.shared.download(from: url)

    // Move the downloaded file to the destination
    try fm.moveItem(at: tempURL, to: destinationURL)
    print("File downloaded to: \(destinationURL.path())")
    return destinationURL.path()
}

let params = GPTParams()
params.modelPath = try await downloadFile()
params.nPredict = 512
params.nCtx = 4096
params.cpuParams.nThreads = 8
params.cpuParamsBatch.nThreads = 8
params.nBatch = 1024
params.nGpuLayers = 1024
let llama = try await MyLlama(params: params)

// Simple REPL: read a line, send it to the model, print the reply.
while true {
    print("Enter input: ", terminator: "")

    // Read user input
    if let userInput = readLine() {
        if userInput.lowercased() == "exit" {
            print("Exiting the loop.")
            break
        } else {
            print("🧔🏽‍♂️: \(userInput)")
            let response = try await llama.chat(userInput)
            print("🤖: \(response)")
        }
    } else {
        print("Failed to read input.")
    }
}
140
swift/test/LlamaKitTests.swift
Normal file
@@ -0,0 +1,140 @@
import Foundation
import Testing
@testable import LlamaKit
import JSONSchema

// MARK: LlamaGrammarSession Suite
@Suite("LlamaGrammarSession Suite")
struct LlamaGrammarSessionSuite {
    @JSONSchema struct Trip {
        let location: String
        let startDate: TimeInterval
        let durationInDays: Int
    }

    func downloadFile() async throws -> String {
        let fm = FileManager.default
        let tmpDir = fm.temporaryDirectory
        let destinationURL = tmpDir.appending(path: "tinyllama.gguf")

        guard !fm.fileExists(atPath: destinationURL.path()) else {
            return destinationURL.path()
        }
        print("Downloading TinyLlama, this may take a while...")
        // Define the URL
        guard let url = URL(string: "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q3_K_L.gguf?download=true") else {
            print("Invalid URL.")
            throw URLError(.badURL)
        }

        // Start the async download
        let (tempURL, _) = try await URLSession.shared.download(from: url)

        // Move the downloaded file to the destination
        try fm.moveItem(at: tempURL, to: destinationURL)
        print("File downloaded to: \(destinationURL.path())")
        return destinationURL.path()
    }

    @Test func llamaGrammarSession() async throws {
        let params = GPTParams()
        params.modelPath = try await downloadFile()
        params.nPredict = 256
        params.nCtx = 1024
        params.cpuParams.nThreads = 4
        params.cpuParamsBatch.nThreads = 4
        params.nBatch = 1024
        params.nGpuLayers = 128
        params.chatTemplate = """
        <|system|>
        {system_message}</s>
        <|user|>
        {prompt}</s>
        <|assistant|>
        """
        params.prompt = """
        You are a travel agent. The current date epoch \(Date.now.timeIntervalSince1970).
        Responses should have the following fields:

        location: the location of the trip
        startDate: the start of the trip as the unix epoch since 1970
        durationInDays: the duration of the trip in days

        """
        params.interactive = true
        let session = try await LlamaSession<Trip>(params: params)
        await #expect(throws: Never.self) {
            let trip = try await session.chat(message: "Please create a trip for me to New York City that starts two weeks from now. The duration of the trip MUST be 3 days long.")
            #expect(trip.location.contains("New York"))
            // TODO: Testing the other fields is difficult considering model size,
            // so for now we are just asserting the grammar works.
        }
    }
}
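With the grammar in place the raw session output is guaranteed to parse as Trip; a hypothetical constrained response and decode (values illustrative):

// Hypothetical grammar-constrained model output:
let raw = #"{"location": "New York City", "startDate": 1735689600, "durationInDays": 3}"#
let trip = try JSONDecoder().decode(LlamaGrammarSessionSuite.Trip.self, from: raw.data(using: .utf8)!)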

import WeatherKit
import CoreLocation

@llamaActor actor MyLlama {
    struct CurrentWeather: Codable {
        let temperature: Double
        let condition: WeatherCondition
    }

    /// Get the current weather in a given location.
    /// - parameter location: The city and state, e.g. San Francisco, CA
    /// - parameter unit: The unit of temperature
    public static func getCurrentWeather(location: String, unit: String) async throws -> CurrentWeather {
        let weather = try await WeatherService().weather(for: CLGeocoder().geocodeAddressString(location)[0].location!)
        var temperature = weather.currentWeather.temperature
        temperature.convert(to: .fahrenheit)
        return CurrentWeather(temperature: temperature.value,
                              condition: weather.currentWeather.condition)
    }
}

func downloadFile() async throws -> String {
    let fm = FileManager.default
    let tmpDir = fm.temporaryDirectory
    let destinationURL = tmpDir.appending(path: "llama_groq_gguf.gguf")

    guard !fm.fileExists(atPath: destinationURL.path()) else {
        return destinationURL.path()
    }
    print("Downloading Llama Tools, this may take a while...")
    // Define the URL
    guard let url = URL(string: "https://huggingface.co/bartowski/Llama-3-Groq-8B-Tool-Use-GGUF/resolve/main/Llama-3-Groq-8B-Tool-Use-Q5_K_M.gguf?download=true") else {
        print("Invalid URL.")
        throw URLError(.badURL)
    }

    // Start the async download
    let (tempURL, _) = try await URLSession.shared.download(from: url)

    // Move the downloaded file to the destination
    try fm.moveItem(at: tempURL, to: destinationURL)
    print("File downloaded to: \(destinationURL.path())")
    return destinationURL.path()
}

@Test func llamaToolSession() async throws {
    let params = GPTParams()
    params.modelPath = try await downloadFile()
    params.nPredict = 512
    params.nCtx = 4096
    params.cpuParams.nThreads = 8
    params.cpuParamsBatch.nThreads = 8
    params.nBatch = 1024
    params.nGpuLayers = 1024
    let llama = try await MyLlama(params: params)
    let currentWeather = try await MyLlama.getCurrentWeather(location: "San Francisco, CA", unit: "fahrenheit")
    let output = try await llama.chat("What's the weather (in fahrenheit) in San Francisco, CA?")
    #expect(output.contains(String(format: "%.2f", currentWeather.temperature)))
    // #expect(output.contains(currentWeather.condition.rawValue))
}