Init LlamaObjC Commit
parent 6026da52d6, commit 56f9d4b52a
29 changed files with 3472 additions and 32 deletions
Package.resolved  (new file, 14 lines)

@@ -0,0 +1,14 @@
{
  "pins" : [
    {
      "identity" : "swift-syntax",
      "kind" : "remoteSourceControl",
      "location" : "https://github.com/apple/swift-syntax.git",
      "state" : {
        "branch" : "main",
        "revision" : "2c271e5ce55124ae534c2eff6e74f745e4db4f68"
      }
    }
  ],
  "version" : 2
}
Package.swift  (114 changes)

@@ -1,21 +1,28 @@
-// swift-tools-version:5.5
+// swift-tools-version:5.9
+import CompilerPluginSupport
 import PackageDescription

-var sources = [
+var cppSources = [
     "src/llama.cpp",
     "src/llama-vocab.cpp",
     "src/llama-grammar.cpp",
     "src/llama-sampling.cpp",
     "src/unicode.cpp",
     "src/unicode-data.cpp",
-    "ggml/src/ggml.c",
-    "ggml/src/ggml-alloc.c",
-    "ggml/src/ggml-backend.c",
-    "ggml/src/ggml-quants.c",
-    "ggml/src/ggml-aarch64.c",
     "common/sampling.cpp",
     "common/common.cpp",
     "common/json-schema-to-grammar.cpp",
     "common/log.cpp",
     "common/console.cpp"
 ]
+
+var ggmlSources = [
+    "src/ggml.c",
+    "src/ggml-alloc.c",
+    "src/ggml-backend.c",
+    "src/ggml-quants.c",
+    "src/ggml-aarch64.c"
+]
 var resources: [Resource] = []
 var linkerSettings: [LinkerSetting] = []
 var cSettings: [CSetting] = [

@@ -24,13 +31,13 @@ var cSettings: [CSetting] = [
     // NOTE: NEW_LAPACK will required iOS version 16.4+
     // We should consider add this in the future when we drop support for iOS 14
     // (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
-    // .define("ACCELERATE_NEW_LAPACK"),
-    // .define("ACCELERATE_LAPACK_ILP64")
+    .define("ACCELERATE_NEW_LAPACK"),
+    .define("ACCELERATE_LAPACK_ILP64")
 ]

 #if canImport(Darwin)
-sources.append("ggml/src/ggml-metal.m")
-resources.append(.process("ggml/src/ggml-metal.metal"))
+ggmlSources.append("src/ggml-metal.m")
+resources.append(.process("src/ggml-metal.metal"))
 linkerSettings.append(.linkedFramework("Accelerate"))
 cSettings.append(
     contentsOf: [

@@ -47,33 +54,84 @@ cSettings.append(
 let package = Package(
     name: "llama",
     platforms: [
-        .macOS(.v12),
+        .macOS(.v13),
         .iOS(.v14),
         .watchOS(.v4),
         .tvOS(.v14)
     ],
     products: [
         .library(name: "llama", targets: ["llama"]),
+        .executable(name: "LlamaKitMain", targets: ["LlamaKitMain"])
+    ],
+    dependencies: [
+        .package(url: "https://github.com/apple/swift-syntax.git", branch: "main")
     ],
     targets: [
+        .target(name: "llama_cpp",
+                path: ".",
+                exclude: [
+                    "cmake",
+                    "examples",
+                    "scripts",
+                    "models",
+                    "tests",
+                    "CMakeLists.txt",
+                    "Makefile"
+                ],
+                sources: cppSources,
+                publicHeadersPath: "spm-headers"),
         .target(
             name: "llama",
-            path: ".",
-            exclude: [
-                "cmake",
-                "examples",
-                "scripts",
-                "models",
-                "tests",
-                "CMakeLists.txt",
-                "Makefile"
-            ],
-            sources: sources,
+            dependencies: ["llama_cpp"],
+            path: "ggml",
+            sources: ggmlSources,
             resources: resources,
             publicHeadersPath: "spm-headers",
             cSettings: cSettings,
-            linkerSettings: linkerSettings
-        )
+            linkerSettings: linkerSettings),
+        .target(name: "LlamaObjC",
+                dependencies: ["llama"],
+                path: "objc",
+                sources: [
+                    "GPTParams.mm",
+                    "GPTSampler.mm",
+                    "LlamaBatch.mm",
+                    "LlamaObjC.mm",
+                    "LlamaModel.mm",
+                    "LlamaContext.mm",
+                    "LlamaSession.mm",
+                ],
+                publicHeadersPath: "include",
+                cSettings: cSettings,
+                linkerSettings: linkerSettings),
+        .macro(
+            name: "JSONSchemaMacros",
+            dependencies: [
+                .product(name: "SwiftSyntax", package: "swift-syntax"),
+                .product(name: "SwiftSyntaxMacros", package: "swift-syntax"),
+                .product(name: "SwiftCompilerPlugin", package: "swift-syntax"),
+            ],
+            path: "swift/JSONSchemaMacros"
+        ),
+        .target(
+            name: "JSONSchema",
+            dependencies: ["JSONSchemaMacros"],
+            path: "swift/JSONSchema"
+        ),
+        .target(
+            name: "LlamaKit",
+            dependencies: ["JSONSchema", "LlamaObjC"],
+            path: "swift/LlamaKit"
+        ),
+        .testTarget(name: "LlamaKitTests",
+                    dependencies: ["LlamaKit", "JSONSchema", "JSONSchemaMacros"],
+                    path: "swift/test",
+                    linkerSettings: [
+                        .linkedFramework("XCTest"),
+                        .linkedFramework("Testing")]),
+        .executableTarget(name: "LlamaKitMain",
+                          dependencies: ["LlamaKit"],
+                          path: "swift/main",
+                          resources: [.process("Llama-3.2-3B-Instruct-Q4_0.gguf")]),
     ],
-    cxxLanguageStandard: .cxx11
+    cxxLanguageStandard: .cxx17
 )
(file name not captured in the extraction; modified header carrying the build-info globals)

@@ -34,10 +34,10 @@ struct llama_lora_adapter_container : llama_lora_adapter_info {
 };

 // build info
-extern int LLAMA_BUILD_NUMBER;
-extern char const * LLAMA_COMMIT;
-extern char const * LLAMA_COMPILER;
-extern char const * LLAMA_BUILD_TARGET;
+static int LLAMA_BUILD_NUMBER = 0;
+static char const * LLAMA_COMMIT = "";
+static char const * LLAMA_COMPILER = "";
+static char const * LLAMA_BUILD_TARGET = "";

 struct llama_control_vector_load_info;
objc/GPTParams.mm  (new file, 726 lines)

@@ -0,0 +1,726 @@
#import <Foundation/Foundation.h>
#import "GPTParams_Private.hpp"
#import "../common/common.h"
#import "ggml.h"

@implementation GGMLThreadpool {
    ggml_threadpool *threadpool;
}

- (instancetype)initWithThreadpool:(ggml_threadpool *)threadpool
{
    self = [super init];
    if (self) {
        self->threadpool = threadpool;
    }
    return self;
}

- (ggml_threadpool *)threadpool {
    return threadpool;
}

@end

@implementation GGMLThreadpoolParams {
    ggml_threadpool_params params;
}

- (BOOL)getCpuMaskAtIndex:(NSUInteger)index {
    abort();
}

- (void)setCpuMask:(BOOL)value atIndex:(NSUInteger)index {
    abort();
}

- (instancetype)initWithParams:(ggml_threadpool_params&&)params
{
    self = [super init];
    if (self) {
        self->params = params;
    }
    return self;
}

- (BOOL)isEqual:(id)other {
    GGMLThreadpoolParams *rhs = (GGMLThreadpoolParams *)other;
    ggml_threadpool_params rhs_params = rhs->params;
    return ggml_threadpool_params_match(&params, &rhs_params);
}

- (GGMLThreadpool *)threadpool {
    auto tp = ggml_threadpool_new(&params);
    return [[GGMLThreadpool alloc] initWithThreadpool:tp];
}
@end

@implementation CPUParams {
    cpu_params *params;
}

- (instancetype)initWithParams:(cpu_params&)params;
{
    self = [super init];
    if (self) {
        self->params = &params;
    }
    return self;
}

- (int)nThreads {
    return params->n_threads;
}

- (void)setNThreads:(int)nThreads {
    params->n_threads = nThreads;
}

- (BOOL)maskValid {
    return params->mask_valid;
}

- (void)setMaskValid:(BOOL)maskValid {
    params->mask_valid = maskValid;
}

- (GGMLSchedPriority)priority {
    return GGMLSchedPriority(params->priority);
}

- (void)setPriority:(GGMLSchedPriority)priority {
    params->priority = ggml_sched_priority(priority);
}

- (BOOL)strictCPU {
    return params->strict_cpu;
}

- (void)setStrictCPU:(BOOL)strictCPU {
    params->strict_cpu = strictCPU;
}

- (uint32_t)poll {
    return params->poll;
}

- (void)setPoll:(uint32_t)poll {
    params->poll = poll;
}

- (BOOL)getCpuMaskAtIndex:(NSUInteger)index {
    return params->cpumask[index];
}

- (void)setCpuMask:(BOOL)value atIndex:(NSUInteger)index {
    params->cpumask[index] = value;
}

- (GGMLThreadpoolParams *)ggmlThreadpoolParams {
    return [[GGMLThreadpoolParams alloc] initWithParams:ggml_threadpool_params_from_cpu_params(*params)];
}

@end

@implementation GPTSamplerParams {
    gpt_sampler_params *gpt_sampler_params;
}

- (instancetype)initWithParams:(gpt_sampler_params&)params {
    self = [super init];
    if (self) {
        gpt_sampler_params = &params;
    }
    return self;
}

// Getters and setters for Objective-C properties, which manipulate the C++ struct

- (uint32_t)seed {
    return gpt_sampler_params->seed;
}

- (void)setSeed:(uint32_t)seed {
    gpt_sampler_params->seed = seed;
}

- (int32_t)nPrev {
    return gpt_sampler_params->n_prev;
}

- (void)setNPrev:(int32_t)nPrev {
    gpt_sampler_params->n_prev = nPrev;
}

- (int32_t)nProbs {
    return gpt_sampler_params->n_probs;
}

- (void)setNProbs:(int32_t)nProbs {
    gpt_sampler_params->n_probs = nProbs;
}

- (int32_t)minKeep {
    return gpt_sampler_params->min_keep;
}

- (void)setMinKeep:(int32_t)minKeep {
    gpt_sampler_params->min_keep = minKeep;
}

- (int32_t)topK {
    return gpt_sampler_params->top_k;
}

- (void)setTopK:(int32_t)topK {
    gpt_sampler_params->top_k = topK;
}

- (float)topP {
    return gpt_sampler_params->top_p;
}

- (void)setTopP:(float)topP {
    gpt_sampler_params->top_p = topP;
}

- (float)minP {
    return gpt_sampler_params->min_p;
}

- (void)setMinP:(float)minP {
    gpt_sampler_params->min_p = minP;
}

- (float)tfsZ {
    return gpt_sampler_params->tfs_z;
}

- (void)setTfsZ:(float)tfsZ {
    gpt_sampler_params->tfs_z = tfsZ;
}

- (float)typP {
    return gpt_sampler_params->typ_p;
}

- (void)setTypP:(float)typP {
    gpt_sampler_params->typ_p = typP;
}

- (float)temp {
    return gpt_sampler_params->temp;
}

- (void)setTemp:(float)temp {
    gpt_sampler_params->temp = temp;
}

- (float)dynatempRange {
    return gpt_sampler_params->dynatemp_range;
}

- (void)setDynatempRange:(float)dynatempRange {
    gpt_sampler_params->dynatemp_range = dynatempRange;
}

- (float)dynatempExponent {
    return gpt_sampler_params->dynatemp_exponent;
}

- (void)setDynatempExponent:(float)dynatempExponent {
    gpt_sampler_params->dynatemp_exponent = dynatempExponent;
}

- (int32_t)penaltyLastN {
    return gpt_sampler_params->penalty_last_n;
}

- (void)setPenaltyLastN:(int32_t)penaltyLastN {
    gpt_sampler_params->penalty_last_n = penaltyLastN;
}

- (float)penaltyRepeat {
    return gpt_sampler_params->penalty_repeat;
}

- (void)setPenaltyRepeat:(float)penaltyRepeat {
    gpt_sampler_params->penalty_repeat = penaltyRepeat;
}

- (float)penaltyFreq {
    return gpt_sampler_params->penalty_freq;
}

- (void)setPenaltyFreq:(float)penaltyFreq {
    gpt_sampler_params->penalty_freq = penaltyFreq;
}

- (float)penaltyPresent {
    return gpt_sampler_params->penalty_present;
}

- (void)setPenaltyPresent:(float)penaltyPresent {
    gpt_sampler_params->penalty_present = penaltyPresent;
}

- (int32_t)mirostat {
    return gpt_sampler_params->mirostat;
}

- (void)setMirostat:(int32_t)mirostat {
    gpt_sampler_params->mirostat = mirostat;
}

- (float)mirostatTau {
    return gpt_sampler_params->mirostat_tau;
}

- (void)setMirostatTau:(float)mirostatTau {
    gpt_sampler_params->mirostat_tau = mirostatTau;
}

- (float)mirostatEta {
    return gpt_sampler_params->mirostat_eta;
}

- (void)setMirostatEta:(float)mirostatEta {
    gpt_sampler_params->mirostat_eta = mirostatEta;
}

- (BOOL)penalizeNl {
    return gpt_sampler_params->penalize_nl;
}

- (void)setPenalizeNl:(BOOL)penalizeNl {
    gpt_sampler_params->penalize_nl = penalizeNl;
}

- (BOOL)ignoreEos {
    return gpt_sampler_params->ignore_eos;
}

- (void)setIgnoreEos:(BOOL)ignoreEos {
    gpt_sampler_params->ignore_eos = ignoreEos;
}

- (BOOL)noPerf {
    return gpt_sampler_params->no_perf;
}

- (void)setNoPerf:(BOOL)noPerf {
    gpt_sampler_params->no_perf = noPerf;
}

// For `samplers`, convert from NSArray<NSNumber *> to std::vector
- (NSArray<NSNumber *> *)samplers {
    NSMutableArray<NSNumber *> *samplersArray = [NSMutableArray array];
    for (auto sampler : gpt_sampler_params->samplers) {
        [samplersArray addObject:@(sampler)];
    }
    return [samplersArray copy];
}

- (void)setSamplers:(NSArray<NSNumber *> *)samplers {
    gpt_sampler_params->samplers.clear();
    for (NSNumber *sampler in samplers) {
        gpt_sampler_params->samplers.push_back(static_cast<gpt_sampler_type>(sampler.intValue));
    }
}

//// For `logitBias`, convert from NSArray<NSNumber *> to std::vector
//- (NSArray<NSNumber *> *)logitBias {
//    NSMutableArray<llama_logit_bias *> *logitBiasArray = [NSMutableArray array];
//    for (auto bias : gpt_sampler_params.logit_bias) {
//        [logitBiasArray addObject:bias];
//    }
//    return [logitBiasArray copy];
//}
//
//- (void)setLogitBias:(NSArray<NSNumber *> *)logitBias {
//    gpt_sampler_params.logit_bias.clear();
//    for (NSNumber *bias in logitBias) {
//        gpt_sampler_params.logit_bias.push_back(bias.floatValue);
//    }
//}

// For `grammar`, convert between NSString and std::string
- (NSString *)grammar {
    return [NSString stringWithUTF8String:gpt_sampler_params->grammar.c_str()];
}

- (void)setGrammar:(NSString *)grammar {
    gpt_sampler_params->grammar = std::string([grammar UTF8String]);
}

// Method to print out the parameters as a string
- (NSString *)print {
    NSMutableString *output = [NSMutableString stringWithString:@"GPT Sampler Params:\n"];
    [output appendFormat:@"Seed: %u\n", self.seed];
    [output appendFormat:@"nPrev: %d\n", self.nPrev];
    [output appendFormat:@"nProbs: %d\n", self.nProbs];
    [output appendFormat:@"minKeep: %d\n", self.minKeep];
    [output appendFormat:@"topK: %d\n", self.topK];
    [output appendFormat:@"topP: %.2f\n", self.topP];
    [output appendFormat:@"minP: %.2f\n", self.minP];
    [output appendFormat:@"tfsZ: %.2f\n", self.tfsZ];
    [output appendFormat:@"typP: %.2f\n", self.typP];
    [output appendFormat:@"temp: %.2f\n", self.temp];
    [output appendFormat:@"dynatempRange: %.2f\n", self.dynatempRange];
    [output appendFormat:@"dynatempExponent: %.2f\n", self.dynatempExponent];
    [output appendFormat:@"penaltyLastN: %d\n", self.penaltyLastN];
    [output appendFormat:@"penaltyRepeat: %.2f\n", self.penaltyRepeat];
    [output appendFormat:@"penaltyFreq: %.2f\n", self.penaltyFreq];
    [output appendFormat:@"penaltyPresent: %.2f\n", self.penaltyPresent];
    [output appendFormat:@"mirostat: %d\n", self.mirostat];
    [output appendFormat:@"mirostatTau: %.2f\n", self.mirostatTau];
    [output appendFormat:@"mirostatEta: %.2f\n", self.mirostatEta];
    [output appendFormat:@"penalizeNl: %@\n", self.penalizeNl ? @"YES" : @"NO"];
    [output appendFormat:@"ignoreEos: %@\n", self.ignoreEos ? @"YES" : @"NO"];
    [output appendFormat:@"noPerf: %@\n", self.noPerf ? @"YES" : @"NO"];
    [output appendFormat:@"Grammar: %@\n", self.grammar];

    // Print samplers
    [output appendString:@"Samplers: "];
    for (NSNumber *sampler in self.samplers) {
        [output appendFormat:@"%d, ", sampler.intValue];
    }
    [output appendString:@"\n"];

    // Print logit biases
    [output appendString:@"Logit Biases: "];
    for (NSNumber *bias in self.logitBias) {
        [output appendFormat:@"%.2f, ", bias.floatValue];
    }
    [output appendString:@"\n"];

    return [output copy];
}

- (gpt_sampler_params&)cParams {
    return *gpt_sampler_params;
}

@end

@implementation GPTParams {
    gpt_params gpt_params;
}

- (NSArray<NSString *> *)antiPrompts {
    auto antiprompts = [[NSMutableArray alloc] init];
    for (auto& antiprompt : gpt_params.antiprompt) {
        [antiprompts addObject:[NSString stringWithCString:antiprompt.c_str() encoding:NSUTF8StringEncoding]];
    }
    return antiprompts;
}

- (gpt_params&)params {
    return gpt_params;
}

- (int32_t)nPredict {
    return gpt_params.n_predict;
}

- (void)setNPredict:(int32_t)nPredict {
    gpt_params.n_predict = nPredict;
}

- (NSInteger)nCtx {
    return gpt_params.n_ctx;
}

- (void)setNCtx:(NSInteger)nCtx {
    gpt_params.n_ctx = nCtx;
}

- (int32_t)nBatch {
    return gpt_params.n_batch;
}

- (void)setNBatch:(int32_t)nBatch {
    gpt_params.n_batch = nBatch;
}

- (int32_t)nUBatch {
    return gpt_params.n_ubatch;
}

- (void)setNUBatch:(int32_t)nUBatch {
    gpt_params.n_ubatch = nUBatch;
}

- (int32_t)nKeep {
    return gpt_params.n_keep;
}

- (void)setNKeep:(int32_t)nKeep {
    gpt_params.n_keep = nKeep;
}

- (int32_t)nDraft {
    return gpt_params.n_draft;
}

- (void)setNDraft:(int32_t)nDraft {
    gpt_params.n_draft = nDraft;
}

- (int32_t)nChunks {
    return gpt_params.n_chunks;
}

- (void)setNChunks:(int32_t)nChunks {
    gpt_params.n_chunks = nChunks;
}

- (int32_t)nParallel {
    return gpt_params.n_parallel;
}

- (void)setNParallel:(int32_t)nParallel {
    gpt_params.n_parallel = nParallel;
}

- (int32_t)nSequences {
    return gpt_params.n_sequences;
}

- (void)setNSequences:(int32_t)nSequences {
    gpt_params.n_sequences = nSequences;
}

- (float)pSplit {
    return gpt_params.p_split;
}

- (void)setPSplit:(float)pSplit {
    gpt_params.p_split = pSplit;
}

- (int32_t)nGpuLayers {
    return gpt_params.n_gpu_layers;
}

- (void)setNGpuLayers:(int32_t)nGpuLayers {
    gpt_params.n_gpu_layers = nGpuLayers;
}

- (int32_t)nGpuLayersDraft {
    return gpt_params.n_gpu_layers_draft;
}

- (void)setNGpuLayersDraft:(int32_t)nGpuLayersDraft {
    gpt_params.n_gpu_layers_draft = nGpuLayersDraft;
}

- (int32_t)mainGpu {
    return gpt_params.main_gpu;
}

- (void)setMainGpu:(int32_t)mainGpu {
    gpt_params.main_gpu = mainGpu;
}

- (int32_t)grpAttnN {
    return gpt_params.grp_attn_n;
}

- (void)setGrpAttnN:(int32_t)grpAttnN {
    gpt_params.grp_attn_n = grpAttnN;
}

- (int32_t)grpAttnW {
    return gpt_params.grp_attn_w;
}

- (void)setGrpAttnW:(int32_t)grpAttnW {
    gpt_params.grp_attn_w = grpAttnW;
}

- (int32_t)nPrint {
    return gpt_params.n_print;
}

- (void)setNPrint:(int32_t)nPrint {
    gpt_params.n_print = nPrint;
}

- (float)ropeFreqBase {
    return gpt_params.rope_freq_base;
}

- (void)setRopeFreqBase:(float)ropeFreqBase {
    gpt_params.rope_freq_base = ropeFreqBase;
}

- (float)ropeFreqScale {
    return gpt_params.rope_freq_scale;
}

- (void)setRopeFreqScale:(float)ropeFreqScale {
    gpt_params.rope_freq_scale = ropeFreqScale;
}

- (float)yarnExtFactor {
    return gpt_params.yarn_ext_factor;
}

- (void)setYarnExtFactor:(float)yarnExtFactor {
    gpt_params.yarn_ext_factor = yarnExtFactor;
}

- (float)yarnAttnFactor {
    return gpt_params.yarn_attn_factor;
}

- (void)setYarnAttnFactor:(float)yarnAttnFactor {
    gpt_params.yarn_attn_factor = yarnAttnFactor;
}

- (float)yarnBetaFast {
    return gpt_params.yarn_beta_fast;
}

- (void)setYarnBetaFast:(float)yarnBetaFast {
    gpt_params.yarn_beta_fast = yarnBetaFast;
}

- (float)yarnBetaSlow {
    return gpt_params.yarn_beta_slow;
}

- (void)setYarnBetaSlow:(float)yarnBetaSlow {
    gpt_params.yarn_beta_slow = yarnBetaSlow;
}

- (int32_t)yarnOrigCtx {
    return gpt_params.yarn_orig_ctx;
}

- (void)setYarnOrigCtx:(int32_t)yarnOrigCtx {
    gpt_params.yarn_orig_ctx = yarnOrigCtx;
}

- (float)defragThold {
    return gpt_params.defrag_thold;
}

- (void)setDefragThold:(float)defragThold {
    gpt_params.defrag_thold = defragThold;
}

// Assuming tensorSplit remains a fixed array in C struct, we can create a method to access specific values.
- (float)tensorSplitAtIndex:(NSUInteger)index {
    if (index < 128) {
        return gpt_params.tensor_split[index];
    }
    return 0.0f; // Return default value if index is out of bounds
}

- (void)setTensorSplitValue:(float)value atIndex:(NSUInteger)index {
    if (index < 128) {
        gpt_params.tensor_split[index] = value;
    }
}

- (BOOL)embedding {
    return gpt_params.embedding;
}

- (void)setEmbedding:(BOOL)embedding {
    gpt_params.embedding = embedding;
}

- (LlamaModelParams *)LlamaModelParams {
    return nil;
}

- (BOOL)ctxShift {
    return gpt_params.ctx_shift;
}

- (void)setCtxShift:(BOOL)ctxShift {
    gpt_params.ctx_shift = ctxShift;
}

- (CPUParams *)cpuParams {
    return [[CPUParams alloc] initWithParams:gpt_params.cpuparams];
}

- (CPUParams *)cpuParamsBatch {
    return [[CPUParams alloc] initWithParams:gpt_params.cpuparams_batch];
}

- (GPTSamplerParams *)samplerParams {
    return [[GPTSamplerParams alloc] initWithParams:gpt_params.sparams];
}

- (NSString *)modelURL {
    return [NSString stringWithCString:gpt_params.model_url.c_str() encoding:NSUTF8StringEncoding];
}

- (void)setModelURL:(NSString *)modelURL {
    gpt_params.model_url = [modelURL cStringUsingEncoding:NSUTF8StringEncoding];
}

- (NSString *)modelPath {
    return [NSString stringWithCString:gpt_params.model.c_str() encoding:NSUTF8StringEncoding];
}

- (void)setModelPath:(NSString *)modelPath {
    gpt_params.model = [modelPath cStringUsingEncoding:NSUTF8StringEncoding];
}

- (NSString *)pathPromptCache {
    return [[NSString alloc] initWithCString:gpt_params.path_prompt_cache.c_str() encoding:NSUTF8StringEncoding];
}

- (void)setPathPromptCache:(NSString *)pathPromptCache {
    gpt_params.path_prompt_cache = [pathPromptCache cStringUsingEncoding:NSUTF8StringEncoding];
}

- (BOOL)enableChatTemplate {
    return gpt_params.enable_chat_template;
}

- (void)setEnableChatTemplate:(BOOL)enableChatTemplate {
    gpt_params.enable_chat_template = enableChatTemplate;
}

- (NSString *)chatTemplate {
    return [NSString stringWithCString:gpt_params.chat_template.c_str()
                              encoding:NSUTF8StringEncoding];
}

- (void)setChatTemplate:(NSString *)chatTemplate {
    gpt_params.chat_template = [chatTemplate cStringUsingEncoding:NSUTF8StringEncoding];
}

- (NSString *)inputPrefix {
    return [NSString stringWithCString:gpt_params.input_prefix.c_str()
                              encoding:NSUTF8StringEncoding];
}

- (void)setInputPrefix:(NSString *)inputPrefix {
    gpt_params.input_prefix = [inputPrefix cStringUsingEncoding:NSUTF8StringEncoding];
}

- (NSString *)inputSuffix {
    return [NSString stringWithCString:gpt_params.input_suffix.c_str()
                              encoding:NSUTF8StringEncoding];
}

- (void)setInputSuffix:(NSString *)inputSuffix {
    gpt_params.input_suffix = [inputSuffix cStringUsingEncoding:NSUTF8StringEncoding];
}


- (LlamaContextParams *)llamaContextParams {
}

- (LlamaModelParams *)llamaModelParams {
}

@end
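The classes above are thin property bridges: each Objective-C accessor reads or writes the corresponding field of an underlying C++ `gpt_params` / `gpt_sampler_params` struct, so mutations through the wrappers flow straight into the llama.cpp common layer. A rough usage sketch (not part of the commit; the property names come from the implementations above, while the path and values are illustrative):

    GPTParams *params = [[GPTParams alloc] init];
    params.modelPath = @"models/Llama-3.2-3B-Instruct-Q4_0.gguf"; // illustrative path
    params.nCtx      = 4096;            // -> gpt_params.n_ctx
    params.nPredict  = 256;             // -> gpt_params.n_predict
    params.samplerParams.temp = 0.8f;   // writes through to gpt_params.sparams.temp
    params.cpuParams.nThreads = 8;      // writes through to cpuparams.n_threads

Note that `samplerParams` and `cpuParams` return fresh wrapper objects on each call, but both hold pointers into the same `gpt_params` instance, so the write-through still works.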
objc/GPTSampler.mm  (new file, 49 lines)

@@ -0,0 +1,49 @@
#import <Foundation/Foundation.h>
#import <GPTSampler.h>
#import <GPTParams_Private.hpp>
#import <LlamaModel_Private.hpp>
#import <LlamaContext_Private.hpp>
#import "../../common/sampling.h"

@implementation GPTSampler {
    gpt_sampler *sampler;
}

- (instancetype)init:(LlamaModel *)model gptSamplerParams:(GPTSamplerParams *)gptSamplerParams
{
    self = [super init];
    if (self) {
        self->sampler = gpt_sampler_init([model cModel], [gptSamplerParams cParams]);
    }
    return self;
}

- (uint32_t)seed {
    return gpt_sampler_get_seed(sampler);
}

- (LlamaToken)sample:(LlamaContext *)context index:(NSInteger)index {
    return [self sample:context index:index grammarFirst:false];
}

- (LlamaToken)sample:(LlamaContext *)context index:(NSInteger)index grammarFirst:(BOOL)grammarFirst {
    return gpt_sampler_sample(sampler, [context cContext], index, grammarFirst);
}

- (void)accept:(LlamaToken)token acceptGrammar:(BOOL)acceptGrammar {
    gpt_sampler_accept(sampler, token, acceptGrammar);
}

- (NSString *)previousString:(LlamaContext *)context n:(NSInteger)n {
    return [[NSString alloc] initWithCString:gpt_sampler_prev_str(sampler, [context cContext], n).data() encoding:NSUTF8StringEncoding];
}

- (LlamaToken)last {
    return gpt_sampler_last(sampler);
}

- (void)reset {
    gpt_sampler_reset(sampler);
}

@end
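GPTSampler wraps the `gpt_sampler` chain from common/sampling.h: `sample:index:` draws the next token from the context's current logits, and `accept:acceptGrammar:` pushes it back into the sampler history. A minimal generation-loop sketch (assumes `model`, `ctx`, and `params` objects created as in the earlier sections; this is an illustration, not part of the commit):

    GPTSampler *smpl = [[GPTSampler alloc] init:model gptSamplerParams:[params samplerParams]];
    int nRemain = 64;                                  // sampling budget (illustrative)
    while (nRemain-- > 0) {
        LlamaToken tok = [smpl sample:ctx index:-1];   // sample from last logits
        [smpl accept:tok acceptGrammar:true];          // update sampler state
        if ([model tokenIsEOG:tok]) break;             // stop at end-of-generation
        NSLog(@"%@", [ctx tokenToPiece:tok]);          // detokenize for display
        // ...the token would then be fed back through -decode: on the context
    }

This is the same shape as the loop in LlamaSession.mm further down.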
objc/LlamaBatch.mm  (new file, 21 lines)

@@ -0,0 +1,21 @@
#import <Foundation/Foundation.h>
#import "LlamaBatch_Private.hpp"
#import "llama.h"

@implementation LlamaBatch {
    llama_batch batch;
}

- (instancetype)initWithBatch:(llama_batch)batch {
    self->batch = batch;
}

- (NSData *)output {
    return [[NSData alloc] initWithBytes:batch.logits length:batch.n_tokens];
}

- (llama_batch)cBatch {
    return batch;
}

@end
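As committed, `-initWithBatch:` assigns the ivar but never calls `[super init]` and never returns a value, so the behavior at the call site is undefined. A conventional form of the initializer would read (sketch only, not the committed code):

    - (instancetype)initWithBatch:(llama_batch)batch {
        self = [super init];      // missing in the committed version
        if (self) {
            self->batch = batch;
        }
        return self;              // also missing in the committed version
    }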
objc/LlamaContext.mm  (new file, 94 lines)

@@ -0,0 +1,94 @@
#import <Foundation/Foundation.h>
#import "LlamaContext_Private.hpp"
#import "GPTParams_Private.hpp"
#import "LlamaModel_Private.hpp"
#import "LlamaBatch_Private.hpp"
#import "../../common/common.h"

@implementation LlamaContext {
    llama_context *ctx;
}

- (instancetype)initWithContext:(llama_context *)context {
    self = [super init];
    if (self) {
        ctx = context;
    }
    return self;
}

- (void)attachThreadpool:(GGMLThreadpool *)threadpool
         threadpoolBatch:(GGMLThreadpool *)threadpoolBatch {
    llama_attach_threadpool(ctx, [threadpool threadpool], [threadpoolBatch threadpool]);
}


- (NSUInteger)nCtx {
    return llama_n_ctx(ctx);
}

- (BOOL)loadStateFile:(NSString *)pathSession
            tokensOut:(llama_token *)tokensOut
        nTokenCpacity:(size_t)nTokenCapacity
       nTokenCountOut:(size_t *)nTokenCountOut {
    return llama_state_load_file(ctx, [pathSession cStringUsingEncoding:NSUTF8StringEncoding], tokensOut, nTokenCapacity, nTokenCountOut);
}

- (LlamaModel *)model {
    auto model = llama_get_model(ctx);
    return [[LlamaModel alloc] init:std::remove_const_t<llama_model *>(model)];
}

- (std::vector<llama_token>)tokenize:(NSString *)text
                          addSpecial:(BOOL)addSpecial
                        parseSpecial:(BOOL)parseSpecial {
    return llama_tokenize(ctx, [text cStringUsingEncoding:NSUTF8StringEncoding], addSpecial, parseSpecial);
}

- (std::string)convertTokensToString:(const std::vector<llama_token>&)tokens {
    return string_from(ctx, tokens);
}

- (llama_context *)cContext {
    return ctx;
}

- (int32_t)encode:(llama_batch)batch {
    return llama_encode(ctx, batch);
}

- (void)kvCacheSeqAdd:(LlamaSequenceId)sequenceId
                   p0:(LlamaPosition)p0
                   p1:(LlamaPosition)p1
                delta:(LlamaPosition)delta {
    llama_kv_cache_seq_add(ctx, sequenceId, p0, p1, delta);
}

- (void)kvCacheSeqDiv:(LlamaSequenceId)sequenceId
                   p0:(LlamaPosition)p0
                   p1:(LlamaPosition)p1
                delta:(LlamaPosition)delta {
    llama_kv_cache_seq_div(ctx, sequenceId, p0, p1, delta);
}

- (NSString *)tokenToPiece:(LlamaToken)token {
    return [self tokenToPiece:token special:YES];
}

- (NSString *)tokenToPiece:(LlamaToken)token special:(BOOL)special {
    return [[NSString alloc] initWithCString:llama_token_to_piece(ctx, token, special).c_str() encoding:NSUTF8StringEncoding];
}

- (NSInteger)decode:(LlamaBatch *)batch {
    return llama_decode(ctx, [batch cBatch]);
}

- (BOOL)saveStateFile:(NSString *)pathSession
               tokens:(const LlamaToken *)tokens
          nTokenCount:(size_t)nTokenCount {
    return llama_state_save_file(ctx,
                                 [pathSession cStringUsingEncoding:NSUTF8StringEncoding],
                                 tokens, nTokenCount);
}

@end
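LlamaContext forwards directly to the C API (`llama_state_load_file`, `llama_decode`, and so on) and exposes C++ types such as `std::vector<llama_token>` in its selectors, which is why callers must themselves be Objective-C++. A hedged sketch of the session-cache round trip these wrappers enable (path and buffer size are illustrative; note that the `nTokenCpacity:` spelling is part of the committed selector):

    std::vector<llama_token> toks(512);
    size_t nLoaded = 0;
    if ([ctx loadStateFile:@"/tmp/session.bin"      // illustrative path
                 tokensOut:toks.data()
             nTokenCpacity:toks.size()
            nTokenCountOut:&nLoaded]) {
        toks.resize(nLoaded);                       // tokens restored from disk
    }
    // ...generate...
    [ctx saveStateFile:@"/tmp/session.bin" tokens:toks.data() nTokenCount:toks.size()];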
objc/LlamaModel.mm  (new file, 70 lines)

@@ -0,0 +1,70 @@
#import <Foundation/Foundation.h>
#import "LlamaModel_Private.hpp"
#import "LlamaContext_Private.hpp"
#import "LlamaBatch_Private.hpp"
#import "GPTParams_Private.hpp"
#import "GPTSampler.h"
#import "ggml.h"
#import "../common/common.h"

@implementation LlamaChatMessage
@end

@implementation LlamaModel {
    llama_model *model;
}

- (instancetype)init:(llama_model *)l_model {
    self = [super init];
    if (self) {
        model = l_model;
    }
    return self;
}

- (LlamaContext *)context:(LlamaContextParams *)params {
    return nil;
}

- (BOOL)addBOSToken {
    return llama_add_bos_token(model);
}

- (BOOL)addEOSToken {
    return llama_add_eos_token(model);
}

- (LlamaToken)tokenBOS {
    return llama_token_bos(model);
}

- (int32_t)nCtxTrain {
    return llama_n_ctx_train(model);
}

- (NSString *)formatExample:(NSString *)tmpl {
    return [[NSString alloc] initWithCString:llama_chat_format_example(model, [tmpl cStringUsingEncoding:NSUTF8StringEncoding]).c_str()
                                    encoding:NSUTF8StringEncoding];
}

- (BOOL)hasEncoder {
    return llama_model_has_encoder(model);
}

- (llama_model *)cModel {
    return model;
}

- (BOOL)tokenIsEOG:(LlamaToken)token {
    return llama_token_is_eog(model, token);
}

- (LlamaToken)tokenEOT {
    return llama_token_eot(model);
}

- (LlamaToken)tokenEOS {
    return llama_token_eos(model);
}

@end
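LlamaModel is a one-to-one bridge to the model-level `llama_*` queries, and the session code below leans on a few of them. A condensed illustration (assumes `model` is a loaded LlamaModel; not part of the commit):

    if ([model addBOSToken]) {
        LlamaToken bos = [model tokenBOS];   // vocab expects prompts to start with BOS;
        // LlamaSession.mm below seeds an empty prompt with exactly this token
    }
    int32_t nCtxTrain = [model nCtxTrain];   // compared against the runtime n_ctx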
objc/LlamaObjC.mm  (new file, 2 lines)

@@ -0,0 +1,2 @@
#import "LlamaObjC.h"

objc/LlamaSession.mm  (new file, 906 lines)

@@ -0,0 +1,906 @@
#import <Foundation/Foundation.h>
#import "LlamaSession_Private.hpp"
#import "../../common/common.h"
#import "LlamaModel_Private.hpp"
#import "LlamaContext_Private.hpp"
#import "GPTSampler.h"
#import <OSLog/OSLog.h>
#import "ggml.h"
#import "GPTParams_Private.hpp"
#import "LlamaBatch_Private.hpp"

@implementation BlockingLineQueue {
    // Input queue and related synchronization
    NSMutableArray<NSString *> *inputQueue;
    NSCondition *inputCondition;

    // Output queue and related synchronization
    NSMutableArray<NSString *> *outputQueue;
    NSCondition *outputCondition;

    // Log queue
    NSMutableArray<NSString *> *log;
}

- (instancetype)init {
    if (self = [super init]) {
        inputQueue = [NSMutableArray new];
        outputQueue = [NSMutableArray new];
        log = [NSMutableArray new];
        inputCondition = [[NSCondition alloc] init];
        outputCondition = [[NSCondition alloc] init];
    }
    return self;
}

- (void)addInputLine:(NSString *)line {
    [inputCondition lock];
    [inputQueue addObject:line];
    [log addObject:line];
    [inputCondition signal]; // Notify that a new input line is available
    [inputCondition unlock];
}

- (NSString *)inputLine {
    [inputCondition lock];
    while ([inputQueue count] == 0) {
        [inputCondition wait];
    }
    NSString *line = [inputQueue objectAtIndex:0];
    [inputQueue removeObjectAtIndex:0];
    [inputCondition unlock];
    return line;
}

- (void)addOutputLine:(NSString *)line {
    [outputCondition lock];
    [outputQueue addObject:line];
    [log addObject:line];
    [outputCondition signal]; // Notify that a new output line is available
    [outputCondition unlock];
}

- (NSString *)outputLine {
    [outputCondition lock];
    while ([outputQueue count] == 0) {
        [outputCondition wait];
    }
    NSString *line = [outputQueue objectAtIndex:0];
    [outputQueue removeObjectAtIndex:0];
    [outputCondition unlock];
    return line;
}
@end

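// NOTE (illustrative sketch, not part of the committed file): BlockingLineQueue
// above is the handoff between a driver thread and the inference loop in
// -start: below. A typical pairing, with `session` assumed to be a configured
// LlamaSession:
//
//     BlockingLineQueue *queue = [[BlockingLineQueue alloc] init];
//     dispatch_async(dispatch_get_global_queue(QOS_CLASS_USER_INITIATED, 0), ^{
//         [session start:queue];              // blocks on -inputLine between turns
//     });
//     [queue addInputLine:@"Hello!"];         // producer side
//     NSString *reply = [queue outputLine];   // blocks until the model answers
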
@implementation LlamaSession {
    std::vector<llama_token> embd_inp;
    std::vector<llama_chat_msg> chat_msgs;
    GPTParams *params;
    GPTSampler *smpl;
    BOOL isInteracting;

    bool is_antiprompt;
    bool input_echo;
    bool display;
    bool need_to_save_session;

    int n_past;
    int n_remain;
    int n_consumed;
    int n_session_consumed;

    std::vector<int> input_tokens;
    std::vector<int> output_tokens;;
    std::ostringstream output_ss;
    std::stringstream last_output_ss;
    std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode

    std::vector<llama_token> embd;
    NSMutableString *pathSession;
    NSInteger ga_i;
    NSInteger ga_n;
    NSInteger ga_w;
    std::vector<llama_token> session_tokens;
    // tokenized antiprompts
    std::vector<std::vector<llama_token>> antiprompt_ids;
    BOOL need_insert_eot;
    int n_ctx;
}

- (NSString *)chat_add_and_format:(std::vector<llama_chat_msg> &) chat_msgs role:(const std::string &) role content:(const std::string &) content {
    llama_chat_msg new_msg{role, content};
    auto formatted = llama_chat_format_single([self.model cModel], [params params].chat_template, chat_msgs, new_msg, role == "user");
    chat_msgs.push_back({role, content});
    os_log_debug(OS_LOG_DEFAULT, "formatted: '%s'\n", formatted.c_str());
    return [NSString stringWithCString:formatted.c_str() encoding:NSUTF8StringEncoding];
}

static BOOL file_is_empty(NSString *path) {
    NSFileManager *manager = [NSFileManager defaultManager];
    if ([manager fileExistsAtPath:path]) {
        NSDictionary *attributes = [manager attributesOfItemAtPath:path error:nil];
        unsigned long long size = [attributes fileSize];
        if (attributes && size == 0) {
            return true;
        } else {
            return false;
        }
    }
    return true;
}

- (instancetype)initWithParams:(GPTParams *)params {
    self = [super init];

    self->params = params;
//    model = llama_init.model;
//    ctx = llama_init.context;
//
//    if model == nil {
//        LOG_ERR("%s: error: unable to load model\n", __func__);
//        return 1;
//    }
//
    os_log_info(OS_LOG_DEFAULT,
                "%s: llama threadpool init, n_threads = %d\n",
                __func__, params.cpuParams.nThreads);

    if (params.embedding) {
        os_log_error(OS_LOG_DEFAULT,
                     R"(************
please use the 'embedding' tool for embedding calculations
************)");
        abort();
    }

    if (params.nCtx != 0 && params.nCtx < 8) {
        os_log_info(OS_LOG_DEFAULT, "minimum context size is 8, using minimum size.");
        params.nCtx = 8;
    }

    if (params.ropeFreqBase != 0) {
        os_log_info(OS_LOG_DEFAULT, "changing RoPE frequency base to \(params.ropeFreqBase)");
    }

    if (params.ropeFreqScale != 0.0) {
        os_log_info(OS_LOG_DEFAULT, "scaling RoPE frequency by \(params.ropeFreqScale)");
    }

    llama_backend_init();
    llama_numa_init(ggml_numa_strategy(params.numaStrategy));
    auto llama_init = llama_init_from_gpt_params([params params]);

    auto tpp_batch = params.cpuParamsBatch.ggmlThreadpoolParams;
    auto tpp = params.cpuParams.ggmlThreadpoolParams;

    set_process_priority(ggml_sched_priority(params.cpuParams.priority));

    GGMLThreadpool *threadpool_batch;
    if (tpp != tpp_batch) {
        threadpool_batch = [tpp_batch threadpool];
        if (!threadpool_batch) {
            [NSException raise:@"batch threadpool create failed"
                        format:@"batch threadpool create failed"];
        }

        // Start the non-batch threadpool in the paused state
        tpp.paused = true;
    }

    GGMLThreadpool *threadpool = [tpp threadpool];
    if (!threadpool) {
        [NSException raise:@"threadpool create failed"
                    format:@"threadpool create failed"];
    }

    self.ctx = [[LlamaContext alloc] initWithContext:llama_init.context];
    [self.ctx attachThreadpool:threadpool threadpoolBatch:threadpool_batch];
    self.model = [[LlamaModel alloc] init:llama_init.model];
    const int n_ctx_train = [self.model nCtxTrain];
    n_ctx = [self.ctx nCtx];
//
    if (n_ctx > n_ctx_train) {
        os_log_info(OS_LOG_DEFAULT, "%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
    }

    // print chat template example in conversation mode
    if (params.conversation) {
        if (params.enableChatTemplate) {
            os_log_info(OS_LOG_DEFAULT, "%s: chat template example:\n%s\n", __func__,
                        [[self.model formatExample:params.chatTemplate] cStringUsingEncoding:NSUTF8StringEncoding]);
        } else {
            os_log_info(OS_LOG_DEFAULT, "%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
        }
    }
    // print system information
    @autoreleasepool {
        NSLog(@"%s", gpt_params_get_system_info([params params]).c_str());
    }

    pathSession = [[NSMutableString alloc] initWithString:params.pathPromptCache];

    NSFileManager *fileManager = [NSFileManager defaultManager];

    if ([pathSession length] != 0) {
        os_log_info(OS_LOG_DEFAULT, "%s: attempting to load saved session from '%s'\n", __func__, [pathSession cStringUsingEncoding:NSUTF8StringEncoding]);
        if (![fileManager fileExistsAtPath:pathSession]) {
            os_log_info(OS_LOG_DEFAULT, "%s: session file does not exist, will create.\n", __func__);
        } else if (file_is_empty(pathSession)) {
            os_log_info(OS_LOG_DEFAULT, "%s: The session file is empty. A new session will be initialized.\n", __func__);
        } else {
            // The file exists and is not empty
            session_tokens.resize(n_ctx);
            size_t n_token_count_out = 0;
            if (![self.ctx loadStateFile:pathSession tokensOut:session_tokens.data() nTokenCpacity:session_tokens.capacity() nTokenCountOut:&n_token_count_out]) {
                [NSException raise:@"SessionLoadFailure" format:@"%s: failed to load session file '%s'\n", __func__, [pathSession cStringUsingEncoding:NSUTF8StringEncoding]];
            }
            session_tokens.resize(n_token_count_out);
            os_log_info(OS_LOG_DEFAULT, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
        }
    }

    BOOL addBOS = [self.model addBOSToken];
    if (![self.model hasEncoder]) {
        GGML_ASSERT(![self.model addEOSToken]);
    }

    os_log_debug(OS_LOG_DEFAULT, "n_ctx: %d, add_bos: %d\n", n_ctx, addBOS);


    {
        auto prompt = (params.conversation && params.enableChatTemplate && params.prompt.length > 0)
            ? [self chat_add_and_format:chat_msgs role:"system" content:[params params].prompt] // format the system prompt in conversation mode
            : params.prompt;
        if (params.interactiveFirst || [params.prompt length] > 0 || session_tokens.empty()) {
            os_log_debug(OS_LOG_DEFAULT, "tokenize the prompt\n");
            embd_inp = [self.ctx tokenize:prompt addSpecial:true parseSpecial:true];
        } else {
            os_log_debug(OS_LOG_DEFAULT, "use session tokens\n");
            embd_inp = session_tokens;
        }

        os_log_debug(OS_LOG_DEFAULT, "prompt: \"%s\"\n", [prompt cStringUsingEncoding:NSUTF8StringEncoding]);
        os_log_debug(OS_LOG_DEFAULT, "tokens: %s\n", [self.ctx convertTokensToString:embd_inp].c_str());
    }

    // Should not run without any tokens
    if (embd_inp.empty()) {
        if (addBOS) {
            embd_inp.push_back([self.model tokenBOS]);
            // LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
        } else {
            [NSException raise:@"InputEmptyError" format:@"input is empty"];
        }
    }

    // Tokenize negative prompt
    if (embd_inp.size() > n_ctx - 4) {
        [NSException raise:@"PromptError" format:@"%s: prompt is too long (%d tokens, max %d)\n", __func__, (int)embd_inp.size(), n_ctx - 4];
    }

    // debug message about similarity of saved session, if applicable
    size_t n_matching_session_tokens = 0;
    if (!session_tokens.empty()) {
        for (llama_token id : session_tokens) {
            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
                break;
            }
            n_matching_session_tokens++;
        }
        if ([params.prompt length] == 0 && n_matching_session_tokens == embd_inp.size()) {
            // LOG_INF("%s: using full prompt from session file\n", __func__);
        } else if (n_matching_session_tokens >= embd_inp.size()) {
            // LOG_INF("%s: session file has exact match for prompt!\n", __func__);
        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
            // LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
            //         __func__, n_matching_session_tokens, embd_inp.size());
        } else {
            // LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
            //         __func__, n_matching_session_tokens, embd_inp.size());
        }

        // remove any "future" tokens that we might have inherited from the previous session
        llama_kv_cache_seq_rm([self.ctx cContext], -1, n_matching_session_tokens, -1);
    }
//
//    os_log_debug(OS_LOG_DEFAULT, "recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
//                 embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
//
    // if we will use the cache for the full prompt without reaching the end of the cache, force
    // reevaluation of the last token to recalculate the cached logits
    if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
        // os_log_debug(OS_LOG_DEFAULT, "recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);

        session_tokens.resize(embd_inp.size() - 1);
    }

    // number of tokens to keep when resetting context
    if (params.nKeep < 0 || params.nKeep > (int) embd_inp.size()) {
        params.nKeep = (int)embd_inp.size();
    } else {
        params.nKeep += addBOS; // always keep the BOS token
    }

    if (params.conversation) {
        params.interactiveFirst = true;
    }

    // enable interactive mode if interactive start is specified
    if (params.interactiveFirst) {
        params.interactive = true;
    }

    if (params.verbosePrompt) {
        // LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        // LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
            os_log_info(OS_LOG_DEFAULT, "%6d -> '%s'\n", embd_inp[i],
                        [[self.ctx tokenToPiece:embd_inp[i]] cStringUsingEncoding:NSUTF8StringEncoding]);
        }

        if (params.nKeep > addBOS) {
            // LOG_INF("%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.nKeep; i++) {
                os_log_debug(OS_LOG_DEFAULT, "%s",
                             [[self.ctx tokenToPiece:embd_inp[i]] cStringUsingEncoding:NSUTF8StringEncoding]);
            }
            // LOG("'\n");
        }
        // LOG_INF("\n");
    }
//
//    // ctrl+C handling
//    {
//#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
//        struct sigaction sigint_action;
//        sigint_action.sa_handler = sigint_handler;
//        sigemptyset (&sigint_action.sa_mask);
//        sigint_action.sa_flags = 0;
//        sigaction(SIGINT, &sigint_action, NULL);
//#elif defined (_WIN32)
//        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
//            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
//        };
//        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
//#endif
//    }
//
    if (params.interactive) {
        os_log_info(OS_LOG_DEFAULT, "%s: interactive mode on.\n", __func__);

        if ([params.antiPrompts count] > 0) {
            for (NSString *antiprompt in params.antiPrompts) {
                os_log_info(OS_LOG_DEFAULT, "Reverse prompt: '%s'\n", [antiprompt cStringUsingEncoding:NSUTF8StringEncoding]);
                if (params.verbosePrompt) {
                    auto tmp = [_ctx tokenize:antiprompt
                                   addSpecial:false
                                 parseSpecial:true];
                    for (int i = 0; i < (int) tmp.size(); i++) {
                        os_log_info(OS_LOG_DEFAULT, "%6d -> '%s'\n", tmp[i], [[self.ctx tokenToPiece:tmp[i]] cStringUsingEncoding:NSUTF8StringEncoding]);
                    }
                }
            }
        }

        if (params.inputPrefixBOS) {
            os_log_info(OS_LOG_DEFAULT, "Input prefix with BOS\n");
        }

        if ([params.inputPrefix length] > 0) {
            os_log_info(OS_LOG_DEFAULT, "Input prefix: '%s'\n", [params.inputPrefix cStringUsingEncoding:NSUTF8StringEncoding]);
            if (params.verbosePrompt) {
                auto tmp = [_ctx tokenize:params.inputPrefix addSpecial:true parseSpecial:true];
                for (int i = 0; i < (int) tmp.size(); i++) {
                    os_log_info(OS_LOG_DEFAULT, "%6d -> '%s'\n",
                                tmp[i], [[self.ctx tokenToPiece:tmp[i]] cStringUsingEncoding:NSUTF8StringEncoding]);
                }
            }
        }

        if ([params.inputSuffix length] > 0) {
            os_log_info(OS_LOG_DEFAULT, "Input suffix: '%s'\n", [params.inputSuffix cStringUsingEncoding:NSUTF8StringEncoding]);
            if (params.verbosePrompt) {
                auto tmp = [_ctx tokenize:params.inputSuffix addSpecial:false parseSpecial:true];
                for (int i = 0; i < (int) tmp.size(); i++) {
                    os_log_info(OS_LOG_DEFAULT, "%6d -> '%s'\n",
                                tmp[i], [[self.ctx tokenToPiece:tmp[i]] cStringUsingEncoding:NSUTF8StringEncoding]);
                }
            }
        }
    }

    smpl = [[GPTSampler alloc] init:_model gptSamplerParams:[params samplerParams]];
    if (!smpl) {
        [NSException raise:@"SamplingFailure" format:@"failed to initialize sampling subsystem"];
    }

    os_log_info(OS_LOG_DEFAULT, "sampler seed: %u\n", [smpl seed]);
    // LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
    // LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
    //
    // LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    //
    // group-attention state
    // number of grouped KV tokens so far (used only if params.grp_attn_n > 1)

    ga_n = params.grpAttnN;
    ga_w = params.grpAttnW;

    if (ga_n != 1) {
        GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive");                         // NOLINT
        GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
        //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
        //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
        os_log_info(OS_LOG_DEFAULT, "self-extend: n_ctx_train = %d, grp_attn_n = %ld, grp_attn_w = %ld\n", n_ctx_train, static_cast<long>(ga_n), static_cast<long>(ga_w));
    }

    if (params.interactive) {
        const char * control_message;
        if (params.multilineInput) {
            control_message = " - To return control to the AI, end your input with '\\'.\n"
                              " - To return control without starting a new line, end your input with '/'.\n";
        } else {
            control_message = " - Press Return to return control to the AI.\n"
                              " - To return control without starting a new line, end your input with '/'.\n"
                              " - If you want to submit another line, end your input with '\\'.\n";
        }

        isInteracting = params.interactiveFirst;
    }

    is_antiprompt = false;
    input_echo = true;
    display = true;
    need_to_save_session = [pathSession length] > 0 && n_matching_session_tokens < embd_inp.size();
    n_remain = params.nPredict;

//    // the first thing we will do is to output the prompt, so set color accordingly
//    console::set_display(console::prompt);
//    display = params.display_prompt;
//

    antiprompt_ids.reserve([params.antiPrompts count]);
    for (NSString *antiprompt in params.antiPrompts) {
        antiprompt_ids.emplace_back([self.ctx tokenize:antiprompt addSpecial:false parseSpecial:true]);
    }

    if ([self.model hasEncoder]) {
        int enc_input_size = embd_inp.size();
        llama_token * enc_input_buf = embd_inp.data();

        if ([_ctx encode:llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0)]) {
            [NSException raise:@"EvalFailure" format:@"failed to eval"];
        }

        llama_token decoder_start_token_id = llama_model_decoder_start_token([self.model cModel]);
        if (decoder_start_token_id == -1) {
            decoder_start_token_id = [self.model tokenBOS];
        }

        embd_inp.clear();
        embd_inp.push_back(decoder_start_token_id);
    }
    return self;
}

- (void)start:(BlockingLineQueue *)queue {
    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
        if (!embd.empty()) {
            // Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via
            // --prompt or --file which uses the same value.
            int max_embd_size = n_ctx - 4;

            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
            if ((int) embd.size() > max_embd_size) {
                const int skipped_tokens = (int) embd.size() - max_embd_size;
                embd.resize(max_embd_size);

                // console::set_display(console::error);
                os_log_error(OS_LOG_DEFAULT, "<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                // console::set_display(console::reset);
            }

            if (params.grpAttnN == 1) {
                // infinite text generation via context shifting
                // if we run out of context:
                // - take the n_keep first tokens from the original prompt (via n_past)
                // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches

                if (n_past + (int) embd.size() >= [_ctx nCtx]) {
                    if (!params.ctxShift) {
                        os_log_debug(OS_LOG_DEFAULT, "\n\n%s: context full and context shift is disabled => stopping\n", __func__);
                        break;
                    } else {
                        if (params.nPredict == -2) {
                            os_log_debug(OS_LOG_DEFAULT, "\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.nPredict);
                            break;
                        }

                        const int n_left    = n_past - params.nKeep;
                        const int n_discard = n_left/2;

                        os_log_debug(OS_LOG_DEFAULT, "context full, swapping: n_past = %d, n_left = %d, n_ctx = %lu, n_keep = %d, n_discard = %d\n",
                                     n_past, n_left, static_cast<unsigned long>([_ctx nCtx]), params.nKeep, n_discard);

                        llama_kv_cache_seq_rm ([self.ctx cContext], 0, params.nKeep            , params.nKeep + n_discard);
                        llama_kv_cache_seq_add([self.ctx cContext], 0, params.nKeep + n_discard, n_past, -n_discard);

                        n_past -= n_discard;

                        os_log_debug(OS_LOG_DEFAULT, "after swap: n_past = %d\n", n_past);

                        os_log_debug(OS_LOG_DEFAULT, "embd: %s\n", [self.ctx convertTokensToString:embd].c_str());

                        os_log_debug(OS_LOG_DEFAULT, "clear session path\n");
                        [pathSession setString:@""];
                    }
                }
            } else {
                // context extension via Self-Extend
                while (n_past >= ga_i + ga_w) {
                    const int ib = (ga_n*ga_i)/ga_w;
                    const int bd = (ga_w/ga_n)*(ga_n - 1);
                    const int dd = (ga_w/ga_n) - ib*bd - ga_w;

                    os_log_debug(OS_LOG_DEFAULT, "\n");
                    os_log_debug(OS_LOG_DEFAULT, "shift: [%6ld, %6d] + %6d -> [%6ld, %6d]\n", static_cast<long>(ga_i), n_past, ib*bd, static_cast<long>(ga_i + ib*bd), n_past + ib*bd);
                    os_log_debug(OS_LOG_DEFAULT, "div:   [%6ld, %6ld] / %6ld -> [%6ld, %6ld]\n", static_cast<long>(ga_i + ib*bd), static_cast<long>(ga_i + ib*bd + ga_w), static_cast<long>(ga_n), static_cast<long>((ga_i + ib*bd)/ga_n), static_cast<long>((ga_i + ib*bd + ga_w)/ga_n));
                    os_log_debug(OS_LOG_DEFAULT, "shift: [%6ld, %6d] + %6d -> [%6ld, %6d]\n", static_cast<long>(ga_i + ib*bd + ga_w), n_past + ib*bd, dd, static_cast<long>(ga_i + ib*bd + ga_w + dd), n_past + ib*bd + dd);

                    [self.ctx kvCacheSeqAdd:0 p0:ga_i p1:n_past delta:ib*bd];
                    [self.ctx kvCacheSeqDiv:0 p0:ga_i + ib*bd p1:ga_i + ib*bd + ga_w delta:ga_n];
                    [self.ctx kvCacheSeqAdd:0 p0:ga_i + ib*bd + ga_w p1:n_past + ib*bd delta:dd];

                    n_past -= bd;

                    ga_i += ga_w/ga_n;

                    os_log_debug(OS_LOG_DEFAULT, "\nn_past_old = %d, n_past = %d, ga_i = %ld\n\n", n_past + bd, n_past, static_cast<long>(ga_i));
                }
            }

            // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
            if (n_session_consumed < (int) session_tokens.size()) {
                size_t i = 0;
                for ( ; i < embd.size(); i++) {
                    if (embd[i] != session_tokens[n_session_consumed]) {
                        session_tokens.resize(n_session_consumed);
                        break;
                    }

                    n_past++;
                    n_session_consumed++;

                    if (n_session_consumed >= (int) session_tokens.size()) {
                        ++i;
                        break;
                    }
                }
                if (i > 0) {
                    embd.erase(embd.begin(), embd.begin() + i);
                }
            }

            for (int i = 0; i < (int) embd.size(); i += params.nBatch) {
                int n_eval = (int) embd.size() - i;
                if (n_eval > params.nBatch) {
                    n_eval = params.nBatch;
                }

                os_log_debug(OS_LOG_DEFAULT, "eval: %s\n", [self.ctx convertTokensToString:embd].c_str());

                if ([self.ctx decode:[[LlamaBatch alloc] initWithBatch:llama_batch_get_one(&embd[i], n_eval, n_past, 0)]]) {
                    [NSException raise:@"EvalFailure" format:@"failed to eval"];
                }

                n_past += n_eval;

                os_log_debug(OS_LOG_DEFAULT, "n_past = %d\n", n_past);
                // Display total tokens alongside total time
                if (params.nPrint > 0 && n_past % params.nPrint == 0) {
                    os_log_debug(OS_LOG_DEFAULT, "\n\033[31mTokens consumed so far = %d / %lu \033[0m\n", n_past, static_cast<unsigned long>([self.ctx nCtx]));
                }
            }

            if (!embd.empty() && [pathSession length] > 0) {
                session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
                n_session_consumed = session_tokens.size();
            }
        }

        embd.clear();

        if ((int) embd_inp.size() <= n_consumed && !isInteracting) {
            // optionally save the session on first sample (for faster prompt loading next time)
            if ([pathSession length] > 0 && need_to_save_session && !params.promptCacheRO) {
                need_to_save_session = false;
                [self.ctx saveStateFile:pathSession tokens:session_tokens.data() nTokenCount:session_tokens.size()];
                // llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());

                os_log_debug(OS_LOG_DEFAULT, "saved session to %s\n", [pathSession cStringUsingEncoding:NSUTF8StringEncoding]);
            }

            const llama_token idToken = [smpl sample:self.ctx index:-1];

            [smpl accept:idToken acceptGrammar:true];

            // os_log_debug(OS_LOG_DEFAULT, "last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());

            embd.push_back(idToken);

            // echo this to console
            input_echo = true;

            // decrement remaining sampling budget
            --n_remain;

            os_log_debug(OS_LOG_DEFAULT, "n_remain: %d\n", n_remain);
        } else {
|
||||
// some user input remains from prompt or interaction, forward it to processing
|
||||
os_log_debug(OS_LOG_DEFAULT, "embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
|
||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||
// for the prompt, we don't apply grammar rules
|
||||
[smpl accept:embd_inp[n_consumed] acceptGrammar:false];
|
||||
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.nBatch) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// display text
|
||||
if (input_echo && display) {
|
||||
// std::cout<< "DISPLAYING TEXT" << std::endl;
|
||||
|
||||
for (auto idToken : embd) {
|
||||
NSString *token_str = [self.ctx tokenToPiece:idToken special:params.special];
|
||||
|
||||
// Console/Stream Output
|
||||
os_log_info(OS_LOG_DEFAULT, "%s", [token_str cStringUsingEncoding:NSUTF8StringEncoding]);
|
||||
|
||||
// Record Displayed Tokens To Log
|
||||
// Note: Generated tokens are created one by one hence this check
|
||||
if (embd.size() > 1) {
|
||||
// Incoming Requested Tokens
|
||||
input_tokens.push_back(idToken);
|
||||
|
||||
} else {
|
||||
// Outgoing Generated Tokens
|
||||
output_tokens.push_back(idToken);
|
||||
output_ss << [token_str cStringUsingEncoding:NSUTF8StringEncoding];
|
||||
last_output_ss << [token_str cStringUsingEncoding:NSUTF8StringEncoding];
|
||||
}
|
||||
|
||||
}
|
||||
if (!last_output_ss.str().empty()) {
|
||||
// queue->addOutputLine(last_output_ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
// reset color to default if there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
if (!last_output_ss.str().empty()) {
|
||||
// queue->addOutputLine(last_output_ss.str());
|
||||
}
|
||||
// console::set_display(console::reset);
|
||||
display = true;
|
||||
}
|
||||
|
||||
// if not currently processing queued inputs;
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
// check for reverse prompt in the last n_prev tokens
|
||||
if ([params.antiPrompts count] > 0) {
|
||||
const int n_prev = 32;
|
||||
NSString *last_output = [smpl previousString:self.ctx n:n_prev];
|
||||
|
||||
is_antiprompt = false;
|
||||
// Check if each of the reverse prompts appears at the end of the output.
|
||||
// If we're not running interactively, the reverse prompt might be tokenized with some following characters
|
||||
// so we'll compensate for that by widening the search window a bit.
|
||||
for (NSString *antiprompt in params.antiPrompts) {
|
||||
size_t extra_padding = params.interactive ? 0 : 2;
|
||||
size_t search_start_pos = [last_output length] > static_cast<size_t>([antiprompt length] + extra_padding)
|
||||
? [last_output length] - static_cast<size_t>([antiprompt length] + extra_padding)
|
||||
: 0;
|
||||
|
||||
// TODO: Check if correct
|
||||
if ([last_output rangeOfString:antiprompt options:0 range:NSMakeRange(search_start_pos, last_output.length - search_start_pos)].location != NSNotFound) {
|
||||
if (params.interactive) {
|
||||
isInteracting = true;
|
||||
}
|
||||
is_antiprompt = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// check for reverse prompt using special tokens
|
||||
llama_token last_token = [smpl last];
|
||||
for (std::vector<llama_token> ids : antiprompt_ids) {
|
||||
if (ids.size() == 1 && last_token == ids[0]) {
|
||||
if (params.interactive) {
|
||||
isInteracting = true;
|
||||
}
|
||||
is_antiprompt = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_antiprompt) {
|
||||
os_log_debug(OS_LOG_DEFAULT, "found antiprompt: %s\n", [last_output cStringUsingEncoding:NSUTF8StringEncoding]);
|
||||
}
|
||||
}
|
||||
|
||||
// deal with end of generation tokens in interactive mode
|
||||
|
||||
if ([self.model tokenIsEOG:[smpl last]]) {
|
||||
os_log_debug(OS_LOG_DEFAULT, "found an EOG token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
if ([[params antiPrompts] count] > 0) {
|
||||
// tokenize and inject first reverse prompt
|
||||
|
||||
const auto first_antiprompt = [self.ctx tokenize:params.antiPrompts[0] addSpecial:false parseSpecial:true];
|
||||
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
|
||||
is_antiprompt = true;
|
||||
}
|
||||
|
||||
if (params.enableChatTemplate) {
|
||||
[self chat_add_and_format:chat_msgs
|
||||
role:"assistant"
|
||||
content:assistant_ss.str()];
|
||||
}
|
||||
isInteracting = true;
|
||||
// LOG("\n");
|
||||
}
|
||||
}
|
||||
|
||||
// if current token is not EOG, we add it to current assistant message
|
||||
if (params.conversation) {
|
||||
const auto idToken = [smpl last];
|
||||
assistant_ss << [[self.ctx tokenToPiece:idToken special:false] cStringUsingEncoding:NSUTF8StringEncoding];
|
||||
}
|
||||
|
||||
if (n_past > 0 && isInteracting) {
|
||||
os_log_debug(OS_LOG_DEFAULT, "waiting for user input\n");
|
||||
|
||||
if (params.conversation) {
|
||||
// osLog_("\n> ");
|
||||
}
|
||||
|
||||
if (params.inputPrefixBOS) {
|
||||
os_log_debug(OS_LOG_DEFAULT, "adding input prefix BOS token\n");
|
||||
embd_inp.push_back([self.model tokenBOS]);
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
if ([params.inputPrefix length] > 0 && !params.conversation) {
|
||||
os_log_debug(OS_LOG_DEFAULT, "appending input prefix: '%s'\n", [params.inputPrefix cStringUsingEncoding:NSUTF8StringEncoding]);
|
||||
os_log_info(OS_LOG_DEFAULT, "%s", [params.inputPrefix cStringUsingEncoding:NSUTF8StringEncoding]);
|
||||
}
|
||||
|
||||
// color user input only
|
||||
// console::set_display(console::user_input);
|
||||
display = params.displayPrompt;
|
||||
|
||||
std::string line;
|
||||
// bool another_line = true;
|
||||
static int read_one = 0;
|
||||
// if (!read_one) {
|
||||
// do {
|
||||
// another_line = false;// console::readline(line, params.multiline_input);
|
||||
// buffer += "What is the weather in New York?";//line;
|
||||
// } while (another_line);
|
||||
// read_one++;
|
||||
// }
|
||||
// else {
|
||||
if (!last_output_ss.str().empty()) {
|
||||
auto str = last_output_ss.str();
|
||||
last_output_ss.str("");
|
||||
[queue addOutputLine:[NSString stringWithCString:str.c_str() encoding:NSUTF8StringEncoding]];
|
||||
}
|
||||
|
||||
buffer = [[queue inputLine] cStringUsingEncoding:NSUTF8StringEncoding];
|
||||
// do {
|
||||
// another_line = console::readline(line, params.multiline_input);
|
||||
// buffer += line;
|
||||
// } while (another_line);
|
||||
// }
|
||||
// done taking input, reset color
|
||||
// console::set_display(console::reset);
|
||||
display = true;
|
||||
|
||||
// Add tokens to embd only if the input buffer is non-empty
|
||||
// Entering a empty line lets the user pass control back
|
||||
if (buffer.length() > 1) {
|
||||
// append input suffix if any
|
||||
if ([params.inputSuffix length] > 0 && !params.conversation) {
|
||||
os_log_debug(OS_LOG_DEFAULT, "appending input suffix: '%s'\n", [params.inputSuffix cStringUsingEncoding:NSUTF8StringEncoding]);
|
||||
os_log_info(OS_LOG_DEFAULT, "%s", [params.inputSuffix cStringUsingEncoding:NSUTF8StringEncoding]);
|
||||
}
|
||||
|
||||
os_log_debug(OS_LOG_DEFAULT, "buffer: '%s'\n", buffer.c_str());
|
||||
|
||||
const size_t original_size = embd_inp.size();
|
||||
|
||||
if (params.escapeSequences) {
|
||||
string_process_escapes(buffer);
|
||||
}
|
||||
|
||||
bool format_chat = params.conversation && params.enableChatTemplate;
|
||||
std::string user_inp = format_chat
|
||||
? [[self chat_add_and_format:chat_msgs role:"user" content:std::move(buffer)] cStringUsingEncoding:NSUTF8StringEncoding]
|
||||
: std::move(buffer);
|
||||
// TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
|
||||
const auto line_pfx = [self.ctx tokenize:params.inputPrefix addSpecial:false parseSpecial:true];
|
||||
const auto line_inp = [self.ctx tokenize:[NSString stringWithCString:user_inp.c_str()
|
||||
encoding:NSUTF8StringEncoding]
|
||||
addSpecial:false
|
||||
parseSpecial:format_chat];
|
||||
const auto line_sfx = [self.ctx tokenize:params.inputSuffix
|
||||
addSpecial:false
|
||||
parseSpecial:true];
|
||||
|
||||
os_log_debug(OS_LOG_DEFAULT, "input tokens: %s\n", [self.ctx convertTokensToString:line_inp].c_str());
|
||||
|
||||
// if user stop generation mid-way, we must add EOT to finish model's last response
|
||||
if (need_insert_eot && format_chat) {
|
||||
llama_token eot = [self.model tokenEOT];
|
||||
embd_inp.push_back(eot == -1 ? [self.model tokenEOS] : eot);
|
||||
need_insert_eot = false;
|
||||
}
|
||||
|
||||
embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
|
||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||
embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
|
||||
|
||||
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
||||
const llama_token token = embd_inp[i];
|
||||
output_tokens.push_back(token);
|
||||
output_ss << [[self.ctx tokenToPiece:token] cStringUsingEncoding:NSUTF8StringEncoding];
|
||||
}
|
||||
|
||||
// reset assistant message
|
||||
assistant_ss.str("");
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
os_log_debug(OS_LOG_DEFAULT, "n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
os_log_debug(OS_LOG_DEFAULT, "empty line, passing control back\n");
|
||||
}
|
||||
|
||||
input_echo = false; // do not echo this again
|
||||
}
|
||||
|
||||
if (n_past > 0) {
|
||||
if (isInteracting) {
|
||||
[smpl reset];
|
||||
}
|
||||
isInteracting = false;
|
||||
}
|
||||
}
|
||||
|
||||
// end of generation
|
||||
if (!embd.empty() && [self.model tokenIsEOG:embd.back()] && !(params.interactive)) {
|
||||
os_log_info(OS_LOG_DEFAULT, " [end of text]\n");
|
||||
break;
|
||||
}
|
||||
|
||||
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
|
||||
// We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
|
||||
if (params.interactive && n_remain <= 0 && params.nPredict >= 0) {
|
||||
n_remain = params.nPredict;
|
||||
isInteracting = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@end
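The start: loop above is deliberately blocking: it pulls user turns from the BlockingLineQueue with inputLine and publishes finished responses with addOutputLine:, so the caller owns the threading. A minimal Swift sketch of driving it, assuming the module imports as LlamaObjC and using a hypothetical model path:

import Foundation
import LlamaObjC // assumed module name

let params = GPTParams()
params.modelPath = "/path/to/model.gguf" // hypothetical path
params.interactive = true

let session = LlamaSession(params: params)
let queue = BlockingLineQueue()

// start(_:) blocks its thread, so run the generation loop in the background
Thread.detachNewThread {
    session.start(queue)
}

queue.addInputLine("Hello!")
print(queue.outputLine()) // blocks until the model finishes the reply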
264
objc/include/GPTParams.h
Normal file
@@ -0,0 +1,264 @@
#ifndef GPTParams_h
#define GPTParams_h

#import <Foundation/Foundation.h>

@class LlamaModelParams;
@class LlamaContextParams;
@class GGMLThreadpool;

// Define the ggml_sched_priority enum
typedef NS_ENUM(NSInteger, GGMLSchedPriority) {
    GGMLSchedPriorityNormal   = 0, // Normal priority
    GGMLSchedPriorityMedium   = 1, // Medium priority
    GGMLSchedPriorityHigh     = 2, // High priority
    GGMLSchedPriorityRealtime = 3  // Realtime priority
};

@interface GGMLThreadpoolParams : NSObject

@property (nonatomic, assign) int nThreads;
@property (nonatomic, assign) GGMLSchedPriority priority;
@property (nonatomic, assign) uint32_t poll;
@property (nonatomic, assign) BOOL strictCPU;
@property (nonatomic, assign) BOOL paused;

// Custom access methods for the cpumask array
- (BOOL)getCpuMaskAtIndex:(NSUInteger)index;
- (void)setCpuMask:(BOOL)value atIndex:(NSUInteger)index;
- (GGMLThreadpool *)threadpool;

@end

@interface GGMLThreadpool : NSObject
@end

@interface CPUParams : NSObject

// Properties
@property (nonatomic, assign) int nThreads;
@property (nonatomic, assign) BOOL maskValid;
@property (nonatomic, assign) GGMLSchedPriority priority;
@property (nonatomic, assign) BOOL strictCPU;
@property (nonatomic, assign) uint32_t poll;

// Custom methods to access or manipulate the cpumask array
- (BOOL)getCpuMaskAtIndex:(NSUInteger)index;
- (void)setCpuMask:(BOOL)value atIndex:(NSUInteger)index;
- (GGMLThreadpoolParams *)ggmlThreadpoolParams;

@end

@interface GPTSamplerParams : NSObject

// Properties corresponding to C++ struct fields
@property (nonatomic, assign) uint32_t seed;
@property (nonatomic, assign) int32_t nPrev;
@property (nonatomic, assign) int32_t nProbs;
@property (nonatomic, assign) int32_t minKeep;
@property (nonatomic, assign) int32_t topK;
@property (nonatomic, assign) float topP;
@property (nonatomic, assign) float minP;
@property (nonatomic, assign) float tfsZ;
@property (nonatomic, assign) float typP;
@property (nonatomic, assign) float temp;
@property (nonatomic, assign) float dynatempRange;
@property (nonatomic, assign) float dynatempExponent;
@property (nonatomic, assign) int32_t penaltyLastN;
@property (nonatomic, assign) float penaltyRepeat;
@property (nonatomic, assign) float penaltyFreq;
@property (nonatomic, assign) float penaltyPresent;
@property (nonatomic, assign) int32_t mirostat;
@property (nonatomic, assign) float mirostatTau;
@property (nonatomic, assign) float mirostatEta;
@property (nonatomic, assign) BOOL penalizeNl;
@property (nonatomic, assign) BOOL ignoreEos;
@property (nonatomic, assign) BOOL noPerf;

// Arrays and Strings
@property (nonatomic, strong) NSArray<NSNumber *> *samplers;  // Samplers mapped to NSArray of NSNumber (for enums)
@property (nonatomic, copy) NSString *grammar;                // Grammar as NSString
@property (nonatomic, strong) NSArray<NSNumber *> *logitBias; // Logit biases mapped to NSArray of NSNumber

// Method to print the parameters into a string
- (NSString *)print;

@end

@interface GPTParams : NSObject

@property (nonatomic, assign) int32_t nPredict;
@property (nonatomic, assign) NSInteger nCtx;
@property (nonatomic, assign) int32_t nBatch;
@property (nonatomic, assign) int32_t nUBatch;
@property (nonatomic, assign) int32_t nKeep;
@property (nonatomic, assign) int32_t nDraft;
@property (nonatomic, assign) int32_t nChunks;
@property (nonatomic, assign) int32_t nParallel;
@property (nonatomic, assign) int32_t nSequences;
@property (nonatomic, assign) float pSplit;
@property (nonatomic, assign) int32_t nGpuLayers;
@property (nonatomic, assign) int32_t nGpuLayersDraft;
@property (nonatomic, assign) int32_t mainGpu;
@property (nonatomic, strong) NSMutableArray<NSNumber *> *tensorSplit; // Fixed-size array, stays the same
@property (nonatomic, assign) int32_t grpAttnN;
@property (nonatomic, assign) int32_t grpAttnW;
@property (nonatomic, assign) int32_t nPrint;
@property (nonatomic, assign) float ropeFreqBase;
@property (nonatomic, assign) float ropeFreqScale;
@property (nonatomic, assign) float yarnExtFactor;
@property (nonatomic, assign) float yarnAttnFactor;
@property (nonatomic, assign) float yarnBetaFast;
@property (nonatomic, assign) float yarnBetaSlow;
@property (nonatomic, assign) int32_t yarnOrigCtx;
@property (nonatomic, assign) float defragThold;

// The C++ struct "cpu_params" is mirrored by the CPUParams Objective-C class
@property (nonatomic, strong) CPUParams *cpuParams;
@property (nonatomic, strong) CPUParams *cpuParamsBatch;
@property (nonatomic, strong) CPUParams *draftCpuParams;
@property (nonatomic, strong) CPUParams *draftCpuParamsBatch;

// Callbacks (blocks in Objective-C)
@property (nonatomic, copy) void (^cbEval)(void *);
@property (nonatomic, assign) void *cbEvalUserData;

@property (nonatomic, assign) NSInteger numaStrategy; // Enumerations

@property (nonatomic, assign) NSInteger splitMode;
@property (nonatomic, assign) NSInteger ropeScalingType;
@property (nonatomic, assign) NSInteger poolingType;
@property (nonatomic, assign) NSInteger attentionType;

// Sampler parameters, converted to an Objective-C object
@property (nonatomic, strong) GPTSamplerParams *samplerParams;

@property (nonatomic, copy) NSString *modelPath;
@property (nonatomic, copy) NSString *modelDraft;
@property (nonatomic, copy) NSString *modelAlias;
@property (nonatomic, copy) NSString *modelURL;
@property (nonatomic, copy) NSString *hfToken;
@property (nonatomic, copy) NSString *hfRepo;
@property (nonatomic, copy) NSString *hfFile;
@property (nonatomic, copy) NSString *prompt;
@property (nonatomic, copy) NSString *promptFile;
@property (nonatomic, copy) NSString *pathPromptCache;
@property (nonatomic, copy) NSString *inputPrefix;
@property (nonatomic, copy) NSString *inputSuffix;
@property (nonatomic, copy) NSString *logdir;
@property (nonatomic, copy) NSString *lookupCacheStatic;
@property (nonatomic, copy) NSString *lookupCacheDynamic;
@property (nonatomic, copy) NSString *logitsFile;
@property (nonatomic, copy) NSString *rpcServers;

// Arrays in Objective-C are represented with `NSArray`
@property (nonatomic, strong) NSArray<NSString *> *inputFiles;
@property (nonatomic, strong) NSArray<NSString *> *antiPrompts;
@property (nonatomic, strong) NSArray *kvOverrides;

// Boolean values (in Objective-C, use `BOOL`)
@property (nonatomic, assign) BOOL loraInitWithoutApply;
@property (nonatomic, strong) NSArray *loraAdapters;
@property (nonatomic, strong) NSArray *controlVectors;

// Control params
@property (nonatomic, assign) int32_t verbosity;
@property (nonatomic, assign) int32_t controlVectorLayerStart;
@property (nonatomic, assign) int32_t controlVectorLayerEnd;

// Performance and configuration params
@property (nonatomic, assign) int32_t pplStride;
@property (nonatomic, assign) int32_t pplOutputType;

@property (nonatomic, assign) BOOL hellaswag;
@property (nonatomic, assign) size_t hellaswagTasks;
@property (nonatomic, assign) BOOL winogrande;
@property (nonatomic, assign) size_t winograndeTasks;
@property (nonatomic, assign) BOOL multipleChoice;
@property (nonatomic, assign) size_t multipleChoiceTasks;
@property (nonatomic, assign) BOOL klDivergence;

@property (nonatomic, assign) BOOL usage;
@property (nonatomic, assign) BOOL useColor;
@property (nonatomic, assign) BOOL special;
@property (nonatomic, assign) BOOL interactive;
@property (nonatomic, assign) BOOL interactiveFirst;
@property (nonatomic, assign) BOOL conversation;
@property (nonatomic, assign) BOOL promptCacheAll;
@property (nonatomic, assign) BOOL promptCacheRO;

@property (nonatomic, assign) BOOL escapeSequences;
@property (nonatomic, assign) BOOL multilineInput;
@property (nonatomic, assign) BOOL simpleIO;
@property (nonatomic, assign) BOOL continuousBatching;
@property (nonatomic, assign) BOOL flashAttention;
@property (nonatomic, assign) BOOL noPerformanceMetrics;
@property (nonatomic, assign) BOOL contextShift;

// Server and I/O settings
@property (nonatomic, assign) int32_t port;
@property (nonatomic, assign) int32_t timeoutRead;
@property (nonatomic, assign) int32_t timeoutWrite;
@property (nonatomic, assign) int32_t httpThreads;

@property (nonatomic, copy) NSString *hostname;
@property (nonatomic, copy) NSString *publicPath;
@property (nonatomic, copy) NSString *chatTemplate;
@property (nonatomic, copy) NSString *systemPrompt;
@property (nonatomic, assign) BOOL enableChatTemplate;

@property (nonatomic, strong) NSArray<NSString *> *apiKeys;

@property (nonatomic, copy) NSString *sslFileKey;
@property (nonatomic, copy) NSString *sslFileCert;

@property (nonatomic, assign) BOOL endpointSlots;
@property (nonatomic, assign) BOOL endpointMetrics;
@property (nonatomic, assign) BOOL logJSON;

@property (nonatomic, copy) NSString *slotSavePath;
@property (nonatomic, assign) float slotPromptSimilarity;

// batched-bench params
@property (nonatomic, assign) BOOL isPPShared;
@property (nonatomic, strong) NSArray<NSNumber *> *nPP;
@property (nonatomic, strong) NSArray<NSNumber *> *nTG;
@property (nonatomic, strong) NSArray<NSNumber *> *nPL;

// retrieval params
@property (nonatomic, strong) NSArray<NSString *> *contextFiles;
@property (nonatomic, assign) int32_t chunkSize;
@property (nonatomic, copy) NSString *chunkSeparator;

// passkey params
@property (nonatomic, assign) int32_t nJunk;
@property (nonatomic, assign) int32_t iPos;

// imatrix params
@property (nonatomic, copy) NSString *outFile;
@property (nonatomic, assign) int32_t nOutFreq;
@property (nonatomic, assign) int32_t nSaveFreq;
@property (nonatomic, assign) int32_t iChunk;
@property (nonatomic, assign) BOOL processOutput;
@property (nonatomic, assign) BOOL computePPL;

// cvector-generator params
@property (nonatomic, assign) int nPCABatch;
@property (nonatomic, assign) int nPCAIterations;
@property (nonatomic, assign) int cvectorDimreMethod;
@property (nonatomic, copy) NSString *cvectorOutfile;
@property (nonatomic, copy) NSString *cvectorPositiveFile;
@property (nonatomic, copy) NSString *cvectorNegativeFile;

@property (nonatomic, assign) BOOL spmInfill;
@property (nonatomic, copy) NSString *loraOutfile;
@property (nonatomic, assign) BOOL embedding;
@property (nonatomic, assign) BOOL verbosePrompt;          // print prompt tokens before generation
@property (nonatomic, assign) BOOL batchedBenchOutputJSONL;
@property (nonatomic, assign) BOOL inputPrefixBOS;         // prefix BOS to user inputs, preceding input_prefix
@property (nonatomic, assign) BOOL ctxShift;               // context shift on infinite text generation
@property (nonatomic, assign) BOOL displayPrompt;          // print prompt before generation

- (LlamaModelParams *)llamaModelParams;
- (LlamaContextParams *)llamaContextParams;

@end

#endif /* GPTParams_h */
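GPTParams mirrors llama.cpp's gpt_params field for field, so configuring a session reads much like setting CLI flags. A short Swift sketch with illustrative values (only the property names come from the header above):

let params = GPTParams()
params.modelPath = "/path/to/model.gguf" // hypothetical path
params.nCtx = 4096                 // context window
params.nBatch = 512                // tokens decoded per llama_decode call
params.nPredict = -1               // -1: generate until EOG or an antiprompt
params.ctxShift = true             // shift the KV cache instead of stopping when full
params.antiPrompts = ["User:"]     // reverse prompts that hand control back
params.samplerParams.temp = 0.7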
25
objc/include/GPTParams_Private.hpp
Normal file
@@ -0,0 +1,25 @@
#ifndef GPTParams_Private_hpp
#define GPTParams_Private_hpp

#import "GPTParams.h"
#import "ggml.h"
#import "../../common/common.h"

@interface GGMLThreadpool()

- (ggml_threadpool *)threadpool;

@end

@interface GPTParams()

- (gpt_params&)params;

@end

@interface GPTSamplerParams()

- (gpt_sampler_params&)cParams;

@end
#endif /* GPTParams_Private_hpp */
55
objc/include/GPTSampler.h
Normal file
@@ -0,0 +1,55 @@
#ifndef GPTSampler_h
#define GPTSampler_h

#import <Foundation/Foundation.h>

@class LlamaModel;
@class GPTSamplerParams;
@class LlamaContext;
typedef int32_t LlamaToken;

@interface GPTSampler : NSObject

- (instancetype)init:(LlamaModel *)model gptSamplerParams:(GPTSamplerParams *)gptSamplerParams;
- (uint32_t)seed;

// extended sampling implementation:
//
// - set logits
// - apply the configured sampler chain
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
- (LlamaToken)sample:(LlamaContext *)context
               index:(NSInteger)index;

// same as above, but:
//
// if grammarFirst is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
- (LlamaToken)sample:(LlamaContext *)context
               index:(NSInteger)index
        grammarFirst:(BOOL)grammarFirst;

// if acceptGrammar is true, the token is accepted both by the sampling chain and the grammar
- (void)accept:(LlamaToken)token
 acceptGrammar:(BOOL)acceptGrammar;

// get a string representation of the last accepted tokens
- (NSString *)previousString:(LlamaContext *)context n:(NSInteger)n;

// get the last accepted token
- (LlamaToken)last;

- (void)reset;

@end

#endif /* GPTSampler_h */
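The split between sample: and accept:acceptGrammar: is the heart of this API: sampling proposes a token from the current logits, and accepting advances the repetition penalties and the grammar with it. A hedged Swift sketch of one generation step, assuming `model`, `ctx`, and `params` already exist and that the Objective-C initializer imports as shown:

let smpl = GPTSampler(model, gptSamplerParams: params.samplerParams)

// propose a token from the logits at the last decoded position
let token = smpl.sample(ctx, index: -1)

// feed it back so penalties and the grammar (if any) advance
smpl.accept(token, acceptGrammar: true)

if model.tokenIsEOG(token) {
    // end of generation
}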
34
objc/include/LlamaBatch.h
Normal file
@@ -0,0 +1,34 @@
#ifndef LlamaBatch_h
#define LlamaBatch_h

#import <Foundation/Foundation.h>

typedef NSInteger LlamaSequenceId;
typedef NSInteger LlamaPosition;
typedef int32_t LlamaToken;

// Input data for llama_decode
// A llama_batch object can contain input about one or many sequences
// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
//
// - token  : the token ids of the input (used when embd is NULL)
// - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
// - pos    : the positions of the respective token in the sequence
// - seq_id : the sequence to which the respective token belongs
// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
@interface LlamaBatch : NSObject

@property (nonatomic, assign) NSInteger nTokens;
@property (nonatomic, assign) LlamaToken *tokens;
@property (nonatomic, assign) float *embd;
@property (nonatomic, assign) LlamaPosition *pos;
@property (nonatomic, assign) int32_t *nSeqId;
@property (nonatomic, assign) LlamaSequenceId **seqId;
@property (nonatomic, assign) NSData *output;

// Helpers for smooth API transition (optional usage in the interface)
@property (nonatomic, assign) LlamaPosition allPos0;
@property (nonatomic, assign) LlamaPosition allPos1;
@property (nonatomic, assign) LlamaSequenceId allSeqId;

@end

#endif /* LlamaBatch_h */
13
objc/include/LlamaBatch_Private.hpp
Normal file
@@ -0,0 +1,13 @@
#ifndef LlamaBatch_Private_hpp
#define LlamaBatch_Private_hpp
#import "LlamaBatch.h"
#import "llama.h"

@interface LlamaBatch()

- (instancetype)initWithBatch:(llama_batch)batch;
- (llama_batch)cBatch;

@end

#endif /* LlamaBatch_Private_hpp */
57
objc/include/LlamaContext.h
Normal file
@@ -0,0 +1,57 @@
#ifndef LlamaContext_h
#define LlamaContext_h

#import <Foundation/Foundation.h>

@class GGMLThreadpool;
@class LlamaBatch;

typedef NSInteger LlamaSequenceId;
typedef NSInteger LlamaPosition;
typedef int32_t LlamaToken;

@interface LlamaContext : NSObject

- (void)attachThreadpool:(GGMLThreadpool *)threadpool
         threadpoolBatch:(GGMLThreadpool *)threadpoolBatch;

- (NSUInteger)nCtx;

// A positive return value does not mean a fatal error, but rather a warning.
//   0 - success
//   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
// < 0 - error
- (NSInteger)decode:(LlamaBatch *)batch;

// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
// If the KV cache is RoPEd, the KV data is updated accordingly:
//   - lazily on next llama_decode()
//   - explicitly with llama_kv_cache_update()
// p0 < 0 : [0,  p1]
// p1 < 0 : [p0, inf)
- (void)kvCacheSeqAdd:(LlamaSequenceId)sequenceId
                   p0:(LlamaPosition)p0
                   p1:(LlamaPosition)p1
                delta:(LlamaPosition)delta;

// Integer division of the positions by factor of `d > 1`
// If the KV cache is RoPEd, the KV data is updated accordingly:
//   - lazily on next llama_decode()
//   - explicitly with llama_kv_cache_update()
// p0 < 0 : [0,  p1]
// p1 < 0 : [p0, inf)
- (void)kvCacheSeqDiv:(LlamaSequenceId)sequenceId
                   p0:(LlamaPosition)p0
                   p1:(LlamaPosition)p1
                delta:(LlamaPosition)delta;

// converts a token into a piece of text, optionally rendering special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
- (NSString *)tokenToPiece:(LlamaToken)token;
- (NSString *)tokenToPiece:(LlamaToken)token special:(BOOL)special;

- (BOOL)saveStateFile:(NSString *)pathSession
               tokens:(const LlamaToken *)tokens
          nTokenCount:(size_t)nTokenCount;

@end

#endif /* LlamaContext_h */
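kvCacheSeqAdd is exactly what the context-shift path in start: leans on: after discarding tokens, it slides the remaining cache entries left so positions stay contiguous. A small worked Swift sketch of that arithmetic, assuming `ctx` is a LlamaContext holding a single sequence 0:

var nPast = 8
let nKeep = 2

let nLeft    = nPast - nKeep   // 6 shiftable entries
let nDiscard = nLeft / 2       // drop 3 of them

// positions (nKeep + nDiscard)..<nPast shift left by nDiscard:
// entries at 5, 6, 7 end up at 2, 3, 4
ctx.kvCacheSeqAdd(0, p0: nKeep + nDiscard, p1: nPast, delta: -nDiscard)
nPast -= nDiscard              // nPast is now 5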
28
objc/include/LlamaContext_Private.hpp
Normal file
@@ -0,0 +1,28 @@
#ifndef LlamaContext_Private_hpp
#define LlamaContext_Private_hpp

#import "LlamaContext.h"
#import "../../common/common.h"

@interface LlamaContext()

- (instancetype)initWithContext:(llama_context *)context;

- (std::vector<llama_token>)tokenize:(NSString *)text
                          addSpecial:(BOOL)addSpecial
                        parseSpecial:(BOOL)parseSpecial;

- (BOOL)loadStateFile:(NSString *)pathSession
            tokensOut:(llama_token *)tokensOut
       nTokenCpacity:(size_t)nTokenCapacity
       nTokenCountOut:(size_t *)nTokenCountOut;

- (std::string)convertTokensToString:(const std::vector<llama_token>&)tokens;

- (llama_context *)cContext;

- (int32_t)encode:(llama_batch)batch;

@end

#endif /* LlamaContext_Private_hpp */
35
objc/include/LlamaModel.h
Normal file
@@ -0,0 +1,35 @@
#ifndef LlamaModel_h
#define LlamaModel_h

#import <Foundation/Foundation.h>

@class GPTParams;
@class GGMLThreadpool;
@class LlamaContext;

typedef int32_t LlamaToken;

@interface LlamaChatMessage : NSObject

@property (nonatomic, copy) NSString *role;
@property (nonatomic, copy) NSString *content;

@end

@interface LlamaContextParams : NSObject
@end

@interface LlamaModel : NSObject

- (LlamaContext *)context:(LlamaContextParams *)params;
- (LlamaToken)tokenBOS;
- (LlamaToken)tokenEOT;
- (LlamaToken)tokenEOS;
- (BOOL)tokenIsEOG:(LlamaToken)token;
- (int32_t)nCtxTrain;
- (BOOL)addBOSToken;
- (BOOL)addEOSToken;
- (BOOL)hasEncoder;
- (NSString *)formatExample:(NSString *)tmpl;

@end

#endif /* LlamaModel_h */
15
objc/include/LlamaModel_Private.hpp
Normal file
@@ -0,0 +1,15 @@
#ifndef LlamaModel_Private_hpp
#define LlamaModel_Private_hpp

#import "LlamaModel.h"
#import "llama.h"

@interface LlamaModel()

- (instancetype)init:(llama_model *)model;

- (llama_model *)cModel;

@end

#endif /* LlamaModel_Private_hpp */
13
objc/include/LlamaObjC.h
Normal file
@@ -0,0 +1,13 @@
#ifndef LlamaObjC_h
#define LlamaObjC_h

#include <Foundation/Foundation.h>
#include <llama.h>
#include <LlamaModel.h>
#include <LlamaContext.h>
#include <LlamaSession.h>
#include <GPTParams.h>

#endif /* LlamaObjC_h */
27
objc/include/LlamaSession.h
Normal file
@@ -0,0 +1,27 @@
#ifndef LlamaSession_h
#define LlamaSession_h

#import <Foundation/Foundation.h>

@class GPTParams;
@class LlamaModel;
@class LlamaContext;

@interface BlockingLineQueue : NSObject

- (void)addInputLine:(NSString *)line;
- (NSString *)inputLine;
- (void)addOutputLine:(NSString *)line;
- (NSString *)outputLine;

@end

@interface LlamaSession : NSObject

@property (nonatomic, strong) LlamaModel *model;
@property (nonatomic, strong) LlamaContext *ctx;

- (instancetype)initWithParams:(GPTParams *)params;
- (void)start:(BlockingLineQueue *)queue;

@end

#endif /* LlamaSession_h */
10
objc/include/LlamaSession_Private.hpp
Normal file
@@ -0,0 +1,10 @@
#ifndef LlamaSession_Private_hpp
#define LlamaSession_Private_hpp

#import "LlamaSession.h"

@interface LlamaSession()

@end

#endif /* LlamaSession_Private_hpp */
1
objc/include/ggml-metal.h
Symbolic link
@@ -0,0 +1 @@
../../ggml/include/ggml-metal.h
102
swift/JSONSchema/Grammar.swift
Normal file
@@ -0,0 +1,102 @@
import Foundation
import RegexBuilder

let SPACE_RULE = "\" \"?"

let PRIMITIVE_RULES: [String: String] = [
    "boolean": "(\"true\" | \"false\") space",
    "number": "\"-\"? ([0-9] | [1-9] [0-9]*) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space",
    "integer": "\"-\"? ([0-9] | [1-9] [0-9]*) space",
    "string": "\"\\\"\" ([^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]))* \"\\\"\" space",
    "null": "\"null\" space",
]

let INVALID_RULE_CHARS_RE = try! NSRegularExpression(pattern: "[^a-zA-Z0-9-]+")
let GRAMMAR_LITERAL_ESCAPE_RE = try! NSRegularExpression(pattern: "[\r\n\"]")
let GRAMMAR_LITERAL_ESCAPES: [String: String] = ["\r": "\\r", "\n": "\\n", "\"": "\\\""]

public class SchemaConverter {
    private var propOrder: [String]
    private var rules: [String: String] = ["space": SPACE_RULE]

    public init(propOrder: [String]) {
        self.propOrder = propOrder
    }

    private func formatLiteral(_ literal: Any) -> String {
        let escaped = GRAMMAR_LITERAL_ESCAPES.reduce("\(literal)") {
            $0.replacingOccurrences(of: $1.key, with: $1.value)
        }

        return "\\\"\(escaped)\\\""
    }

    private func addRule(name: String, rule: String) -> String {
        let escName = INVALID_RULE_CHARS_RE.stringByReplacingMatches(
            in: name,
            options: [],
            range: NSRange(location: 0, length: name.count),
            withTemplate: "-"
        )

        var key = escName
        if let existingRule = rules[escName], existingRule != rule {
            var i = 0
            while rules["\(escName)\(i)"] != nil {
                i += 1
            }
            key = "\(escName)\(i)"
        }

        rules[key] = rule
        return key
    }

    public func visit(schema: [String: Any], name: String?) -> String {
        let schemaType = schema["type"] as? String
        let ruleName = name ?? "root"

        if let oneOf = schema["oneOf"] as? [[String: Any]] ?? schema["anyOf"] as? [[String: Any]] {
            let rule = oneOf.enumerated().map { (i, altSchema) in
                visit(schema: altSchema, name: "\(name ?? "")\(name != nil ? "-" : "")\(i)")
            }.joined(separator: " | ")
            return addRule(name: ruleName, rule: rule)
        } else if let constValue = schema["const"] {
            return addRule(name: ruleName, rule: formatLiteral(constValue))
        } else if let enumValues = schema["enum"] as? [Any] {
            let rule = enumValues.map { "\"\(formatLiteral($0))\"" }.joined(separator: " | ")
            return addRule(name: ruleName, rule: rule)
        } else if schemaType == "object", let properties = schema["properties"] as? [String: Any] {
            let propPairs = properties.sorted { (kv1, kv2) in
                let idx1 = propOrder.firstIndex(of: kv1.key) ?? propOrder.count
                let idx2 = propOrder.firstIndex(of: kv2.key) ?? propOrder.count
                return (idx1, kv1.key) < (idx2, kv2.key)
            }

            var rule = "\"{\" space"
            for (i, (propName, propSchema)) in propPairs.enumerated() {
                let propRuleName = visit(schema: propSchema as! [String : Any], name: "\(name ?? "")\(name != nil ? "-" : "")\(propName)")
                if i > 0 {
                    rule += " \",\" space"
                }
                rule += " \"\(formatLiteral(propName))\" space \":\" space \(propRuleName)"
            }
            rule += " \"}\" space"

            return addRule(name: ruleName, rule: rule)
        } else if schemaType == "array", let items = schema["items"] {
            let itemRuleName = visit(schema: items as! [String : Any], name: "\(name ?? "")\(name != nil ? "-" : "")item")
            let rule = "\"[\" space (\(itemRuleName) (\",\" space \(itemRuleName))*)? \"]\" space"
            return addRule(name: ruleName, rule: rule)
        } else {
            assert(PRIMITIVE_RULES.keys.contains(schemaType ?? ""), "Unrecognized schema: \(schema)")
            return addRule(name: ruleName == "root" ? "root" : schemaType!, rule: PRIMITIVE_RULES[schemaType!]!)
        }
    }

    public func formatGrammar() -> String {
        return rules.map { (name, rule) in "\(name) ::= \(rule)" }.joined(separator: "\n") + "\n"
    }
}
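SchemaConverter accumulates one GBNF rule per schema node in `rules`, and formatGrammar joins them into a single grammar string. A quick usage sketch with an illustrative schema:

let converter = SchemaConverter(propOrder: ["name", "age"])
let schema: [String: Any] = [
    "type": "object",
    "properties": [
        "name": ["type": "string"],
        "age":  ["type": "integer"]
    ]
]
_ = converter.visit(schema: schema, name: nil)

// Emits rules along the lines of:
//   root ::= "{" space ... "}" space
//   string ::= ..., integer ::= ..., space ::= " "?
print(converter.formatGrammar())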
187
swift/JSONSchema/JSONSchema.swift
Normal file
@@ -0,0 +1,187 @@
import Foundation

public struct JSONSchema : Codable {
    public struct Items : Codable {
        let type: String
        let `enum`: [String]?

        public init(type: String, `enum`: [String]?) {
            self.type = type
            self.enum = `enum`
        }
    }
    public struct Property : Codable {
        let type: String
        let items: Items?
        let description: String?

        public init(type: String, items: Items?, description: String?) {
            self.type = type
            self.items = items
            self.description = description
        }
    }
    let type: String
    let items: Items?
    let properties: [String : Property]?

    public init(type: String, items: Items?, properties: [String : Property]?) {
        self.type = type
        self.items = items
        self.properties = properties
    }
}

public struct _JSONFunctionSchema: Codable {
    public struct Items: Codable {
        let type: String
        let `enum`: [String]?

        public init(type: Any.Type, `enum`: [String]?) {
            self.type = String(describing: type)
            self.enum = `enum`
        }
    }

    public struct Property: Codable {
        let type: String
        let items: Items?
        let `enum`: [String]?
        let description: String?

        public init(type: String.Type, description: String?) {
            self.type = "string"
            self.description = description
            self.items = nil
            self.enum = nil
        }

        public init<T: CaseIterable>(type: T.Type, description: String?) where T: RawRepresentable,
            T: StringProtocol {
            self.type = "string"
            self.enum = Array(type.allCases.map { $0.rawValue as! String })
            self.description = description
            self.items = nil
        }
    }

    public struct Parameters: Codable {
        public let properties: [String: Property]
        public let required: [String]
        public let type = "object"

        public init(properties: [String : Property], required: [String]) {
            self.properties = properties
            self.required = required
        }
    }

    let name: String
    let description: String
    let parameters: Parameters

    public init(name: String, description: String, parameters: Parameters) {
        self.name = name
        self.description = description
        self.parameters = parameters
    }
}

public protocol JSONSchemaConvertible : Codable {
    static var type: String { get }
    static var jsonSchema: [String : Any] { get }
    static func decode<K: CodingKey>(from container: KeyedDecodingContainer<K>,
                                     forKey key: K) throws -> Self
}

extension RawRepresentable where Self : CaseIterable, RawValue : JSONSchemaConvertible, Self: Codable {
    public static var type: String {
        RawValue.type
    }
    public static var jsonSchema: [String: Any] {
        [
            "type": RawValue.type,
            "enum": Self.allCases.map(\.rawValue)
        ]
    }
}

extension JSONSchemaConvertible {
    public static var items: JSONSchema.Items? {
        nil
    }
    public static var properties: [JSONSchema.Property]? {
        nil
    }
    public static var `enum`: [String]? {
        nil
    }
    public static func decode<K: CodingKey>(from container: KeyedDecodingContainer<K>, forKey key: K) throws -> Self {
        return try container.decode(Self.self, forKey: key)
    }
}
extension String : JSONSchemaConvertible {
    public static var type: String { "string" }
    public static var jsonSchema: [String: Any] {
        [
            "type": "string"
        ]
    }
}
extension Int : JSONSchemaConvertible {
    public static var type: String { "number" }
    public static var jsonSchema: [String: Any] {
        [
            "type": "integer"
        ]
    }
}
extension Double : JSONSchemaConvertible {
    public static var type: String { "number" }
    public static var jsonSchema: [String: Any] {
        [
            "type": "number"
        ]
    }
}
extension Date : JSONSchemaConvertible {
    public static var type: String { "string" }

    public static var jsonSchema: [String: Any] {
        [
            "type": "string"
        ]
    }

    public static func decode<K: CodingKey>(from container: KeyedDecodingContainer<K>, forKey key: K) throws -> Self {
        let value = try container.decode(String.self, forKey: key)
        let detector = try? NSDataDetector(types: NSTextCheckingResult.CheckingType.date.rawValue)
        let matches = detector?.matches(in: value, options: [], range: NSMakeRange(0, value.utf16.count))
        return matches!.first!.date!
        // return ISO8601DateFormatter().date(from: value)!
    }
}

extension Array : JSONSchemaConvertible where Element : JSONSchemaConvertible {
    public static var type: String { "array" }
    public static var items: JSONSchema.Items? {
        JSONSchema.Items(type: Element.type, enum: Element.enum)
    }
    public static var jsonSchema: [String : Any] {
        [
            "type": "array",
            "items": Element.jsonSchema
        ]
    }
}

@attached(member, names: arbitrary)
@attached(extension, conformances: JSONSchemaConvertible, CaseIterable, names: arbitrary)
public macro JSONSchema() = #externalMacro(module: "JSONSchemaMacros",
                                           type: "JSONSchemaMacro")
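With the macro attached, a plain struct gains both the decoding members and the JSONSchemaConvertible extension, which is what LlamaSession<T> later feeds through SchemaConverter. A hedged sketch of the intended call site (field names are illustrative):

@JSONSchema struct WeatherReport {
    let city: String
    let temperature: Int
}

// WeatherReport.jsonSchema is now ["type": "object", "properties": ...],
// so LlamaSession<WeatherReport> can constrain generation to valid JSON.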
229
swift/JSONSchemaMacros/JSONSchemaMacros.swift
Normal file
@@ -0,0 +1,229 @@
import SwiftSyntaxMacros
import SwiftCompilerPlugin
import SwiftSyntax

private struct MemberView {
    let name: String
    let type: String
    var attributeKey: String?
    var assignment: String?
}

private func view(for member: MemberBlockItemListSyntax.Element) throws -> MemberView? {
    guard let decl = member.decl.as(VariableDeclSyntax.self),
          let binding = decl.bindings.compactMap({
              $0.pattern.as(IdentifierPatternSyntax.self)
          }).first,
          let type = decl.bindings.compactMap({
              $0.typeAnnotation?.type
          }).first,
          !(type.syntaxNodeType is StructDeclSyntax.Type) else {
        return nil
    }
    var memberView = MemberView(name: "\(binding.identifier)", type: "\(type)", attributeKey: nil)
    if let macroName = decl.attributes.first?.as(AttributeSyntax.self)?
        .arguments?.as(LabeledExprListSyntax.self)?.first?.expression.as(StringLiteralExprSyntax.self) {
        memberView.attributeKey = "\(macroName.segments)"
    }
    if let assignment = decl.bindings.compactMap({
        $0.initializer?.value
    }).first {
        memberView.assignment = "\(assignment)"
    }
    return memberView
}

struct JSONSchemaMacro: ExtensionMacro, MemberMacro {
    static func expansion(of node: AttributeSyntax, providingMembersOf declaration: some DeclGroupSyntax, conformingTo protocols: [TypeSyntax], in context: some MacroExpansionContext) throws -> [DeclSyntax] {
        let members = try declaration.memberBlock.members.compactMap(view(for:))
        if declaration is EnumDeclSyntax {
            return []
        }
        return [
            """
            enum CodingKeys: CodingKey {
                case \(raw: members.map(\.name).joined(separator: ", "))
            }
            """,
            """
            init(from decoder: Decoder) throws {
                let container = try decoder.container(keyedBy: CodingKeys.self)
                \(raw: members.map {
                    """
                    self.\($0.name) = try \($0.type).decode(from: container, forKey: .\($0.name))
                    """
                }.joined(separator: "\n"))
            }
            """
        ]
    }

    static func expansion(of node: SwiftSyntax.AttributeSyntax,
                          attachedTo declaration: some SwiftSyntax.DeclGroupSyntax,
                          providingExtensionsOf type: some SwiftSyntax.TypeSyntaxProtocol,
                          conformingTo protocols: [SwiftSyntax.TypeSyntax],
                          in context: some SwiftSyntaxMacros.MacroExpansionContext) throws -> [SwiftSyntax.ExtensionDeclSyntax] {
        let members = try declaration.memberBlock.members.compactMap(view(for:))
        var inheritedTypes: [InheritedTypeSyntax] = []
        inheritedTypes.append(InheritedTypeSyntax(type: TypeSyntax("JSONSchemaConvertible")))
        if declaration is EnumDeclSyntax {
            inheritedTypes.append(InheritedTypeSyntax(type: TypeSyntax(", CaseIterable")))
        }
        let properties = members.map {
            """
            "\($0.name)": \($0.type).jsonSchema
            """
        }
        if !(declaration is EnumDeclSyntax) {
            return [
                ExtensionDeclSyntax(extendedType: type,
                                    inheritanceClause: .init(inheritedTypes: .init(inheritedTypes)),
                                    memberBlock: """
                    {
                        static var type: String {
                            "object"
                        }
                        static var jsonSchema: [String: Any] {
                            [
                                "type": "object",
                                "properties": [
                                    \(raw: properties.joined(separator: ","))
                                ]
                            ]
                        }
                    }
                    """)
            ]
        } else {
            return [
                ExtensionDeclSyntax(extendedType: type,
                                    inheritanceClause: .init(inheritedTypes: .init(inheritedTypes)),
                                    memberBlock: """
                    {
                        public static func decode<K: CodingKey>(from container: KeyedDecodingContainer<K>, forKey key: K) throws -> Self {
                            if RawValue.self is Int.Type {
                                return Self(rawValue: Int(try container.decode(String.self, forKey: key)) as! Self.RawValue)!
                            } else {
                                return try container.decode(Self.self, forKey: key)
                            }
                        }
                    }
                    """)
            ]
        }
    }
}

enum TestError: Error {
    case message(String)
}

struct LlamaActorMacro: ExtensionMacro, MemberMacro {
    static func expansion(of node: AttributeSyntax, providingMembersOf declaration: some DeclGroupSyntax, conformingTo protocols: [TypeSyntax], in context: some MacroExpansionContext) throws -> [DeclSyntax] {
        [
            """
            let session: LlamaToolSession

            public init(params: GPTParams) async throws {
                self.session = try await LlamaToolSession(params: params, tools: Self.tools)
            }
            """
        ]
    }

    static func expansion(of node: AttributeSyntax,
                          attachedTo declaration: some DeclGroupSyntax,
                          providingExtensionsOf type: some TypeSyntaxProtocol,
                          conformingTo protocols: [TypeSyntax],
                          in context: some MacroExpansionContext) throws -> [ExtensionDeclSyntax] {
        var tools: [
            (name: String,
             description: String,
             parameters: [(name: String,
                           type: String,
                           description: String)],
             callableString: String,
             callableName: String)
        ] = []
        for member in declaration.memberBlock.members {
            let comments = member.leadingTrivia.filter { $0.isComment }

            guard let member = member.decl.as(FunctionDeclSyntax.self) else {
                continue
            }
            let name = member.name
            guard case var .docLineComment(description) = comments.first else {
                throw TestError.message("Missing comment")
            }
            description = String(description.dropFirst(3))
            var parameters: [(name: String, type: String, description: String)] = []
            var index = 0
            for parameter in member.signature.parameterClause.parameters {
                let firstName = parameter.firstName.text
                let typeName = parameter.type.as(IdentifierTypeSyntax.self)!.name.text
                guard case var .docLineComment(description) = comments[index + 1] else {
                    throw TestError.message("Missing comment for \(firstName)")
                }
                description = String(description.dropFirst(3))
                parameters.append((name: firstName, type: typeName, description: description))
                index += 1
            }
            let callableName = context.makeUniqueName(name.text)
            let callableString = """
            @dynamicCallable struct \(callableName.text): DynamicCallable {
                @discardableResult
                func dynamicallyCall(withKeywordArguments args: [String: Any]) async throws -> String {
                    \(parameters.map {
                        "var \($0.name): \($0.type)!"
                    }.joined(separator: "\n"))
                    for (key, value) in args {
                        \(parameters.map {
                            "if key == \"\($0.name)\" { \($0.name) = value as! \($0.type) }"
                        }.joined(separator: "\n"))
                    }

                    let returnValue = try await \(name.text)(\(parameters.map { "\($0.name): \($0.name)" }.joined(separator: ",")))
                    let jsonValue = try JSONEncoder().encode(returnValue)
                    return String(data: jsonValue, encoding: .utf8)!
                }
            }
            """
            tools.append((name: name.text, description: description,
                          parameters: parameters,
                          callableString: callableString,
                          callableName: callableName.text))
        }

        return [
            .init(extendedType: type,
                  inheritanceClause: .init(inheritedTypes: InheritedTypeListSyntax.init(arrayLiteral: .init(type: IdentifierTypeSyntax(name: "LlamaActor")))),
                  memberBlock: """
                {
                    \(raw: tools.map {
                        $0.callableString
                    }.joined(separator: "\n"))

                    static var tools: [String: (DynamicCallable, _JSONFunctionSchema)] {
                        [\(raw: tools.map { tool in
                            """
                            "\(tool.name)": (\(tool.callableName)(), _JSONFunctionSchema(name: "\(tool.name)", description: "\(tool.description)", parameters: _JSONFunctionSchema.Parameters(properties: \(tool.parameters.count == 0 ? "[:]" : "[" + tool.parameters.map { parameter in
                                """
                                "\(parameter.name)": _JSONFunctionSchema.Property(type: \(parameter.type).self, description: "\(parameter.description)"),
                                """
                            }.joined() + "]"), required: [])))
                            """
                        }.joined(separator: ","))]
                    }
                }
                """)
        ]
    }
}

@main
struct JSONSchemaMacrosPlugin: CompilerPlugin {
    let providingMacros: [Macro.Type] = [
        JSONSchemaMacro.self, LlamaActorMacro.self
    ]
}
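LlamaActorMacro turns each doc-commented method into a generated DynamicCallable wrapper plus a `tools` entry, reading the first doc line as the function description and one further doc line per parameter. A sketch of the kind of declaration it expects; the @llamaActor attribute name is assumed here, since only the macro type, not its attribute declaration, appears in this file:

@llamaActor actor MyTools {
    /// Get the current temperature in a city
    /// the city to query
    public func getTemperature(city: String) async throws -> String {
        "24C in \(city)" // hypothetical implementation
    }
}

// The expansion adds the wrapper struct, a `tools` dictionary of
// _JSONFunctionSchema entries, and an initializer that builds a
// LlamaToolSession over those tools.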
189
swift/LlamaKit/LlamaKit.swift
Normal file
@@ -0,0 +1,189 @@
import Foundation
@_exported import JSONSchema
@_exported import LlamaObjC

public protocol DynamicCallable: Sendable {
    @discardableResult
    func dynamicallyCall(withKeywordArguments args: [String: Any]) async throws -> String
}

struct ToolCall: Decodable {
    let id: Int
    let name: String
    let arguments: [String: String]
}

struct ToolResponse<T: Encodable>: Encodable {
    let id: Int
    let result: T
}
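For orientation, ToolCall is the shape the sessions below decode out of the model's <tool_call> payload. A hypothetical payload and decode (values illustrative):

// Hypothetical model output emitted between <tool_call> tags:
let payload = #"{"id": 0, "name": "getIpAddress", "arguments": {}}"#
let call = try JSONDecoder().decode(ToolCall.self, from: payload.data(using: .utf8)!)
// call.name == "getIpAddress", call.arguments == [:]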
// MARK: LlamaChatSession
/// Standard chat session for a given LLM.
public actor LlamaChatSession {
    private let queue = BlockingLineQueue()
    private let session: LlamaObjC.LlamaSession

    public init(params: GPTParams, flush: Bool = true) async throws {
        session = LlamaObjC.LlamaSession(params: params)
        Task.detached { [session, queue] in
            session.start(queue)
        }

        // Flush the model's initial output so the first chat() call
        // returns a reply to the user's message rather than startup output.
        guard flush else { return }
        _ = queue.outputLine()
    }

    public func chat(message: String) async -> String {
        queue.addInputLine(message)
        return queue.outputLine()
    }
}
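A minimal usage sketch for LlamaChatSession (illustrative; assumes a GGUF model on disk, with GPTParams fields as exercised in main.swift below):

let params = GPTParams()
params.modelPath = "/path/to/model.gguf" // assumed local model path
let session = try await LlamaChatSession(params: params)
let reply = await session.chat(message: "Hello!")
print(reply)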

// MARK: LlamaGrammarSession
/// Chat session whose output is constrained, via a grammar generated from
/// T's JSON schema, so that every response decodes cleanly into T.
public actor LlamaSession<T: JSONSchemaConvertible> {
    private let session: LlamaChatSession

    public init(params: GPTParams) async throws {
        let converter = SchemaConverter(propOrder: [])
        _ = converter.visit(schema: T.jsonSchema, name: nil)
        params.samplerParams.grammar = converter.formatGrammar()
        session = try await LlamaChatSession(params: params)
    }

    public func chat(message: String) async throws -> T {
        let output = await session.chat(message: message).data(using: .utf8)!
        return try JSONDecoder().decode(T.self, from: output)
    }
}
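A usage sketch for the grammar-constrained session (illustrative; mirrors the Trip test in LlamaKitTests.swift below):

@JSONSchema struct Trip {
    let location: String
    let startDate: TimeInterval
    let durationInDays: Int
}

// Output is constrained by the generated grammar, so decoding into Trip succeeds:
let session = try await LlamaSession<Trip>(params: params)
let trip = try await session.chat(message: "Plan a 3-day trip to New York City.")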

// MARK: LlamaToolSession
public actor LlamaToolSession {
    private let session: LlamaChatSession

    private struct GetIpAddress: DynamicCallable {
        func dynamicallyCall(withKeywordArguments args: [String: Any]) async throws -> String {
            getIPAddress()
        }
    }

    internal static func getIPAddress() -> String {
        var address: String!

        // Get list of all interfaces on the local machine:
        var ifaddr: UnsafeMutablePointer<ifaddrs>? = nil
        if getifaddrs(&ifaddr) == 0 {
            // Loop through linked list of interfaces
            var ptr = ifaddr
            while ptr != nil {
                let interface = ptr!.pointee

                // Check if the interface is IPv4 or IPv6:
                let addrFamily = interface.ifa_addr.pointee.sa_family
                if addrFamily == UInt8(AF_INET) || addrFamily == UInt8(AF_INET6) {

                    // Convert interface name to String:
                    let name = String(cString: interface.ifa_name)

                    // Only consider non-loopback interfaces (e.g., "en0" for Wi-Fi)
                    if name == "en0" { // Typically en0 is the Wi-Fi interface
                        // Convert the address to a readable format:
                        var hostname = [CChar](repeating: 0, count: Int(NI_MAXHOST))
                        if getnameinfo(interface.ifa_addr, socklen_t(interface.ifa_addr.pointee.sa_len),
                                       &hostname, socklen_t(hostname.count),
                                       nil, socklen_t(0), NI_NUMERICHOST) == 0 {
                            address = String(cString: hostname)
                        }
                    }
                }

                ptr = interface.ifa_next
            }

            freeifaddrs(ifaddr)
        }

        return address
    }

    public private(set) var tools: [String: (DynamicCallable, _JSONFunctionSchema)]

    public init(params: GPTParams,
                tools: [String: (DynamicCallable, _JSONFunctionSchema)]) async throws {
        self.tools = tools
        let ipFnSchema = _JSONFunctionSchema(name: "getIpAddress",
                                             description: "Get the IP Address for this system",
                                             parameters: _JSONFunctionSchema.Parameters(properties: [:], required: []))
        self.tools["getIpAddress"] = (GetIpAddress(), ipFnSchema)
        let encoded = try JSONEncoder().encode(self.tools.values.map(\.1))
        let prompt = """
        You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
        <tool_call>
        {"name": <function-name>,"arguments": <args-dict>}
        </tool_call>

        Here are the available tools:
        <tools> \(String(data: encoded, encoding: .utf8)!) </tools><|eot_id|>
        """
        params.prompt = prompt
        params.interactive = true
        params.antiPrompts.append("<|eot_id|>")
        params.inputPrefix = "<|start_header_id|>user<|end_header_id|>"
        params.inputSuffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
        session = try await LlamaChatSession(params: params, flush: false)

        // Prime the session: ask a question that forces one tool call,
        // invoke the tool, and feed its result back to the model.
        let fn = await session.chat(message: "What is my IP address?")
        let toolCall = try JSONDecoder().decode(ToolCall.self, from: fn.data(using: .utf8)!)
        guard let tool = self.tools[toolCall.name] else {
            fatalError()
        }
        let resp = try await tool.0.dynamicallyCall(withKeywordArguments: toolCall.arguments)
        print(resp)

        let output = await session.chat(message: """
        <tool_response>
        {"id": \(toolCall.id), "result": \(resp)}
        </tool_response>
        """)
        print(output)
    }

    public func chat(message: String) async throws -> String {
        var nxt = await session.chat(message: message)
        let fn = nxt
        // Try to see if the output is a function call; if so, invoke the tool
        // and hand its result back to the model for the final answer.
        do {
            let toolCall = try JSONDecoder().decode(ToolCall.self, from: fn.data(using: .utf8)!)
            guard let tool = tools[toolCall.name] else {
                fatalError()
            }
            let callable = tool.0
            let resp = try await callable.dynamicallyCall(withKeywordArguments: toolCall.arguments)
            print("tool response: \(resp)")
            nxt = await session.chat(message: """
            <tool_response>
            {"id": \(toolCall.id), "result": \(resp)}
            </tool_response>
            """)
            print(nxt)
        } catch {
            print(error)
        }
        return nxt
    }
}
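A registration sketch for a custom tool (illustrative; Echo and its schema are hypothetical, and a real tool would also declare its parameters in the schema):

struct Echo: DynamicCallable {
    func dynamicallyCall(withKeywordArguments args: [String: Any]) async throws -> String {
        // Echo back whatever the model passed as "text":
        String(describing: args["text"] ?? "")
    }
}

let echoSchema = _JSONFunctionSchema(name: "echo",
                                     description: "Echo the given text back",
                                     parameters: _JSONFunctionSchema.Parameters(properties: [:], required: []))
let toolSession = try await LlamaToolSession(params: params, tools: ["echo": (Echo(), echoSchema)])
let answer = try await toolSession.chat(message: "Use the echo tool on the word hello.")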

public protocol LlamaActor: Actor {
    static var tools: [String: (DynamicCallable, _JSONFunctionSchema)] { get }
    var session: LlamaToolSession { get }
}

public extension LlamaActor {
    func chat(_ message: String) async throws -> String {
        try await session.chat(message: message)
    }
}

@attached(member, names: arbitrary)
@attached(extension, conformances: LlamaActor, names: arbitrary)
public macro llamaActor() = #externalMacro(module: "JSONSchemaMacros",
                                           type: "LlamaActorMacro")
76
swift/main/main.swift
Normal file
@@ -0,0 +1,76 @@
import LlamaKit
import WeatherKit
import CoreLocation

@llamaActor actor MyLlama {
    struct CurrentWeather: Codable {
        let temperature: Double
        let condition: WeatherCondition
    }

    /// Get the current weather in a given location.
    /// - parameter location: The city and state, e.g. San Francisco, CA
    /// - parameter unit: The unit of temperature
    public static func getCurrentWeather(location: String, unit: String) async throws -> CurrentWeather {
        let weather = try await WeatherService().weather(for: CLGeocoder().geocodeAddressString(location)[0].location!)
        var temperature = weather.currentWeather.temperature
        temperature.convert(to: .fahrenheit)
        return CurrentWeather(temperature: temperature.value,
                              condition: weather.currentWeather.condition)
    }
}

func downloadFile() async throws -> String {
    let fm = FileManager.default
    let tmpDir = fm.temporaryDirectory
    let destinationURL = tmpDir.appending(path: "llama_groq_gguf.gguf")

    guard !fm.fileExists(atPath: destinationURL.path()) else {
        return destinationURL.path()
    }
    print("Downloading Llama Tools, this may take a while...")
    // Define the URL
    guard let url = URL(string: "https://huggingface.co/bartowski/Llama-3-Groq-8B-Tool-Use-GGUF/resolve/main/Llama-3-Groq-8B-Tool-Use-Q5_K_M.gguf?download=true") else {
        print("Invalid URL.")
        throw URLError(.badURL)
    }

    // Start the async download
    let (tempURL, _) = try await URLSession.shared.download(from: url)

    // Move the downloaded file to the destination
    try fm.moveItem(at: tempURL, to: destinationURL)
    print("File downloaded to: \(destinationURL.path())")
    return destinationURL.path()
}

let params = GPTParams()
params.modelPath = try await downloadFile()
params.nPredict = 512
params.nCtx = 4096
params.cpuParams.nThreads = 8
params.cpuParamsBatch.nThreads = 8
params.nBatch = 1024
params.nGpuLayers = 1024
let llama = try await MyLlama(params: params)

// Simple REPL: read a line, send it to the model, print the reply.
while true {
    print("Enter input: ", terminator: "")

    // Read user input
    if let userInput = readLine() {
        if userInput.lowercased() == "exit" {
            print("Exiting the loop.")
            break
        } else {
            print("🧔🏽‍♂️: \(userInput)")
            let response = try await llama.chat(userInput)
            print("🤖: \(response)")
        }
    } else {
        print("Failed to read input.")
    }
}
140
swift/test/LlamaKitTests.swift
Normal file
@@ -0,0 +1,140 @@
import Foundation
import Testing
@testable import LlamaKit
import JSONSchema

// MARK: LlamaGrammarSession Suite
@Suite("LlamaGrammarSession Suite")
struct LlamaGrammarSessionSuite {
    @JSONSchema struct Trip {
        let location: String
        let startDate: TimeInterval
        let durationInDays: Int
    }

    func downloadFile() async throws -> String {
        let fm = FileManager.default
        let tmpDir = fm.temporaryDirectory
        let destinationURL = tmpDir.appending(path: "tinyllama.gguf")

        guard !fm.fileExists(atPath: destinationURL.path()) else {
            return destinationURL.path()
        }
        print("Downloading TinyLlama, this may take a while...")
        // Define the URL
        guard let url = URL(string: "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q3_K_L.gguf?download=true") else {
            print("Invalid URL.")
            throw URLError(.badURL)
        }

        // Start the async download
        let (tempURL, _) = try await URLSession.shared.download(from: url)

        // Move the downloaded file to the destination
        try fm.moveItem(at: tempURL, to: destinationURL)
        print("File downloaded to: \(destinationURL.path())")
        return destinationURL.path()
    }

    @Test func llamaGrammarSession() async throws {
        let params = GPTParams()
        params.modelPath = try await downloadFile()
        params.nPredict = 256
        params.nCtx = 1024
        params.cpuParams.nThreads = 4
        params.cpuParamsBatch.nThreads = 4
        params.nBatch = 1024
        params.nGpuLayers = 128
        params.chatTemplate = """
        <|system|>
        {system_message}</s>
        <|user|>
        {prompt}</s>
        <|assistant|>
        """
        params.prompt = """
        You are a travel agent. The current date epoch \(Date.now.timeIntervalSince1970).
        Responses should have the following fields:

        location: the location of the trip
        startDate: the start of the trip as the unix epoch since 1970
        durationInDays: the duration of the trip in days

        """
        params.interactive = true
        let session = try await LlamaSession<Trip>(params: params)
        await #expect(throws: Never.self) {
            let trip = try await session.chat(message: "Please create a trip for me to New York City that starts two weeks from now. The duration of the trip MUST be 3 days long.")
            #expect(trip.location.contains("New York"))
            // TODO: Testing the other fields is difficult considering model size,
            // so for now we are just asserting the grammar works.
        }
    }
}
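With the grammar in place the raw session output is guaranteed to parse as Trip; a hypothetical constrained response and decode (values illustrative):

// Hypothetical grammar-constrained model output:
let raw = #"{"location": "New York City", "startDate": 1735689600, "durationInDays": 3}"#
let trip = try JSONDecoder().decode(LlamaGrammarSessionSuite.Trip.self, from: raw.data(using: .utf8)!)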

import WeatherKit
import CoreLocation

@llamaActor actor MyLlama {
    struct CurrentWeather: Codable {
        let temperature: Double
        let condition: WeatherCondition
    }

    /// Get the current weather in a given location.
    /// - parameter location: The city and state, e.g. San Francisco, CA
    /// - parameter unit: The unit of temperature
    public static func getCurrentWeather(location: String, unit: String) async throws -> CurrentWeather {
        let weather = try await WeatherService().weather(for: CLGeocoder().geocodeAddressString(location)[0].location!)
        var temperature = weather.currentWeather.temperature
        temperature.convert(to: .fahrenheit)
        return CurrentWeather(temperature: temperature.value,
                              condition: weather.currentWeather.condition)
    }
}

func downloadFile() async throws -> String {
    let fm = FileManager.default
    let tmpDir = fm.temporaryDirectory
    let destinationURL = tmpDir.appending(path: "llama_groq_gguf.gguf")

    guard !fm.fileExists(atPath: destinationURL.path()) else {
        return destinationURL.path()
    }
    print("Downloading Llama Tools, this may take a while...")
    // Define the URL
    guard let url = URL(string: "https://huggingface.co/bartowski/Llama-3-Groq-8B-Tool-Use-GGUF/resolve/main/Llama-3-Groq-8B-Tool-Use-Q5_K_M.gguf?download=true") else {
        print("Invalid URL.")
        throw URLError(.badURL)
    }

    // Start the async download
    let (tempURL, _) = try await URLSession.shared.download(from: url)

    // Move the downloaded file to the destination
    try fm.moveItem(at: tempURL, to: destinationURL)
    print("File downloaded to: \(destinationURL.path())")
    return destinationURL.path()
}

@Test func llamaToolSession() async throws {
    let params = GPTParams()
    params.modelPath = try await downloadFile()
    params.nPredict = 512
    params.nCtx = 4096
    params.cpuParams.nThreads = 8
    params.cpuParamsBatch.nThreads = 8
    params.nBatch = 1024
    params.nGpuLayers = 1024
    let llama = try await MyLlama(params: params)
    let currentWeather = try await MyLlama.getCurrentWeather(location: "San Francisco, CA", unit: "fahrenheit")
    let output = try await llama.chat("What's the weather (in fahrenheit) in San Francisco, CA?")
    #expect(output.contains(String(format: "%.2f", currentWeather.temperature)))
    // #expect(output.contains(currentWeather.condition.rawValue))
}