Merge branch 'ggerganov:master' into master
commit c879b6d183
10 changed files with 64 additions and 71 deletions
@@ -2,33 +2,14 @@

 import PackageDescription

-#if arch(arm) || arch(arm64)
-let platforms: [SupportedPlatform]? = [
-    .macOS(.v12),
-    .iOS(.v14),
-    .watchOS(.v4),
-    .tvOS(.v14)
-]
-let exclude: [String] = []
-let resources: [Resource] = [
-    .process("ggml-metal.metal")
-]
-let additionalSources: [String] = ["ggml-metal.m"]
-let additionalSettings: [CSetting] = [
-    .unsafeFlags(["-fno-objc-arc"]),
-    .define("GGML_USE_METAL")
-]
-#else
-let platforms: [SupportedPlatform]? = nil
-let exclude: [String] = ["ggml-metal.metal"]
-let resources: [Resource] = []
-let additionalSources: [String] = []
-let additionalSettings: [CSetting] = []
-#endif
-
 let package = Package(
     name: "llama",
-    platforms: platforms,
+    platforms: [
+        .macOS(.v12),
+        .iOS(.v14),
+        .watchOS(.v4),
+        .tvOS(.v14)
+    ],
     products: [
         .library(name: "llama", targets: ["llama"]),
     ],
@@ -36,25 +17,30 @@ let package = Package(
         .target(
             name: "llama",
             path: ".",
-            exclude: exclude,
+            exclude: [],
             sources: [
                 "ggml.c",
                 "llama.cpp",
                 "ggml-alloc.c",
                 "ggml-backend.c",
                 "ggml-quants.c",
-            ] + additionalSources,
-            resources: resources,
+                "ggml-metal.m",
+            ],
+            resources: [
+                .process("ggml-metal.metal")
+            ],
             publicHeadersPath: "spm-headers",
             cSettings: [
                 .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_ACCELERATE")
+                .define("GGML_USE_ACCELERATE"),
+                .unsafeFlags(["-fno-objc-arc"]),
+                .define("GGML_USE_METAL"),
                 // NOTE: NEW_LAPACK will required iOS version 16.4+
                 // We should consider add this in the future when we drop support for iOS 14
                 // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
                 // .define("ACCELERATE_NEW_LAPACK"),
                 // .define("ACCELERATE_LAPACK_ILP64")
-            ] + additionalSettings,
+            ],
             linkerSettings: [
                 .linkedFramework("Accelerate")
             ]
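With this manifest the Metal backend is always compiled in and the minimum platforms are fixed in place. A minimal consumer manifest is sketched below; the dependency URL, app name, and target layout are assumptions for illustration, and only the "llama" product name comes from the diff above.

// swift-tools-version:5.5
// Hypothetical consumer Package.swift that depends on this package.
import PackageDescription

let package = Package(
    name: "MyApp",
    platforms: [.macOS(.v12), .iOS(.v14)],   // must be at least as high as the library's minimums
    dependencies: [
        .package(url: "https://github.com/ggerganov/llama.cpp", branch: "master")
    ],
    targets: [
        .executableTarget(
            name: "MyApp",
            dependencies: [.product(name: "llama", package: "llama.cpp")]
        )
    ]
)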
@@ -190,7 +190,7 @@ namespace grammar_parser {
             pos = parse_space(pos + 1, is_nested);
         } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
             if (last_sym_start == out_elements.size()) {
-                throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
+                throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
             }

             // apply transformation to previous symbol (last_sym_start to end) according to
@@ -215,9 +215,10 @@ print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end
 llama_print_timings(context)

 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-    let n_tokens = text.count + (add_bos ? 1 : 0)
+    let utf8Count = text.utf8.count
+    let n_tokens = utf8Count + (add_bos ? 1 : 0)
     let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
+    let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
     var swiftTokens: [llama_token] = []
     for i in 0 ..< tokenCount {
         swiftTokens.append(tokens[Int(i)])
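The change above sizes the token buffer by UTF-8 byte count instead of Swift Character count, since llama_tokenize reads raw bytes. A standalone illustration of why the two counts differ (not part of the example code):

// Character count vs. UTF-8 byte count: multi-byte text would under-allocate
// the buffer if text.count were used as the capacity.
let text = "née 👍"
print(text.count)       // 5 extended grapheme clusters
print(text.utf8.count)  // 9 bytes, which is what llama_tokenize actually consumes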
@@ -11,6 +11,8 @@ actor LlamaContext {
     private var context: OpaquePointer
     private var batch: llama_batch
     private var tokens_list: [llama_token]
+    /// This variable is used to store temporarily invalid cchars
+    private var temporary_invalid_cchars: [CChar]

     var n_len: Int32 = 512
     var n_cur: Int32 = 0
@@ -21,6 +23,7 @@ actor LlamaContext {
         self.context = context
         self.tokens_list = []
         self.batch = llama_batch_init(512, 0, 1)
+        self.temporary_invalid_cchars = []
     }

     deinit {
@@ -61,6 +64,7 @@ actor LlamaContext {
         print("attempting to complete \"\(text)\"")

         tokens_list = tokenize(text: text, add_bos: true)
+        temporary_invalid_cchars = []

         let n_ctx = llama_n_ctx(context)
         let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
@@ -72,7 +76,7 @@ actor LlamaContext {
         }

         for id in tokens_list {
-            print(token_to_piece(token: id))
+            print(String(cString: token_to_piece(token: id) + [0]))
         }

         // batch = llama_batch_init(512, 0) // done in init()
@@ -115,10 +119,25 @@ actor LlamaContext {

         if new_token_id == llama_token_eos(context) || n_cur == n_len {
             print("\n")
-            return ""
+            let new_token_str = String(cString: temporary_invalid_cchars + [0])
+            temporary_invalid_cchars.removeAll()
+            return new_token_str
         }

-        let new_token_str = token_to_piece(token: new_token_id)
+        let new_token_cchars = token_to_piece(token: new_token_id)
+        temporary_invalid_cchars.append(contentsOf: new_token_cchars)
+        let new_token_str: String
+        if let string = String(validatingUTF8: temporary_invalid_cchars + [0]) {
+            temporary_invalid_cchars.removeAll()
+            new_token_str = string
+        } else if (0 ..< temporary_invalid_cchars.count).contains(where: {$0 != 0 && String(validatingUTF8: Array(temporary_invalid_cchars.suffix($0)) + [0]) != nil}) {
+            // in this case, at least the suffix of the temporary_invalid_cchars can be interpreted as UTF8 string
+            let string = String(cString: temporary_invalid_cchars + [0])
+            temporary_invalid_cchars.removeAll()
+            new_token_str = string
+        } else {
+            new_token_str = ""
+        }
         print(new_token_str)
         // tokens_list.append(new_token_id)

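The temporary_invalid_cchars buffer exists because a single token's piece can end in the middle of a multi-byte UTF-8 character; bytes are therefore accumulated across tokens and only flushed once they validate. A simplified standalone sketch of that idea (names are illustrative, not the project's API, and the suffix fallback above is omitted):

// Accumulate raw CChars until they form valid UTF-8, then flush them as a String.
var pending: [CChar] = []

func emit(piece: [CChar]) -> String {
    pending.append(contentsOf: piece)
    if let s = String(validatingUTF8: pending + [0]) {
        pending.removeAll()
        return s   // complete character(s): safe to display
    }
    return ""      // still mid-character: keep buffering
}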
@@ -144,12 +163,14 @@ actor LlamaContext {

     func clear() {
         tokens_list.removeAll()
+        temporary_invalid_cchars.removeAll()
     }

     private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-        let n_tokens = text.count + (add_bos ? 1 : 0)
+        let utf8Count = text.utf8.count
+        let n_tokens = utf8Count + (add_bos ? 1 : 0)
         let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-        let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, false)
+        let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)

         var swiftTokens: [llama_token] = []
         for i in 0..<tokenCount {
@@ -161,7 +182,8 @@ actor LlamaContext {
         return swiftTokens
     }

-    private func token_to_piece(token: llama_token) -> String {
+    /// - note: The result does not contain null-terminator
+    private func token_to_piece(token: llama_token) -> [CChar] {
         let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
         result.initialize(repeating: Int8(0), count: 8)
         defer {
@@ -175,10 +197,12 @@ actor LlamaContext {
             defer {
                 newResult.deallocate()
             }
-            _ = llama_token_to_piece(model, token, newResult, -nTokens)
-            return String(cString: newResult)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
+            let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
+            return Array(bufferPointer)
         } else {
-            return String(cString: result)
+            let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
+            return Array(bufferPointer)
         }
     }
 }
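Because token_to_piece now returns a [CChar] with no trailing 0, callers append the terminator themselves before building a String, as the print loop earlier in this diff does. A small hypothetical illustration:

// Hypothetical caller: append a null terminator before constructing the String.
let piece: [CChar] = [108, 108, 97, 109, 97]   // bytes for "llama", no trailing 0
let text = String(cString: piece + [0])        // "llama"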
@@ -70,6 +70,7 @@ def make_postData(body, chat=False, stream=False):
    if(is_present(body, "mirostat_tau")): postData["mirostat_tau"] = body["mirostat_tau"]
    if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"]
    if(is_present(body, "seed")): postData["seed"] = body["seed"]
+   if(is_present(body, "grammar")): postData["grammar"] = body["grammar"]
    if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()]
    if (args.stop != ""):
        postData["stop"] = [args.stop]
@@ -2410,9 +2410,7 @@ json oaicompat_completion_params_parse(
     }

     // Handle 'stop' field
-    if (body["stop"].is_null()) {
-        llama_params["stop"] = json::array({});
-    } else if (body["stop"].is_string()) {
+    if (body.contains("stop") && body["stop"].is_string()) {
         llama_params["stop"] = json::array({body["stop"].get<std::string>()});
     } else {
         llama_params["stop"] = json_value(body, "stop", json::array());
@@ -75,7 +75,7 @@ int main(int argc, char ** argv) {
    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx) {
        LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-       LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
+       LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__);
        return 1;
    }

@@ -1083,7 +1083,7 @@ void ggml_metal_graph_compute(

                    // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                    // to the matrix-vector kernel
-                   int ne11_mm_min = 1;
+                   int ne11_mm_min = src0t == GGML_TYPE_F16 ? 1 : 16;

 #if 0
                    // the numbers below are measured on M2 Ultra for 7B and 13B models
ggml.c (28 changes)
@@ -15629,7 +15629,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            } break;
        case GGML_OP_DIAG_MASK_ZERO:
        case GGML_OP_DIAG_MASK_INF:
-       case GGML_OP_SOFT_MAX:
        case GGML_OP_SOFT_MAX_BACK:
        case GGML_OP_ROPE:
        case GGML_OP_ROPE_BACK:
@@ -15645,6 +15644,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            {
                n_tasks = 1; //TODO
            } break;
+       case GGML_OP_SOFT_MAX:
+           {
+               n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+           } break;
        case GGML_OP_CONV_TRANSPOSE_1D:
            {
                n_tasks = n_threads;
@@ -15876,18 +15879,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {

    // thread scheduling for the different operations + work buffer size estimation
    for (int i = 0; i < cgraph->n_nodes; i++) {
-       int n_tasks = 1;
-
        struct ggml_tensor * node = cgraph->nodes[i];

+       const int n_tasks = ggml_get_n_tasks(node, n_threads);
+
        size_t cur = 0;

        switch (node->op) {
            case GGML_OP_CPY:
            case GGML_OP_DUP:
                {
-                   n_tasks = n_threads;
-
                    if (ggml_is_quantized(node->type)) {
                        cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                    }
@@ -15895,16 +15896,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
            case GGML_OP_ADD:
            case GGML_OP_ADD1:
                {
-                   n_tasks = n_threads;
-
                    if (ggml_is_quantized(node->src[0]->type)) {
                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                    }
                } break;
            case GGML_OP_ACC:
                {
-                   n_tasks = n_threads;
-
                    if (ggml_is_quantized(node->src[0]->type)) {
                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                    }
@@ -15932,16 +15929,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                } break;
            case GGML_OP_OUT_PROD:
                {
-                   n_tasks = n_threads;
-
                    if (ggml_is_quantized(node->src[0]->type)) {
                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                    }
                } break;
            case GGML_OP_SOFT_MAX:
                {
-                   n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
-
                    cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                } break;
            case GGML_OP_CONV_TRANSPOSE_1D:
@@ -15971,7 +15964,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                } break;
            case GGML_OP_IM2COL:
                {
-                   n_tasks = n_threads;
                } break;
            case GGML_OP_CONV_TRANSPOSE_2D:
                {
@@ -15989,8 +15981,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                } break;
            case GGML_OP_FLASH_ATTN:
                {
-                   n_tasks = n_threads;
-
                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);

                    if (node->src[1]->type == GGML_TYPE_F32) {
@@ -16003,8 +15993,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                } break;
            case GGML_OP_FLASH_FF:
                {
-                   n_tasks = n_threads;
-
                    if (node->src[1]->type == GGML_TYPE_F32) {
                        cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
@@ -16015,8 +16003,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                } break;
            case GGML_OP_FLASH_ATTN_BACK:
                {
-                   n_tasks = n_threads;
-
                    const int64_t D = node->src[0]->ne[0];
                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
                    const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
@@ -16031,8 +16017,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {

            case GGML_OP_CROSS_ENTROPY_LOSS:
                {
-                   n_tasks = n_threads;
-
                    cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
                } break;
            case GGML_OP_COUNT:
@@ -5744,8 +5744,7 @@ static int llama_decode_internal(
            // a heuristic, to avoid attending the full cache if it is not yet utilized
            // after enough generations, the benefit from this heuristic disappears
            // if we start defragmenting the cache, the benefit from this will be more important
-           //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
-           kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
+           kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));

            //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

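The new expression pads the highest used KV-cache cell count up to a multiple of 32 (GGML_PAD), keeps a floor of 32, and clamps to the context size. A Swift sketch of just that arithmetic, for illustration only (the actual code is the C++ line above):

// Illustration of min(n_ctx, max(32, GGML_PAD(cell_max, 32))).
func kvCacheN(cellMax: Int, nCtx: Int) -> Int {
    let padded = ((cellMax + 31) / 32) * 32   // GGML_PAD(cellMax, 32): round up to a multiple of 32
    return min(nCtx, max(32, padded))
}
// e.g. kvCacheN(cellMax: 45, nCtx: 512) == 64; kvCacheN(cellMax: 700, nCtx: 512) == 512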