llama : pimpl llama_model

ggml-ci
Georgi Gerganov 2025-01-07 15:36:39 +02:00
parent a16daa9552
commit a15e22537f
6 changed files with 358 additions and 327 deletions
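
The change applies the pimpl (pointer-to-implementation) idiom to llama_model: the header keeps only a forward-declared struct impl and a std::unique_ptr<impl>, the member definitions and the heavier includes (llama-mmap.h, ggml-cpp.h, <map>) move into llama-model.cpp, and callers go through accessors such as dev_layer(), dev_output() and n_elements(). A minimal sketch of the idiom, using a hypothetical widget class rather than the real llama_model:

// widget.h -- public header: only a forward declaration, no heavy includes
#include <cstdint>
#include <memory>

struct widget {
    widget();
    ~widget();                    // declared here, defined where impl is a complete type
    uint64_t n_elements() const;  // accessor instead of a public data member
private:
    struct impl;                  // definition lives only in widget.cpp
    std::unique_ptr<impl> pimpl;
};

// widget.cpp -- implementation file, free to include whatever the internals need
struct widget::impl {
    uint64_t n_elements = 0;      // state hidden from users of widget.h
};

widget::widget() : pimpl(std::make_unique<impl>()) {}
widget::~widget() = default;      // ~unique_ptr<impl> is instantiated here

uint64_t widget::n_elements() const { return pimpl->n_elements; }

Because ~unique_ptr<impl> needs the complete type, the destructor has to be declared in the header and defined in the source file, which is why the commit also adds an explicit ~llama_model().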

src/llama-adapter.cpp

@@ -1,5 +1,6 @@
#include "llama-adapter.h"
+#include "llama-mmap.h"
#include "llama-model.h"
#include <algorithm>

src/llama-context.cpp

@@ -1,5 +1,7 @@
#include "llama-context.h"
+#include "llama-mmap.h"
#include <cassert>
#include <cmath>
#include <cstring>
@@ -504,7 +506,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
auto * buft = ggml_backend_cpu_buffer_type();
// try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
-auto * output_dev = lctx.model.dev_output.dev;
+auto * output_dev = lctx.model.dev_output();
auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
if (output_dev_host_buft) {
buft = output_dev_host_buft;

src/llama-kv-cache.cpp

@@ -79,7 +79,7 @@ bool llama_kv_cache_init(
ggml_backend_buffer_type_t buft;
if (offload) {
-auto * dev = model.dev_layer.at(i).dev;
+auto * dev = model.dev_layer(i);
buft = ggml_backend_dev_buffer_type(dev);
} else {
buft = ggml_backend_cpu_buffer_type();

src/llama-model.cpp

@@ -1,14 +1,18 @@
#include "llama-model.h"
#include "llama-impl.h"
+#include "llama-mmap.h"
#include "llama-model-loader.h"
+#include "ggml-cpp.h"
#include "unicode.h" // TODO: remove
#include <algorithm>
#include <cassert>
#include <cstring>
#include <functional>
+#include <map>
#include <sstream>
#include <stdexcept>
@@ -106,12 +110,273 @@ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::st
return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}
-llama_model::llama_model(const struct llama_model_params & params) : params(params) {
+// checks if the weight tensor can be used with the specified buffer type and device
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
GGML_ASSERT(w != nullptr);
if (op == GGML_OP_NONE) {
return true;
}
ggml_init_params params = {
/*.mem_size =*/ ggml_tensor_overhead()*8,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ggml_context_ptr ctx_ptr { ggml_init(params) };
if (!ctx_ptr) {
throw std::runtime_error(format("failed to create ggml context"));
}
ggml_context * ctx = ctx_ptr.get();
ggml_tensor * op_tensor = nullptr;
switch (op) {
case GGML_OP_GET_ROWS:
{
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
op_tensor = ggml_get_rows(ctx, w, b);
} break;
case GGML_OP_MUL_MAT:
{
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
op_tensor = ggml_mul_mat(ctx, w, b);
} break;
case GGML_OP_MUL_MAT_ID:
{
int n_expert_used = hparams.n_expert_used;
ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
} break;
case GGML_OP_ADD:
{
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
op_tensor = ggml_add(ctx, a, w);
} break;
case GGML_OP_MUL:
{
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
op_tensor = ggml_mul(ctx, a, w);
} break;
case GGML_OP_DIV:
{
ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
op_tensor = ggml_div(ctx, a, w);
} break;
case GGML_OP_ROPE:
{
int n_embd_head = hparams.n_embd_head_v;
int n_head = hparams.n_head();
ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
op_tensor = ggml_rope_ext(
ctx, a, b, w,
0, 0, 0, 0, 0,
0, 0, 0, 0
);
} break;
case GGML_OP_SSM_CONV:
{
// FIXME
ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
op_tensor = ggml_ssm_conv(ctx, conv_x, w);
} break;
case GGML_OP_SSM_SCAN:
{
// FIXME
const int64_t d_state = w->ne[0];
const int64_t d_inner = w->ne[1];
const int64_t n_seq_tokens = 512;
const int64_t n_seqs = 1;
ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
} break;
case GGML_OP_RWKV_WKV6:
{
// FIXME
const int64_t S = 123;
const int64_t H = 123;
const int64_t n_tokens = 123;
const int64_t n_seqs = 123;
ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
ggml_tensor * tf = w;
ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
} break;
case GGML_OP_IM2COL:
{
const int n_embd = hparams.n_embd;
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
} break;
default:
GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
}
// create a temporary dummy buffer for the weight so that supports_op can check the buffer type
GGML_ASSERT(w->buffer == nullptr);
w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
ggml_backend_buffer_free(w->buffer);
w->buffer = nullptr;
return op_supported;
}
// lists of buffer types used for each layer
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
// find the first buffer type in the list that can use the tensor
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
GGML_ASSERT(!buft_list.empty());
for (const auto & cur : buft_list) {
ggml_backend_dev_t cur_dev = cur.first;
ggml_backend_buffer_type_t cur_buft = cur.second;
if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
return cur_buft;
}
}
return nullptr;
}
// CPU: ACCEL -> CPU extra -> GPU host -> CPU
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
buft_list_t buft_list;
// add ACCEL buffer types
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
auto * buft = ggml_backend_dev_buffer_type(dev);
// skip
if (buft != ggml_backend_cpu_buffer_type()) {
buft_list.emplace_back(dev, buft);
}
}
}
// add extra buffer types
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
if (ggml_backend_dev_get_extra_bufts_fn) {
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
while (extra_bufts && *extra_bufts) {
buft_list.emplace_back(cpu_dev, *extra_bufts);
++extra_bufts;
}
}
// add a host buffer type
// storing the tensors in a host buffer is useful when the processing of large batches
// is offloaded to a GPU device, since it reduces the time spent on data transfers
// generally, this will be done using the first device in the list
// a better approach would be to handle this on a weight-by-weight basis using the offload_op
// function of the device to determine if it would benefit from being stored in a host buffer
for (auto * dev : devices) {
ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
if (buft) {
buft_list.emplace_back(dev, buft);
break;
}
}
// add the CPU buffer type
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
}
}
return buft_list;
}
// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_mode split_mode, const float * tensor_split) {
buft_list_t buft_list;
// add the device split buffer type if requested and available
if (split_mode == LLAMA_SPLIT_MODE_ROW) {
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
if (ggml_backend_split_buffer_type_fn) {
size_t dev_index = [&]() {
auto * reg = ggml_backend_dev_backend_reg(dev);
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
if (ggml_backend_reg_dev_get(reg, i) == dev) {
return i;
}
}
throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
}();
auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
if (buft != nullptr) {
buft_list.emplace_back(dev, buft);
}
}
}
// add the device default buffer type
buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
return buft_list;
}
struct llama_model::impl {
uint64_t n_elements = 0;
size_t n_bytes = 0;
std::string desc_str;
// model memory mapped files
llama_mmaps mappings;
// objects representing data potentially being locked in memory
llama_mlocks mlock_bufs;
llama_mlocks mlock_mmaps;
// contexts where the model tensors metadata is stored
std::vector<ggml_context_ptr> ctxs;
// the model memory buffers for the tensor data
std::vector<ggml_backend_buffer_ptr> bufs;
buft_list_t cpu_buft_list;
std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
struct layer_dev {
ggml_backend_dev_t dev;
buft_list_t * buft_list;
};
layer_dev dev_input = {};
layer_dev dev_output = {};
std::vector<layer_dev> dev_layer;
};
llama_model::llama_model(const struct llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
}
llama_model::~llama_model() = default;
void llama_model::load_stats(llama_model_loader & ml) {
-n_elements = ml.n_elements;
-n_bytes = ml.n_bytes;
+pimpl->n_elements = ml.n_elements;
+pimpl->n_bytes = ml.n_bytes;
}
void llama_model::load_arch(llama_model_loader & ml) {
@@ -976,9 +1241,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: throw std::runtime_error("unsupported model architecture");
}
-n_bytes = ml.n_bytes;
-desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
+pimpl->n_bytes = ml.n_bytes;
+pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
if (hparams.f_max_alibi_bias > 0.0f) {
hparams.use_alibi = true;
@@ -1698,128 +1963,6 @@ void llama_model::load_vocab(llama_model_loader & ml) {
}
}
// checks if the weight tensor can be used with the specified buffer type and device
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
GGML_ASSERT(w != nullptr);
if (op == GGML_OP_NONE) {
return true;
}
ggml_init_params params = {
/*.mem_size =*/ ggml_tensor_overhead()*8,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ggml_context_ptr ctx_ptr { ggml_init(params) };
if (!ctx_ptr) {
throw std::runtime_error(format("failed to create ggml context"));
}
ggml_context * ctx = ctx_ptr.get();
ggml_tensor * op_tensor = nullptr;
switch (op) {
case GGML_OP_GET_ROWS:
{
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
op_tensor = ggml_get_rows(ctx, w, b);
} break;
case GGML_OP_MUL_MAT:
{
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
op_tensor = ggml_mul_mat(ctx, w, b);
} break;
case GGML_OP_MUL_MAT_ID:
{
int n_expert_used = hparams.n_expert_used;
ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
} break;
case GGML_OP_ADD:
{
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
op_tensor = ggml_add(ctx, a, w);
} break;
case GGML_OP_MUL:
{
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
op_tensor = ggml_mul(ctx, a, w);
} break;
case GGML_OP_DIV:
{
ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
op_tensor = ggml_div(ctx, a, w);
} break;
case GGML_OP_ROPE:
{
int n_embd_head = hparams.n_embd_head_v;
int n_head = hparams.n_head();
ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
op_tensor = ggml_rope_ext(
ctx, a, b, w,
0, 0, 0, 0, 0,
0, 0, 0, 0
);
} break;
case GGML_OP_SSM_CONV:
{
// FIXME
ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
op_tensor = ggml_ssm_conv(ctx, conv_x, w);
} break;
case GGML_OP_SSM_SCAN:
{
// FIXME
const int64_t d_state = w->ne[0];
const int64_t d_inner = w->ne[1];
const int64_t n_seq_tokens = 512;
const int64_t n_seqs = 1;
ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
} break;
case GGML_OP_RWKV_WKV6:
{
// FIXME
const int64_t S = 123;
const int64_t H = 123;
const int64_t n_tokens = 123;
const int64_t n_seqs = 123;
ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
ggml_tensor * tf = w;
ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
} break;
case GGML_OP_IM2COL:
{
const int n_embd = hparams.n_embd;
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
} break;
default:
GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
}
// create a temporary dummy buffer for the weight so that supports_op can check the buffer type
GGML_ASSERT(w->buffer == nullptr);
w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
ggml_backend_buffer_free(w->buffer);
w->buffer = nullptr;
return op_supported;
}
bool llama_model::load_tensors(llama_model_loader & ml) {
const auto & split_mode = params.split_mode;
const auto & n_gpu_layers = params.n_gpu_layers;
@@ -1831,20 +1974,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const bool use_mmap_buffer = true;
// build a list of buffer types for the CPU and GPU devices
-cpu_buft_list = make_cpu_buft_list();
+pimpl->cpu_buft_list = make_cpu_buft_list(devices);
for (auto * dev : devices) {
buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
// add CPU buffer types as a fallback
-buft_list.insert(buft_list.end(), cpu_buft_list.begin(), cpu_buft_list.end());
-gpu_buft_list.emplace(dev, std::move(buft_list));
+buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
+pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
}
// calculate the split points
-bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_device(), [](float x) { return x == 0.0f; });
-std::vector<float> splits(n_device());
+bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
+std::vector<float> splits(n_devices());
if (all_zero) {
// default split, by free memory
-for (size_t i = 0; i < n_device(); ++i) {
+for (size_t i = 0; i < n_devices(); ++i) {
ggml_backend_dev_t dev = devices[i];
size_t total;
size_t free;
@@ -1852,42 +1995,43 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
splits[i] = free;
}
} else {
-std::copy(tensor_split, tensor_split + n_device(), splits.begin());
+std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
}
// sum and normalize the splits to get the split points
float split_sum = 0.0f;
-for (size_t i = 0; i < n_device(); ++i) {
+for (size_t i = 0; i < n_devices(); ++i) {
split_sum += splits[i];
splits[i] = split_sum;
}
-for (size_t i = 0; i < n_device(); ++i) {
+for (size_t i = 0; i < n_devices(); ++i) {
splits[i] /= split_sum;
}
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
-auto get_layer_buft_list = [&](int il) -> llama_model::layer_dev {
+auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
-return {cpu_dev, &cpu_buft_list};
+return {cpu_dev, &pimpl->cpu_buft_list};
}
-const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_device(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
+const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
auto * dev = devices.at(layer_gpu);
-return {dev, &gpu_buft_list.at(dev)};
+return {dev, &pimpl->gpu_buft_list.at(dev)};
};
// assign the input layer
// there is very little benefit to offloading the input layer, so always keep it on the CPU
-dev_input = { cpu_dev, &cpu_buft_list };
+pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
// assign the repeating layers to the devices according to the splits
-dev_layer.resize(n_layer);
+pimpl->dev_layer.resize(n_layer);
for (int il = 0; il < n_layer; ++il) {
-dev_layer[il] = get_layer_buft_list(il);
+pimpl->dev_layer[il] = get_layer_buft_list(il);
}
// assign the output layer
-dev_output = get_layer_buft_list(n_layer);
+pimpl->dev_output = get_layer_buft_list(n_layer);
// one ggml context per buffer type
int max_n_tensors = ml.n_tensors;
@@ -1904,12 +2048,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ggml_context * ctx = ggml_init(params);
if (!ctx) {
throw std::runtime_error(format("failed to create ggml context"));
}
ctx_map[buft] = ctx;
-ctxs.emplace_back(ctx);
+pimpl->ctxs.emplace_back(ctx);
return ctx;
}
return it->second;
@@ -1992,22 +2139,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
// select the buffer type for this tensor
-llama_model::buft_list_t * buft_list;
+buft_list_t * buft_list;
switch (info.layer) {
case LLM_TENSOR_LAYER_INPUT:
-buft_list = dev_input.buft_list;
+buft_list = pimpl->dev_input.buft_list;
break;
case LLM_TENSOR_LAYER_OUTPUT:
-buft_list = dev_output.buft_list;
+buft_list = pimpl->dev_output.buft_list;
break;
case LLM_TENSOR_LAYER_REPEATING:
-buft_list = dev_layer.at(tn.bid).buft_list;
+buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
break;
default:
GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
}
-ggml_backend_buffer_type_t buft = select_weight_buft(t_meta, op, *buft_list);
+ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list);
if (!buft) {
throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
}
@@ -3924,8 +4071,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
ml.done_getting_tensors();
-ml.init_mappings(true, use_mlock ? &mlock_mmaps : nullptr);
-mappings.reserve(ml.mappings.size());
+ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
+pimpl->mappings.reserve(ml.mappings.size());
// create the backend buffers
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
@@ -3933,7 +4080,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// Ensure we have enough capacity for the maximum backend buffer we will potentially create
const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
-bufs.reserve(n_max_backend_buffer);
+pimpl->bufs.reserve(n_max_backend_buffer);
for (auto & it : ctx_map) {
ggml_backend_buffer_type_t buft = it.first;
@@ -3974,7 +4121,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
if (buf == nullptr) {
throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
}
-bufs.emplace_back(buf);
+pimpl->bufs.emplace_back(buf);
buf_map.emplace(idx, buf);
}
}
@@ -3983,10 +4130,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
if (buf == nullptr) {
throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
}
-bufs.emplace_back(buf);
+pimpl->bufs.emplace_back(buf);
if (use_mlock && ggml_backend_buffer_is_host(buf)) {
-mlock_bufs.emplace_back(new llama_mlock);
-auto & mlock_buf = mlock_bufs.back();
+pimpl->mlock_bufs.emplace_back(new llama_mlock);
+auto & mlock_buf = pimpl->mlock_bufs.back();
mlock_buf->init (ggml_backend_buffer_get_base(buf));
mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
}
@@ -3995,7 +4142,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
-if (bufs.empty()) {
+if (pimpl->bufs.empty()) {
throw std::runtime_error("failed to allocate buffer");
}
@@ -4023,12 +4170,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
// print memory requirements per buffer type
-for (auto & buf : bufs) {
+for (auto & buf : pimpl->bufs) {
LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
}
// populate tensors_by_name
-for (auto & ctx : ctxs) {
+for (auto & ctx : pimpl->ctxs) {
for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
tensors_by_name.emplace_back(ggml_get_name(cur), cur);
}
@@ -4038,14 +4185,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
for (auto & it : ctx_bufs) {
ggml_context * ctx = it.first;
auto & bufs = it.second;
-if (!ml.load_all_data(ctx, bufs, use_mlock ? &mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
return false;
}
}
if (use_mmap_buffer) {
for (auto & mapping : ml.mappings) {
-mappings.emplace_back(std::move(mapping));
+pimpl->mappings.emplace_back(std::move(mapping));
}
}
@@ -4061,21 +4208,25 @@ std::string llama_model::type_name() const {
}
std::string llama_model::desc() const {
-return desc_str;
+return pimpl->desc_str;
}
size_t llama_model::size() const {
-return n_bytes;
+return pimpl->n_bytes;
}
size_t llama_model::max_nodes() const {
return std::max<size_t>(8192, tensors_by_name.size()*5);
}
-size_t llama_model::n_device() const {
+size_t llama_model::n_devices() const {
return devices.size();
}
uint64_t llama_model::n_elements() const {
return pimpl->n_elements;
}
void llama_model::print_info() const {
const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
@@ -4152,14 +4303,14 @@ void llama_model::print_info() const {
}
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
-if (n_elements >= 1e12) {
-LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, n_elements*1e-12);
-} else if (n_elements >= 1e9) {
-LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, n_elements*1e-9);
-} else if (n_elements >= 1e6) {
-LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, n_elements*1e-6);
+if (pimpl->n_elements >= 1e12) {
+LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
+} else if (pimpl->n_elements >= 1e9) {
+LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
+} else if (pimpl->n_elements >= 1e6) {
+LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
} else {
-LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, n_elements*1e-3);
+LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
}
// general kv
@@ -4222,6 +4373,14 @@ void llama_model::print_info() const {
}
}
ggml_backend_dev_t llama_model::dev_layer(int il) const {
return pimpl->dev_layer.at(il).dev;
}
ggml_backend_dev_t llama_model::dev_output() const {
return pimpl->dev_output.dev;
}
template<typename F>
static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
ggml_init_params params = {
@@ -4250,7 +4409,7 @@ static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t d
}
template<typename F>
-static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
+static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
for (const auto & cur : buft_list) {
ggml_backend_dev_t cur_dev = cur.first;
ggml_backend_buffer_type_t cur_buft = cur.second;
@@ -4264,7 +4423,7 @@ static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & b
ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
return ::select_buft(
-*dev_layer.at(il).buft_list,
+*pimpl->dev_layer.at(il).buft_list,
[&](ggml_context * ctx) {
ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
@@ -4301,104 +4460,6 @@ std::string llama_model::token_to_piece(llama_token token, bool special) const {
return piece;
}
// find the first buffer type in the list that can use the tensor
ggml_backend_buffer_type_t llama_model::select_weight_buft(ggml_tensor * tensor, ggml_op op, const llama_model::buft_list_t & buft_list) const {
GGML_ASSERT(!buft_list.empty());
for (const auto & cur : buft_list) {
ggml_backend_dev_t cur_dev = cur.first;
ggml_backend_buffer_type_t cur_buft = cur.second;
if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
return cur_buft;
}
}
return nullptr;
}
// CPU: ACCEL -> CPU extra -> GPU host -> CPU
llama_model::buft_list_t llama_model::make_cpu_buft_list() const {
buft_list_t buft_list;
// add ACCEL buffer types
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
auto * buft = ggml_backend_dev_buffer_type(dev);
// skip
if (buft != ggml_backend_cpu_buffer_type()) {
buft_list.emplace_back(dev, buft);
}
}
}
// add extra buffer types
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
if (ggml_backend_dev_get_extra_bufts_fn) {
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
while (extra_bufts && *extra_bufts) {
buft_list.emplace_back(cpu_dev, *extra_bufts);
++extra_bufts;
}
}
// add a host buffer type
// storing the tensors in a host buffer is useful when the processing of large batches
// is offloaded to a GPU device, since it reduces the time spent on data transfers
// generally, this will be done using the first device in the list
// a better approach would be to handle this on a weight-by-weight basis using the offload_op
// function of the device to determine if it would benefit from being stored in a host buffer
for (auto * dev : devices) {
ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
if (buft) {
buft_list.emplace_back(dev, buft);
break;
}
}
// add the CPU buffer type
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
}
}
return buft_list;
}
llama_model::buft_list_t llama_model::make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_mode split_mode, const float * tensor_split) {
buft_list_t buft_list;
// add the device split buffer type if requested and available
if (split_mode == LLAMA_SPLIT_MODE_ROW) {
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
if (ggml_backend_split_buffer_type_fn) {
size_t dev_index = [&]() {
auto * reg = ggml_backend_dev_backend_reg(dev);
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
if (ggml_backend_reg_dev_get(reg, i) == dev) {
return i;
}
}
throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
}();
auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
if (buft != nullptr) {
buft_list.emplace_back(dev, buft);
}
}
}
// add the device default buffer type
buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
return buft_list;
}
//
// interface implementation
//
@@ -4590,7 +4651,7 @@ uint64_t llama_model_size(const struct llama_model * model) {
}
uint64_t llama_model_n_params(const struct llama_model * model) {
-return model->n_elements;
+return model->n_elements();
}
bool llama_model_has_encoder(const struct llama_model * model) {

src/llama-model.h

@@ -4,10 +4,10 @@
#include "llama-arch.h"
#include "llama-hparams.h"
#include "llama-vocab.h"
-#include "llama-mmap.h"
-#include "ggml-cpp.h"
+#include <memory>
+#include <string>
+#include <unordered_map>
#include <vector>
struct llama_model_loader;
@@ -319,53 +319,24 @@ struct llama_model {
std::vector<llama_layer> layers;
+llama_model_params params;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
-llama_model_params params;
std::vector<std::string> rpc_servers;
// list of devices used in this model
std::vector<ggml_backend_dev_t> devices;
// lists of buffer types used for each layer
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
buft_list_t cpu_buft_list;
std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
struct layer_dev {
ggml_backend_dev_t dev;
buft_list_t * buft_list;
};
layer_dev dev_input = {};
layer_dev dev_output = {};
std::vector<layer_dev> dev_layer;
// contexts where the model tensors metadata is stored
std::vector<ggml_context_ptr> ctxs;
// the model memory buffers for the tensor data
std::vector<ggml_backend_buffer_ptr> bufs;
// model memory mapped files
llama_mmaps mappings;
// objects representing data potentially being locked in memory
llama_mlocks mlock_bufs;
llama_mlocks mlock_mmaps;
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
int64_t t_load_us = 0;
int64_t t_start_us = 0;
// total number of parameters in the model
uint64_t n_elements = 0;
llama_model(const struct llama_model_params & params);
+~llama_model();
void load_stats (llama_model_loader & ml);
void load_arch (llama_model_loader & ml);
@@ -380,29 +351,25 @@ struct llama_model {
size_t size() const;
size_t max_nodes() const;
-size_t n_device() const;
+size_t n_devices() const;
+// total number of parameters in the model
+uint64_t n_elements() const;
void print_info() const;
+ggml_backend_dev_t dev_layer(int il) const;
+ggml_backend_dev_t dev_output() const;
ggml_backend_buffer_type_t select_buft(int il) const;
const struct ggml_tensor * get_tensor(const char * name) const;
private:
-size_t n_bytes = 0;
-std::string desc_str;
+struct impl;
+std::unique_ptr<impl> pimpl;
std::string token_to_piece(llama_token token, bool special) const;
// find the first buffer type in the list that can use the tensor
ggml_backend_buffer_type_t select_weight_buft(ggml_tensor * tensor, ggml_op op, const llama_model::buft_list_t & buft_list) const;
// CPU: ACCEL -> CPU extra -> GPU host -> CPU
buft_list_t make_cpu_buft_list() const;
// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_mode split_mode, const float * tensor_split);
};
const char * llm_type_name(llm_type type);
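
Call sites in the rest of the tree switch from direct member access (model.dev_layer.at(il).dev, model->n_elements) to the new accessors, as the hunks in src/llama.cpp below show. A hypothetical caller-side helper, purely for illustration and not part of this patch:

#include "llama-model.h"
#include "llama-impl.h" // LLAMA_LOG_INFO

// illustrative only: report where each repeating layer ended up,
// using the accessors instead of reaching into llama_model internals
static void log_layer_placement(const llama_model & model) {
    LLAMA_LOG_INFO("model params = %llu, devices = %zu\n",
            (unsigned long long) model.n_elements(), model.n_devices());
    for (int il = 0; il < (int) model.hparams.n_layer; ++il) {
        ggml_backend_dev_t dev = model.dev_layer(il); // was: model.dev_layer.at(il).dev
        LLAMA_LOG_INFO("layer %3d -> %s\n", il, ggml_backend_dev_name(dev));
    }
}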

src/llama.cpp

@@ -8166,9 +8166,9 @@ static struct ggml_cgraph * llama_build_graph(
const bool full_offload = lctx.model.params.n_gpu_layers > (int) lctx.model.hparams.n_layer;
if (ubatch.n_tokens < 32 || full_offload) {
if (il != -1 && strcmp(name, "norm") == 0) {
-const auto & dev_layer = lctx.model.dev_layer.at(il);
+const auto & dev_layer = lctx.model.dev_layer(il);
for (auto & backend : lctx.backends) {
-if (ggml_backend_get_device(backend.get()) == dev_layer.dev) {
+if (ggml_backend_get_device(backend.get()) == dev_layer) {
if (ggml_backend_supports_op(backend.get(), cur)) {
ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, backend.get());
}
@@ -9490,7 +9490,7 @@ struct llama_model * llama_model_load_from_file(
LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
}
-int status = llama_model_load(path_model, *model, params);
+const int status = llama_model_load(path_model, *model, params);
GGML_ASSERT(status <= 0);
if (status < 0) {
if (status == -1) {
@@ -9772,7 +9772,7 @@ struct llama_context * llama_new_context_with_model(
// TODO: move these checks to ggml_backend_sched
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
-model->n_device() > 1 &&
+model->n_devices() > 1 &&
model->params.n_gpu_layers > (int)model->hparams.n_layer &&
model->params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
params.offload_kqv;