Merge branch 'master' into finetuning-acessability
commit 4af9a7d6d9
14 changed files with 1338 additions and 598 deletions
Makefile (2 lines changed)
@@ -259,7 +259,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch build-info.h
+	rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch build-info.h
 
 #
 # Examples
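As a quick sketch of how the updated `clean` rule is typically exercised (assuming only the default targets shown in the hunk header, no optional backends):

```bash
# Remove stale objects and the shared library, then rebuild the libllama.so target.
make clean
make libllama.so
```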
@@ -331,6 +331,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             }
 #else
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+        } else if (arg == "--low-vram" || arg == "-lv") {
+#ifdef GGML_USE_CUBLAS
+            params.low_vram = true;
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 #endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
@@ -406,6 +412,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         gpt_print_usage(argc, argv, default_params);
         exit(1);
     }
 
+#ifdef GGML_USE_CUBLAS
+    if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
+        fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
+        exit(1);
+    }
+#endif // GGML_USE_CUBLAS
+
     if (escape_prompt) {
         process_escapes(params.prompt);
     }
@@ -479,6 +493,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
     fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
+    fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
 #endif
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
@@ -528,6 +543,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     lparams.n_gpu_layers = params.n_gpu_layers;
     lparams.main_gpu = params.main_gpu;
     memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
+    lparams.low_vram = params.low_vram;
     lparams.seed = params.seed;
     lparams.f16_kv = params.memory_f16;
     lparams.use_mmap = params.use_mmap;
@@ -21,15 +21,16 @@
 int32_t get_num_physical_cores();
 
 struct gpt_params {
     int32_t seed = -1; // RNG seed
     int32_t n_threads = get_num_physical_cores();
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 512; // context size
     int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_gpu_layers = 0; // number of layers to store in VRAM
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
+    bool low_vram = 0; // if true, reduce VRAM usage at the cost of performance
 
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
@@ -288,5 +288,6 @@ These options provide extra functionality and customization when running the LLa
 - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
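As an illustration of the GPU-related flags documented above, a hypothetical invocation might look like the following (the model path and split proportions are placeholders, not part of this change):

```bash
# Offload 32 layers, keep scratch/small tensors on GPU 0, split large tensors 3:1
# across two GPUs, and skip the VRAM scratch buffer; requires a cuBLAS build.
./main -m models/7B/ggml-model.bin -ngl 32 -mg 0 -ts 3,1 -lv
```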
@@ -16,6 +16,10 @@ This example allow you to have a llama.cpp http server to interact from a web pa
 To get started right away, run the following command, making sure to use the correct path for the model you have:
 
 #### Unix-based systems (Linux, macOS, etc.):
+Make sure to build with the server option on
+```bash
+LLAMA_BUILD_SERVER=1 make
+```
 
 ```bash
 ./server -m models/7B/ggml-model.bin --ctx_size 2048

@@ -289,6 +293,7 @@ Test();
 - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
 - `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**.
 - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`;
 - `--port`: Set the port to listen. Default: `8080`.
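A hypothetical end-to-end example combining the build step and the server options documented above (model path, host, and port values are placeholders):

```bash
# Build the server binary, then serve a model with 32 layers offloaded to the GPU
# and no VRAM scratch buffer (requires cuBLAS); listen on all interfaces, port 8080.
LLAMA_BUILD_SERVER=1 make
./server -m models/7B/ggml-model.bin --ctx_size 2048 -ngl 32 -lv --host 0.0.0.0 --port 8080
```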
@@ -405,6 +405,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
     fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
+    fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
 #endif
     fprintf(stderr, " -m FNAME, --model FNAME\n");
     fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
@@ -537,6 +538,14 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
         }
 #else
         fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+    }
+    else if (arg == "--low-vram" || arg == "-lv")
+    {
+#ifdef GGML_USE_CUBLAS
+        params.low_vram = true;
+#else
+        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 #endif // GGML_USE_CUBLAS
     }
     else if (arg == "--main-gpu" || arg == "-mg")
ggml-cuda.cu (799 lines changed): file diff suppressed because it is too large.
@@ -28,8 +28,10 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
 
 void ggml_cuda_free_data(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
 void ggml_cuda_set_main_device(int main_device);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
+void ggml_cuda_free_scratch(void);
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 
 #ifdef __cplusplus
@@ -55,6 +55,7 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
 void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
 
 // same as ggml_graph_compute but uses Metal
+// creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 
 #ifdef __cplusplus
ggml-metal.m (917 lines changed)

@@ -284,528 +284,551 @@ void ggml_metal_get_tensor(

void ggml_metal_graph_compute(
        struct ggml_metal_context * ctx,
               struct ggml_cgraph * gf) {
    metal_printf("%s: evaluating graph\n", __func__);

    // create multiple command buffers and enqueue them
    // then, we encode the graph into the command buffers in parallel

    const int n_cb = gf->n_threads;

    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];

    for (int i = 0; i < n_cb; ++i) {
        command_buffers[i] = [ctx->queue commandBuffer];

        // enqueue the command buffers in order to specify their execution order
        [command_buffers[i] enqueue];
    }

    // TODO: is this the best way to start threads?
    dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);

    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
        const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;

        dispatch_async(queue, ^{
            size_t offs_src0 = 0;
            size_t offs_src1 = 0;
            size_t offs_dst  = 0;

            id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];

            id<MTLComputeCommandEncoder> encoder = nil;

            const int node_start = (cb_idx + 0) * n_nodes_per_cb;
            const int node_end   = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;

            for (int i = node_start; i < node_end; ++i) {
                metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));

                struct ggml_tensor * src0 = gf->nodes[i]->src0;
                struct ggml_tensor * src1 = gf->nodes[i]->src1;
                struct ggml_tensor * dst  = gf->nodes[i];

                const int64_t ne00 = src0 ? src0->ne[0] : 0;
                const int64_t ne01 = src0 ? src0->ne[1] : 0;
                const int64_t ne02 = src0 ? src0->ne[2] : 0;
                const int64_t ne03 = src0 ? src0->ne[3] : 0;

                const uint64_t nb00 = src0 ? src0->nb[0] : 0;
                const uint64_t nb01 = src0 ? src0->nb[1] : 0;
                const uint64_t nb02 = src0 ? src0->nb[2] : 0;
                const uint64_t nb03 = src0 ? src0->nb[3] : 0;

                const int64_t ne10 = src1 ? src1->ne[0] : 0;
                const int64_t ne11 = src1 ? src1->ne[1] : 0;
                const int64_t ne12 = src1 ? src1->ne[2] : 0;
                const int64_t ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);

                const uint64_t nb10 = src1 ? src1->nb[0] : 0;
                const uint64_t nb11 = src1 ? src1->nb[1] : 0;
                const uint64_t nb12 = src1 ? src1->nb[2] : 0;
                const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);

                const int64_t ne0 = dst ? dst->ne[0] : 0;
                const int64_t ne1 = dst ? dst->ne[1] : 0;
                const int64_t ne2 = dst ? dst->ne[2] : 0;
                const int64_t ne3 = dst ? dst->ne[3] : 0;

                const uint64_t nb0 = dst ? dst->nb[0] : 0;
                const uint64_t nb1 = dst ? dst->nb[1] : 0;
                const uint64_t nb2 = dst ? dst->nb[2] : 0;
                const uint64_t nb3 = dst ? dst->nb[3] : 0;

                const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
                const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
                const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;

                id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
                id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
                id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(ctx, dst,  &offs_dst)  : nil;

                //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
                //if (src0) {
                //    metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
                //        ggml_is_contiguous(src0), src0->name);
                //}
                //if (src1) {
                //    metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
                //        ggml_is_contiguous(src1), src1->name);
                //}
                //if (dst) {
                //    metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
                //        dst->name);
                //}

                switch (dst->op) {
                    case GGML_OP_RESHAPE:
                    case GGML_OP_VIEW:
                    case GGML_OP_TRANSPOSE:
                    case GGML_OP_PERMUTE:
                        {
                            // noop
                        } break;
                    case GGML_OP_ADD:
                        {
                            if (encoder == nil) {
                                encoder = [command_buffer computeCommandEncoder];
                            }

                            [encoder setComputePipelineState:ctx->pipeline_add];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                            [encoder setBuffer:id_dst offset:offs_dst atIndex:2];

                            const int64_t n = ggml_nelements(dst);

                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
                    case GGML_OP_MUL:
                        {
                            if (encoder == nil) {
                                encoder = [command_buffer computeCommandEncoder];
                            }

                            if (ggml_nelements(src1) == ne10) {
                                // src1 is a row
                                [encoder setComputePipelineState:ctx->pipeline_mul_row];
                            } else {
                                [encoder setComputePipelineState:ctx->pipeline_mul];
                            }
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                            [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];

                            const int64_t n = ggml_nelements(dst);

                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
                    case GGML_OP_SCALE:
                        {
                            if (encoder == nil) {
                                encoder = [command_buffer computeCommandEncoder];
                            }

                            const float scale = *(const float *) src1->data;

                            [encoder setComputePipelineState:ctx->pipeline_scale];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                            [encoder setBytes:&scale length:sizeof(scale) atIndex:2];

                            const int64_t n = ggml_nelements(dst);

                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
                    case GGML_OP_SILU:
                        {
                            if (encoder == nil) {
                                encoder = [command_buffer computeCommandEncoder];
                            }

                            [encoder setComputePipelineState:ctx->pipeline_silu];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst offset:offs_dst atIndex:1];

                            const int64_t n = ggml_nelements(dst);

                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
                    case GGML_OP_RELU:
                        {
                            if (encoder == nil) {
                                encoder = [command_buffer computeCommandEncoder];
                            }

                            [encoder setComputePipelineState:ctx->pipeline_relu];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst offset:offs_dst atIndex:1];

                            const int64_t n = ggml_nelements(dst);

                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
                    case GGML_OP_GELU:
                        {
                            if (encoder == nil) {
                                encoder = [command_buffer computeCommandEncoder];
                            }

                            [encoder setComputePipelineState:ctx->pipeline_gelu];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst offset:offs_dst atIndex:1];

                            const int64_t n = ggml_nelements(dst);

                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
                    case GGML_OP_SOFT_MAX:
                        {
                            if (encoder == nil) {
                                encoder = [command_buffer computeCommandEncoder];
                            }

                            const int nth = 32;

                            [encoder setComputePipelineState:ctx->pipeline_soft_max];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
                            [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];

                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
                    case GGML_OP_DIAG_MASK_INF:
                        {
                            if (encoder == nil) {
                                encoder = [command_buffer computeCommandEncoder];
                            }

                            const int n_past = ((int32_t *)(src1->data))[0];

                            [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                            [encoder setBytes:&n_past length:sizeof(int) atIndex:4];

                            [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
                    case GGML_OP_MUL_MAT:
                        {
                            // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224

                            GGML_ASSERT(ne00 == ne10);
                            GGML_ASSERT(ne02 == ne12);

                            if (ggml_is_contiguous(src0) &&
                                ggml_is_contiguous(src1) &&
                                (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) {

                                if (encoder != nil) {
                                    [encoder endEncoding];
                                    encoder = nil;
                                }

                                MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
                                MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;

                                // for F32 x F32 we use MPS
                                MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor
                                    matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt];

                                MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor
                                    matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt];

                                MPSMatrixDescriptor * desc = [MPSMatrixDescriptor
                                    matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32];

                                MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc]
                                    initWithDevice:ctx->device transposeLeft:false transposeRight:true
                                        resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];

                                // we need to do ne02 multiplications
                                // TODO: is there a way to do this in parallel - currently very slow ..
                                // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
                                for (int64_t i02 = 0; i02 < ne02; ++i02) {
                                    size_t offs_src0_cur = offs_src0 + i02*nb02;
                                    size_t offs_src1_cur = offs_src1 + i02*nb12;
                                    size_t offs_dst_cur  = offs_dst  + i02*nb2;

                                    MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0];
                                    MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1];
                                    MPSMatrix * mat_dst  = [[MPSMatrix alloc] initWithBuffer:id_dst  offset:offs_dst_cur  descriptor:desc ];

                                    [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
                                }
                            } else {
                                if (encoder == nil) {
                                    encoder = [command_buffer computeCommandEncoder];
                                }

                                int nth0 = 32;
                                int nth1 = 1;

                                // use custom matrix x vector kernel
                                switch (src0t) {
                                    case GGML_TYPE_F16:
                                        {
                                            GGML_ASSERT(ne02 == ne12);

                                            nth0 = 64;
                                            nth1 = 1;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
                                        } break;
                                    case GGML_TYPE_Q4_0:
                                        {
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

                                            nth0 = 8;
                                            nth1 = 8;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
                                        } break;
                                    case GGML_TYPE_Q4_1:
                                        {
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

                                            nth0 = 8;
                                            nth1 = 8;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
                                        } break;
                                    case GGML_TYPE_Q2_K:
                                        {
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

                                            nth0 = 4;
                                            nth1 = 16;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
                                        } break;
                                    case GGML_TYPE_Q3_K:
                                        {
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

                                            nth0 = 4;
                                            nth1 = 16;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
                                        } break;
                                    case GGML_TYPE_Q4_K:
                                        {
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

                                            nth0 = 4;
                                            nth1 = 16;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
                                        } break;
                                    case GGML_TYPE_Q5_K:
                                        {
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

                                            nth0 = 4;
                                            nth1 = 16;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
                                        } break;
                                    case GGML_TYPE_Q6_K:
                                        {
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

                                            nth0 = 4;
                                            nth1 = 16;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
                                        } break;
                                    default:
                                        {
                                            fprintf(stderr, "Asserting on type %d\n",(int)src0t);
                                            GGML_ASSERT(false && "not implemented");
                                        }
                                };

                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                                [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
                                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
                                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
                                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
                                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
                                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
                                [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
                                [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];

                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
                                    [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
                                else if (src0t == GGML_TYPE_Q2_K ||
                                         src0t == GGML_TYPE_Q3_K ||
                                         src0t == GGML_TYPE_Q4_K ||
                                         src0t == GGML_TYPE_Q5_K ||
                                         src0t == GGML_TYPE_Q6_K) {
                                    [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                } else {
                                    [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
                            }
                        } break;
                    case GGML_OP_GET_ROWS:
                        {
                            if (encoder == nil) {
                                encoder = [command_buffer computeCommandEncoder];
                            }

                            switch (src0->type) {
                                case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
                                case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
                                case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
                                case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
                                case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
                                case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
                                case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
                                case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
                                default: GGML_ASSERT(false && "not implemented");
                            }

                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                            [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
                            [encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3];
                            [encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4];
                            [encoder setBytes:&(dst->nb[1]) length:sizeof(uint64_t) atIndex:5];

                            const int64_t n = ggml_nelements(src1);

                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
                    case GGML_OP_RMS_NORM:
                        {
                            if (encoder == nil) {
                                encoder = [command_buffer computeCommandEncoder];
                            }

                            const float eps = 1e-6f;

                            const int nth = 256;

                            [encoder setComputePipelineState:ctx->pipeline_rms_norm];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
                            [encoder setBytes:&eps length:sizeof( float) atIndex:4];
                            [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];

                            const int64_t nrows = ggml_nrows(src0);

                            [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
                    case GGML_OP_ROPE:
                        {
                            if (encoder == nil) {
                                encoder = [command_buffer computeCommandEncoder];
                            }

                            const int n_dims = ((int32_t *) src1->data)[1];
                            const int mode = ((int32_t *) src1->data)[2];

                            const int n_past = ((int32_t *)(src1->data))[0];

                            [encoder setComputePipelineState:ctx->pipeline_rope];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                            [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
                            [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
                            [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
                            [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
                            [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
                            [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
                            [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
                            [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
                            [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
                            [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
                            [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
                            [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
                            [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
                            [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
                            [encoder setBytes:&n_past length:sizeof( int) atIndex:18];
                            [encoder setBytes:&n_dims length:sizeof( int) atIndex:19];
                            [encoder setBytes:&mode length:sizeof( int) atIndex:20];

                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
                    case GGML_OP_CPY:
                        {
                            if (encoder == nil) {
                                encoder = [command_buffer computeCommandEncoder];
                            }

                            const int nth = 32;

                            switch (src0t) {
                                case GGML_TYPE_F32:
                                    {
                                        switch (dstt) {
                                            case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break;
                                            case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break;
                                            default: GGML_ASSERT(false && "not implemented");
                                        };
                                    } break;
                                default: GGML_ASSERT(false && "not implemented");
                            }

                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                            [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
                            [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
                            [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
                            [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
                            [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
                            [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
                            [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
                            [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
                            [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
                            [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
                            [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
                            [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
                            [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
                            [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
||||||
|
|
||||||
|
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||||
|
} break;
|
||||||
|
default:
|
||||||
|
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
            if (encoder != nil) {
                [encoder endEncoding];
                encoder = nil;
            }

            [command_buffer commit];
        });
    }

-    [command_buffer waitUntilCompleted];
-
-    {
-        const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime];
-        UNUSED(time_elapsed);
-
-        metal_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0);
-    }
+    // wait for all threads to finish
+    dispatch_barrier_sync(queue, ^{});
+
+    [command_buffers[n_cb - 1] waitUntilCompleted];
 }
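For readers new to these kernels: GGML_OP_GET_ROWS is a plain row gather. For every token id in src1 it pulls the corresponding row of src0 into dst, dequantizing on the fly for the quantized types, and it is dispatched with one threadgroup per index (MTLSizeMake(n, 1, 1) above). A rough CPU-side sketch of the same operation for f32 rows, illustrative only (the names are made up and quantization is ignored):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // dst row i <- src0 row ids[i]; ne00 is the row length (the &(src0->ne[0]) kernel argument).
    static void get_rows_f32_ref(const std::vector<float>   & src0, // ne00 * n_vocab values
                                 const std::vector<int32_t> & ids,  // token ids (src1)
                                 std::vector<float>         & dst,
                                 int64_t                      ne00) {
        dst.resize(ids.size() * (size_t) ne00);
        for (size_t i = 0; i < ids.size(); ++i) {
            // copy one full row per index
            std::memcpy(dst.data() + i*ne00, src0.data() + ids[i]*ne00, ne00*sizeof(float));
        }
    }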
ggml.c (6)

@@ -3939,6 +3939,12 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
+bool ggml_is_permuted(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
+}
+
 static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
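The new ggml_is_permuted() helper only inspects the byte strides: a freshly created tensor has nb[0] <= nb[1] <= nb[2] <= nb[3], and any view produced by ggml_permute() or ggml_transpose() breaks that ordering. A minimal usage sketch (scratch context size and shapes are arbitrary):

    #include "ggml.h"
    #include <cstdio>

    int main() {
        struct ggml_init_params iparams = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(iparams);

        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 8); // contiguous
        struct ggml_tensor * t = ggml_transpose(ctx, a);                       // swaps nb[0] and nb[1]

        printf("a permuted: %d, t permuted: %d\n", ggml_is_permuted(a), ggml_is_permuted(t)); // 0, 1

        ggml_free(ctx);
        return 0;
    }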
ggml.h (1)

@@ -485,6 +485,7 @@ extern "C" {
 
     GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
 
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
llama.cpp (159)

@@ -165,6 +165,11 @@ struct llama_kv_cache {
         if (ctx) {
             ggml_free(ctx);
         }
+
+#ifdef GGML_USE_CUBLAS
+        ggml_cuda_free_data(k);
+        ggml_cuda_free_data(v);
+#endif // GGML_USE_CUBLAS
     }
 };

@@ -210,6 +215,7 @@ struct llama_model {
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cuda_free_data(tensors_by_name[i].second);
         }
+        ggml_cuda_free_scratch();
 #elif defined(GGML_USE_CLBLAST)
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cl_free_data(tensors_by_name[i].second);

@@ -867,7 +873,8 @@ static bool kv_cache_init(
         const struct llama_hparams & hparams,
              struct llama_kv_cache & cache,
                          ggml_type   wtype,
-                               int   n_ctx) {
+                               int   n_ctx,
+                               int   n_gpu_layers) {
     const int n_embd  = hparams.n_embd;
     const int n_layer = hparams.n_layer;

@@ -893,6 +900,15 @@ static bool kv_cache_init(
     ggml_set_name(cache.k, "cache_k");
     ggml_set_name(cache.v, "cache_v");
 
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer + 1) {
+        ggml_cuda_assign_buffers_no_scratch(cache.v);
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        ggml_cuda_assign_buffers_no_scratch(cache.k);
+    }
+#endif // GGML_USE_CUBLAS
+
     return true;
 }
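The thresholds in kv_cache_init read most easily as a small table: the first n_layer values of n_gpu_layers offload the repeating transformer layers, and each increment past that pulls one more non-repeating piece into VRAM (the K cache last). A hedged summary of the rule; the helper below is purely illustrative and not part of the patch:

    #include <algorithm>
    #include <cstdio>

    // n_gpu_layers == n_layer     : all repeating layers on the GPU
    // n_gpu_layers == n_layer + 1 : also the non-repeating tensors (norm/output, see the hunks below)
    // n_gpu_layers == n_layer + 2 : also the V cache (ggml_cuda_assign_buffers_no_scratch(cache.v))
    // n_gpu_layers == n_layer + 3 : also the K cache (ggml_cuda_assign_buffers_no_scratch(cache.k))
    static void print_offload_plan(int n_layer, int n_gpu_layers) {
        printf("repeating layers on GPU : %d/%d\n", std::min(n_gpu_layers, n_layer), n_layer);
        printf("norm/output on GPU      : %s\n", n_gpu_layers > n_layer     ? "yes" : "no");
        printf("V cache on GPU          : %s\n", n_gpu_layers > n_layer + 1 ? "yes" : "no");
        printf("K cache on GPU          : %s\n", n_gpu_layers > n_layer + 2 ? "yes" : "no");
    }

With the low VRAM option, the reporting code further down treats the two cache steps as not offloadable.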
@@ -903,6 +919,7 @@ struct llama_context_params llama_context_default_params() {
         /*.gpu_layers   =*/ 0,
         /*.main_gpu     =*/ 0,
         /*.tensor_split =*/ {0},
+        /*.low_vram     =*/ false,
         /*.seed         =*/ -1,
         /*.f16_kv       =*/ true,
         /*.logits_all   =*/ false,

@@ -1011,6 +1028,7 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,

@@ -1137,18 +1155,34 @@ static void llama_model_load_internal(
     ml->ggml_ctx = ctx;
 
     model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
-    model.norm           = ml->get_tensor("norm.weight",           {n_embd},          GGML_BACKEND_CPU);
 
     // "output" tensor
     {
+        ggml_backend backend_norm;
         ggml_backend backend_output;
         if (n_gpu_layers > int(n_layer)) { // NOLINT
+            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+            // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+            backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+            backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
             backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
         } else {
+            backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
         }
 
+        model.norm   = ml->get_tensor("norm.weight",   {n_embd},          backend_norm);
         model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+        if (backend_norm == GGML_BACKEND_GPU) {
+            vram_weights += ggml_nbytes(model.norm);
+        }
+        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+            vram_weights += ggml_nbytes(model.output);
+        }
     }
 
     const int i_gpu_start = n_layer - n_gpu_layers;

@@ -1208,22 +1242,47 @@ static void llama_model_load_internal(
     (void) vram_scratch;
     (void) n_batch;
 #ifdef GGML_USE_CUBLAS
-    vram_scratch = n_batch * MB;
-    ggml_cuda_set_scratch_size(vram_scratch);
-    if (n_gpu_layers > 0) {
-        fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
-                __func__, vram_scratch / MB);
+    if (low_vram) {
+        fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
+        ggml_cuda_set_scratch_size(0); // disable scratch
+    } else {
+        vram_scratch = n_batch * MB;
+        ggml_cuda_set_scratch_size(vram_scratch);
+        if (n_gpu_layers > 0) {
+            fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+                    __func__, vram_scratch / MB);
+        }
     }
 #endif // GGML_USE_CUBLAS
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
-    fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
+    fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
     if (n_gpu_layers > (int) hparams.n_layer) {
-        fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
+        fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
     }
+    size_t vram_kv_cache = 0;
+    if (n_gpu_layers > (int) hparams.n_layer + 1) {
+        if (low_vram) {
+            fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+        } else {
+            fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+            vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+        }
+    }
+    if (n_gpu_layers > (int) hparams.n_layer + 2) {
+        if (low_vram) {
+            fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+        } else {
+            fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+            vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+        }
+    }
+    const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+    fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
+            __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
     fprintf(stderr, "%s: total VRAM used: %zu MB\n",
-            __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+            __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
     (void) n_gpu_layers;
 #endif
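To put numbers on the reporting above: without the low VRAM option the scratch buffer costs n_batch x 1 MB, and each offloaded cache adds half of the model's KV self-attention requirement. A back-of-the-envelope sketch mirroring that arithmetic; kv_self_bytes stands in for MEM_REQ_KV_SELF().at(model.type) and all values are illustrative, not measured:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t MB            = 1024*1024;
        const size_t vram_weights  = 3900u*MB;  // weight bytes kept in VRAM, illustrative
        const size_t kv_self_bytes = 1024u*MB;  // stand-in for MEM_REQ_KV_SELF().at(model.type)
        const int    n_batch       = 512;
        const int    n_layer       = 32;
        const int    n_gpu_layers  = 35;        // layers + norm/output + V cache + K cache
        const bool   low_vram      = false;

        size_t vram_scratch  = low_vram ? 0 : (size_t) n_batch * MB;
        size_t vram_kv_cache = 0;
        if (!low_vram && n_gpu_layers > n_layer + 1) vram_kv_cache += kv_self_bytes / 2; // V cache
        if (!low_vram && n_gpu_layers > n_layer + 2) vram_kv_cache += kv_self_bytes / 2; // K cache

        // same rounded-up total that llama_model_load_internal prints
        printf("total VRAM used: %zu MB\n", (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB);
        return 0;
    }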
int n_gpu_layers,
|
int n_gpu_layers,
|
||||||
int main_gpu,
|
int main_gpu,
|
||||||
float * tensor_split,
|
float * tensor_split,
|
||||||
|
bool low_vram,
|
||||||
ggml_type memory_type,
|
ggml_type memory_type,
|
||||||
bool use_mmap,
|
bool use_mmap,
|
||||||
bool use_mlock,
|
bool use_mlock,
|
||||||
|
@ -1269,7 +1329,7 @@ static bool llama_model_load(
|
||||||
llama_progress_callback progress_callback,
|
llama_progress_callback progress_callback,
|
||||||
void *progress_callback_user_data) {
|
void *progress_callback_user_data) {
|
||||||
try {
|
try {
|
||||||
llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
|
llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
|
||||||
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
||||||
return true;
|
return true;
|
||||||
} catch (const std::exception & err) {
|
} catch (const std::exception & err) {
|
||||||
|
@ -1345,12 +1405,33 @@ static bool llama_eval_internal(
|
||||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||||
(void) i_gpu_start;
|
(void) i_gpu_start;
|
||||||
|
|
||||||
|
// offload functions set the tensor output backend to GPU
|
||||||
|
// tensors are GPU-accelerated if any input or the output has been offloaded
|
||||||
|
//
|
||||||
|
// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
|
||||||
|
// in that case ggml_cuda_assign_buffers has no effect
|
||||||
|
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
||||||
|
offload_func_t offload_func_kq = llama_nop;
|
||||||
|
offload_func_t offload_func_v = llama_nop;
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CUBLAS
|
||||||
|
if (n_gpu_layers > n_layer) {
|
||||||
|
offload_func_nr = ggml_cuda_assign_buffers;
|
||||||
|
}
|
||||||
|
if (n_gpu_layers > n_layer + 1) {
|
||||||
|
offload_func_v = ggml_cuda_assign_buffers;
|
||||||
|
}
|
||||||
|
if (n_gpu_layers > n_layer + 2) {
|
||||||
|
offload_func_kq = ggml_cuda_assign_buffers;
|
||||||
|
}
|
||||||
|
#endif // GGML_USE_CUBLAS
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
offload_func_t offload_func = llama_nop;
|
offload_func_t offload_func = llama_nop;
|
||||||
|
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUBLAS
|
||||||
if (il >= i_gpu_start) {
|
if (il >= i_gpu_start) {
|
||||||
offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
|
offload_func = ggml_cuda_assign_buffers;
|
||||||
}
|
}
|
||||||
#endif // GGML_USE_CUBLAS
|
#endif // GGML_USE_CUBLAS
|
||||||
|
|
||||||
|
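The three new offload_func_* variables reuse the same pattern as the per-layer offload_func above: each is just a function pointer that is either a no-op or ggml_cuda_assign_buffers, so a call such as offload_func_kq(tmpk) marks that tensor's output for the CUDA backend only when the corresponding n_gpu_layers threshold was met. Roughly, paraphrasing the pattern already used in llama.cpp (treat the exact definitions as an assumption):

    // The offload hooks are plain function pointers over ggml tensors.
    typedef void (*offload_func_t)(struct ggml_tensor * tensor);

    // Default hook: do nothing, the tensor stays on the CPU backend.
    static void llama_nop(struct ggml_tensor * tensor) {
        (void) tensor;
    }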
@@ -1373,31 +1454,42 @@ static bool llama_eval_internal(
         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            // offload_func(tmpq);
-            ggml_set_name(tmpq, "tmpq");
-
             struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-            // offload_func(tmpk);
+            offload_func_kq(tmpk);
             ggml_set_name(tmpk, "tmpk");
 
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            offload_func_kq(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
             struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
 
             struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
 
             // store key and value to memory
             {
                 // compute the transposed [N, n_embd] V matrix
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
+                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                offload_func_v(tmpv);
+                ggml_set_name(tmpv, "tmpv");
+
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
+                offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");
 
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                offload_func_kq(k);
                 ggml_set_name(k, "k");
 
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                         (   n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                offload_func_v(v);
                 ggml_set_name(v, "v");
 
                 // important: storing RoPE-ed version of K in the KV cache!

@@ -1409,6 +1501,7 @@ static bool llama_eval_internal(
                     ggml_permute(ctx0,
                             Qcur,
                             0, 2, 1, 3);
+            offload_func_kq(Q);
             ggml_set_name(Q, "Q");
 
             struct ggml_tensor * K =

@@ -1417,10 +1510,12 @@ static bool llama_eval_internal(
                         ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
                         n_embd/n_head, n_head, n_past + N),
                     0, 2, 1, 3);
+            offload_func_kq(K);
             ggml_set_name(K, "K");
 
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
             ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd/n_head)

@@ -1429,14 +1524,17 @@ static bool llama_eval_internal(
 
             // KQ_scaled shape [n_past + N, N, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            offload_func_kq(KQ_masked);
             ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
             // split cached V into n_head heads

@@ -1446,10 +1544,12 @@ static bool llama_eval_internal(
                     n_ctx*ggml_element_size(kv_self.v),
                     n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
                     il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+            offload_func_v(V);
             ggml_set_name(V, "V");
 
 #if 1
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
             ggml_set_name(KQV, "KQV");
 #else
             // make V contiguous in memory to speed up the matmul, however we waste time on the copy

@@ -1461,12 +1561,14 @@ static bool llama_eval_internal(
 
             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");
 
             // cur = KQV_merged.contiguous().view(n_embd, N)
             cur = ggml_cpy(ctx0,
                     KQV_merged,
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            offload_func_v(cur);
             ggml_set_name(cur, "KQV_merged_contiguous");
 
             // projection (no bias)

@@ -1478,7 +1580,6 @@ static bool llama_eval_internal(
         }
 
         lctx.use_buf(ctx0, 1);
-        //ggml_cuda_set_scratch(1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
         offload_func(inpFF);

@@ -1536,32 +1637,24 @@ static bool llama_eval_internal(
     }
 
     lctx.use_buf(ctx0, 0);
-    //ggml_cuda_set_scratch(0);
 
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
-    offload_func_t offload_func = llama_nop;
-
-#ifdef GGML_USE_CUBLAS
-    if (n_gpu_layers > n_layer) {
-        offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
-    }
-#endif // GGML_USE_CUBLAS
 
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL);
-        offload_func(cur);
+        offload_func_nr(cur);
         ggml_set_name(cur, "rms_norm_inpL");
 
         cur = ggml_rms_norm(ctx0, cur);
-        offload_func(cur);
+        offload_func_nr(cur);
         ggml_set_name(cur, "rms_norm_after");
 
         // cur = cur*norm(broadcasted)
         cur = ggml_mul(ctx0, cur, model.norm);
-        offload_func(cur);
+        offload_func_nr(cur);
         ggml_set_name(cur, "result_norm");
 
         embeddings = cur;

@@ -2552,8 +2645,8 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
+                params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
                 params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
         llama_free(ctx);

@@ -2562,7 +2655,7 @@ struct llama_context * llama_init_from_file(
 
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
+        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
llama.h (1)

@@ -77,6 +77,7 @@ extern "C" {
         int n_gpu_layers;                      // number of layers to store in VRAM
         int main_gpu;                          // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+        bool low_vram;                         // if true, reduce VRAM usage at the cost of performance
         int seed;                              // RNG seed, -1 for random
 
         bool f16_kv;                           // use fp16 for KV cache
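With the header change in place, opting into the new behaviour from client code is a single field on llama_context_params. A minimal usage sketch; the model path and layer count are placeholders, not values prescribed by the patch:

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_context_params params = llama_context_default_params();
        params.n_gpu_layers = 32;   // e.g. all repeating layers of a 32-layer model
        params.low_vram     = true; // new: skip the VRAM scratch buffer, keep the KV cache in system RAM

        llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // ... tokenize, evaluate and sample as usual ...

        llama_free(ctx);
        return 0;
    }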