style : minor fixes, mostly indentations

commit d0b6c942fc
parent da730c53bf

4 changed files with 65 additions and 53 deletions
ggml-metal.m (43 changed lines)

@@ -885,32 +885,33 @@ void ggml_metal_graph_compute(
             const int n_past = ((int32_t *)(src1->data))[0];

-            float freq_base, freq_scale;
+            float freq_base;
+            float freq_scale;
             memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
             memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));

             [encoder setComputePipelineState:ctx->pipeline_rope];
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
             [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
             [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
             [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
             [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
             [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
             [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
             [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
             [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
             [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
             [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
             [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
             [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
             [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
             [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
             [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
             [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
             [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
             [encoder setBytes:&n_past length:sizeof( int) atIndex:18];
             [encoder setBytes:&n_dims length:sizeof( int) atIndex:19];
             [encoder setBytes:&mode length:sizeof( int) atIndex:20];
             [encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
             [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];

ggml.c (16 changed lines)

@@ -6975,7 +6975,7 @@ struct ggml_tensor * ggml_rope_impl(
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
     ((int32_t *) b->data)[3] = n_ctx;
     memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float));
     memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));

     ggml_scratch_load(ctx);

@@ -12084,12 +12084,14 @@ static void ggml_compute_forward_rope_f32(
         return;
     }

+    float freq_base;
+    float freq_scale;
+
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode = ((int32_t *) src1->data)[2];
     const int n_ctx = ((int32_t *) src1->data)[3];
-    float freq_base, freq_scale;
     memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
     memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));

     assert(n_past >= 0);

@@ -12214,12 +12216,14 @@ static void ggml_compute_forward_rope_f16(
         return;
     }

+    float freq_base;
+    float freq_scale;
+
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode = ((int32_t *) src1->data)[2];
     const int n_ctx = ((int32_t *) src1->data)[3];
-    float freq_base, freq_scale;
     memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
     memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));

     assert(n_past >= 0);

@@ -12322,7 +12326,7 @@ static void ggml_compute_forward_rope_f16(
             const float x0 = GGML_FP16_TO_FP32(src[0]);
             const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);

             dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
             dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
         }
     }

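For context on the lines these hunks touch: the rope operator passes its parameters through a small int32 buffer, writing the integer arguments by index and the two floats with memcpy at offsets 4 and 5, which is exactly how the compute functions read them back. A minimal standalone sketch of that pack/unpack pattern, in plain C; the local names (params, fb, fs) and the literal values are illustrative only, not taken from the ggml source:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
        // stand-in for the rope op's parameter tensor: 4 int32 values + 2 floats
        int32_t params[6] = {0};

        // pack (mirrors what ggml_rope_impl writes into b->data)
        const float freq_base = 10000.0f, freq_scale = 1.0f;
        params[0] = 5;   // n_past
        params[1] = 64;  // n_dims
        params[2] = 0;   // mode
        params[3] = 512; // n_ctx
        memcpy(params + 4, &freq_base, sizeof(float));
        memcpy(params + 5, &freq_scale, sizeof(float));

        // unpack (mirrors what ggml_compute_forward_rope_* reads from src1->data)
        float fb, fs;
        memcpy(&fb, params + 4, sizeof(float));
        memcpy(&fs, params + 5, sizeof(float));
        assert(fb == 10000.0f && fs == 1.0f);
        return 0;
    }

The memcpy round trip relies on float and int32_t having the same size, which is the same assumption the ggml code above makes.
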
llama.cpp (52 changed lines)

@@ -190,8 +190,10 @@ struct llama_hparams {
     uint32_t n_head = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot = 64;
+
     float rope_freq_base = 10000.0f;
     float rope_freq_scale = 1.0f;
+
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

     bool operator!=(const llama_hparams & other) const {

@@ -843,12 +845,12 @@ struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
-        /*.rope_freq_base =*/ 10000.0f,
-        /*.rope_freq_scale =*/ 1.0f,
         /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ {0},
+        /*.rope_freq_base =*/ 10000.0f,
+        /*.rope_freq_scale =*/ 1.0f,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram =*/ false,

@@ -968,12 +970,12 @@ static void llama_model_load_internal(
         llama_model & model,
         llama_vocab & vocab,
         int n_ctx,
-        float rope_freq_base,
-        float rope_freq_scale,
         int n_batch,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,

@@ -1008,26 +1010,27 @@ static void llama_model_load_internal(
         }

         hparams.n_ctx = n_ctx;
-        hparams.rope_freq_base = rope_freq_base;
+
+        hparams.rope_freq_base = rope_freq_base;
         hparams.rope_freq_scale = rope_freq_scale;
     }

     const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

     {
         fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
         fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
         fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
         fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
         fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
         fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
         fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
         fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
         fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
         fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
         fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }

     if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {

@@ -1278,12 +1281,12 @@ static bool llama_model_load(
         llama_model & model,
         llama_vocab & vocab,
         int n_ctx,
-        float rope_freq_base,
-        float rope_freq_scale,
         int n_batch,
         int n_gpu_layers,
         int main_gpu,
         float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,

@@ -1292,7 +1295,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, rope_freq_base, rope_freq_scale, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {

@@ -1342,9 +1345,10 @@ static bool llama_eval_internal(
     const int n_head = hparams.n_head;
     const int n_vocab = hparams.n_vocab;
     const int n_rot = hparams.n_embd/hparams.n_head;
+    const int n_gpu_layers = model.n_gpu_layers;
+
     const float freq_base = hparams.rope_freq_base;
     const float freq_scale = hparams.rope_freq_scale;
-    const int n_gpu_layers = model.n_gpu_layers;

     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute = lctx.buf_compute;

@@ -2689,9 +2693,9 @@ struct llama_model * llama_load_model_from_file(

     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.rope_freq_base, params.rope_freq_scale,
-            params.n_batch, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.low_vram, memory_type,
-            params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+            params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale, params.low_vram,
+            memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
             params.progress_callback_user_data)) {
         delete model;
         fprintf(stderr, "%s: failed to load model\n", __func__);

llama.h (7 changed lines)

@@ -85,12 +85,15 @@ extern "C" {
     struct llama_context_params {
         uint32_t seed; // RNG seed, -1 for random
         int32_t n_ctx; // text context
-        float rope_freq_base; // RoPE base frequency
-        float rope_freq_scale; // RoPE frequency scaling factor
         int32_t n_batch; // prompt processing batch size
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu; // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base; // RoPE base frequency
+        float rope_freq_scale; // RoPE frequency scaling factor
+
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
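From the API side, this hunk only moves where rope_freq_base and rope_freq_scale sit inside llama_context_params; callers that set the fields by name are unaffected. A minimal usage sketch, assuming the public llama.cpp API at this commit; the concrete values are made up for illustration:

    struct llama_context_params params = llama_context_default_params();
    params.n_ctx           = 2048;
    params.rope_freq_base  = 10000.0f; // default RoPE base frequency
    params.rope_freq_scale = 0.5f;     // e.g. linear RoPE scaling for a longer context

Code that builds the struct with positional aggregate initialization would need updating after the reorder, which is why llama_context_default_params() and the llama_model_load call chain are reordered in the same commit.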