Merge branch 'master' into concedo_experimental

# Conflicts:
#	Makefile
#	tests/test-grad0.c

commit 3e68cdd26a — 13 changed files with 867 additions and 502 deletions
@@ -8,6 +8,8 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+static const float rms_norm_eps = 1e-6f;
+
 float frand() {
     return (float)rand()/(float)RAND_MAX;
 }
@@ -562,7 +564,7 @@ struct ggml_tensor * forward(
         // norm
         {
             // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
 
             // cur = attention_norm*cur
             cur = ggml_mul(ctx0,
@@ -685,7 +687,7 @@ struct ggml_tensor * forward(
         // norm
         {
             // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpFF);
+            cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
 
             // cur = ffn_norm*cur
             // cur shape [n_embd,N,1,1]
@@ -729,7 +731,7 @@ struct ggml_tensor * forward(
     {
 
         // inpL shape [n_embd,N,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
 
         // inpL = norm*inpL
         // inpL shape [n_embd,N,1,1]
@@ -817,7 +819,7 @@ struct ggml_tensor * forward_batch(
         // norm
         {
             // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
             assert_shape_2d(cur, n_embd, N*n_batch);
 
             // cur = attention_norm*cur
@@ -981,7 +983,7 @@ struct ggml_tensor * forward_batch(
         // norm
         {
             // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpFF);
+            cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
             assert_shape_2d(cur, n_embd, N*n_batch);
 
             // cur = ffn_norm*cur
@@ -1034,7 +1036,7 @@ struct ggml_tensor * forward_batch(
     {
 
         // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
         assert_shape_2d(inpL, n_embd, N*n_batch);
 
         // inpL = norm*inpL
@@ -1104,7 +1106,7 @@ struct ggml_tensor * forward_lora(
         // norm
         {
             // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
 
             // cur = attention_norm*cur
             cur = ggml_mul(ctx0,
@@ -1251,7 +1253,7 @@ struct ggml_tensor * forward_lora(
         // norm
         {
             // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpFF);
+            cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
 
             // cur = ffn_norm*cur
             // cur shape [n_embd,N,1,1]
@@ -1295,7 +1297,7 @@ struct ggml_tensor * forward_lora(
     {
 
         // inpL shape [n_embd,N,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
 
         // inpL = norm*inpL
         // inpL shape [n_embd,N,1,1]
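All of these call sites change the same way: the epsilon is now an explicit argument of ggml_rms_norm rather than a constant baked into the op. A minimal sketch of the new call pattern (assuming an existing ggml context ctx0 and input tensor inpL, as in the functions above):

    // file-local default, as added at the top of this file
    static const float rms_norm_eps = 1e-6f;

    // build an RMS-norm graph node with an explicit epsilon
    struct ggml_tensor * cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);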
@@ -177,6 +177,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_gqa = std::stoi(argv[i]);
+        } else if (arg == "-eps" || arg == "--rms-norm-eps") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rms_norm_eps = std::stof(argv[i]);
         } else if (arg == "--rope-freq-base") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -519,6 +525,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "  -c N, --ctx-size N        size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stdout, "  -b N, --batch-size N      batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stdout, "  -gqa N, --gqa N           grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
+    fprintf(stdout, "  -eps N, --rms-norm-eps N  rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
     fprintf(stdout, "  --top-k N                 top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
     fprintf(stdout, "  --top-p N                 top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
     fprintf(stdout, "  --tfs N                   tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
@@ -615,6 +622,7 @@ struct llama_context_params llama_context_params_from_gpt_param
     lparams.n_ctx        = params.n_ctx;
     lparams.n_batch      = params.n_batch;
     lparams.n_gqa        = params.n_gqa;
+    lparams.rms_norm_eps = params.rms_norm_eps;
     lparams.n_gpu_layers = params.n_gpu_layers;
     lparams.main_gpu     = params.main_gpu;
     lparams.tensor_split = params.tensor_split;
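With the flag parsed into params.rms_norm_eps, the LLaMA-v2 epsilon can be supplied at the command line; a hypothetical invocation (binary name and model path are placeholders):

    ./main -m models/llama-2-70b.ggmlv3.bin -gqa 8 -eps 1e-5 -p "Hello"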
@@ -22,18 +22,19 @@
 int32_t get_num_physical_cores();
 
 struct gpt_params {
     uint32_t seed         = -1;   // RNG seed
     int32_t  n_threads    = get_num_physical_cores();
     int32_t  n_predict    = -1;   // new tokens to predict
     int32_t  n_ctx        = 512;  // context size
     int32_t  n_batch      = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t  n_gqa        = 1;    // grouped-query attention factor (TODO: move to hparams)
     int32_t  n_keep       = 0;    // number of tokens to keep from initial prompt
     int32_t  n_chunks     = -1;   // max number of chunks to process (-1 = unlimited)
     int32_t  n_gpu_layers = 0;    // number of layers to store in VRAM
     int32_t  main_gpu     = 0;    // the GPU that is used for scratch and small tensors
     float    tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
     int32_t  n_probs      = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
+    float    rms_norm_eps = 1e-6; // rms norm epsilon
     float    rope_freq_base  = 10000.0f; // RoPE base frequency
     float    rope_freq_scale = 1.0f;     // RoPE frequency scaling factor
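Combined with the parsing hunk above, the value travels from the CLI to the llama context parameters; a short sketch of that flow, using only names that appear in this diff:

    gpt_params params;            // rms_norm_eps defaults to 1e-6
    params.rms_norm_eps = 1e-5f;  // the LLaMA-v2 value suggested by the help text

    // copies params.rms_norm_eps into lparams.rms_norm_eps (see the earlier hunk)
    llama_context_params lparams = llama_context_params_from_gpt_params(params);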
(one file diff suppressed because it is too large)
@@ -73,6 +73,37 @@
       margin: 0;
     }
 
+    fieldset.two {
+      display: grid;
+      grid-template: "a a";
+      gap: 1em;
+    }
+
+    fieldset.three {
+      display: grid;
+      grid-template: "a a a";
+      gap: 1em;
+    }
+
+    details {
+      border: 1px solid #aaa;
+      border-radius: 4px;
+      padding: 0.5em 0.5em 0;
+      margin-top: 0.5em;
+    }
+
+    summary {
+      font-weight: bold;
+      margin: -0.5em -0.5em 0;
+      padding: 0.5em;
+      cursor: pointer;
+    }
+
+    details[open] {
+      padding: 0.5em;
+    }
+
     textarea {
       padding: 5px;
       flex-grow: 1;
@@ -125,10 +156,17 @@
     const params = signal({
       n_predict: 400,
       temperature: 0.7,
-      repeat_last_n: 256,
-      repeat_penalty: 1.18,
-      top_k: 40,
-      top_p: 0.5,
+      repeat_last_n: 256, // 0 = disable penalty, -1 = context size
+      repeat_penalty: 1.18, // 1.0 = disabled
+      top_k: 40, // <= 0 to use vocab size
+      top_p: 0.5, // 1.0 = disabled
+      tfs_z: 1.0, // 1.0 = disabled
+      typical_p: 1.0, // 1.0 = disabled
+      presence_penalty: 0.0, // 0.0 = disabled
+      frequency_penalty: 0.0, // 0.0 = disabled
+      mirostat: 0, // 0/1/2
+      mirostat_tau: 5, // target entropy
+      mirostat_eta: 0.1, // learning rate
     })
 
     const llamaStats = signal(null)
@@ -264,6 +302,27 @@
     const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
     const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
     const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
+    const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }
+
+    const FloatField = ({label, max, min, name, step, value}) => {
+      return html`
+        <div>
+          <label for="${name}">${label}</label>
+          <input type="range" id="${name}" min="${min}" max="${max}" step="${step}" name="${name}" value="${value}" oninput=${updateParamsFloat} />
+          <span>${value}</span>
+        </div>
+      `
+    };
+
+    const IntField = ({label, max, min, name, value}) => {
+      return html`
+        <div>
+          <label for="${name}">${label}</label>
+          <input type="range" id="${name}" min="${min}" max="${max}" name="${name}" value="${value}" oninput=${updateParamsInt} />
+          <span>${value}</span>
+        </div>
+      `
+    };
+
     return html`
       <form>
@@ -272,7 +331,9 @@
           <label for="prompt">Prompt</label>
           <textarea type="text" name="prompt" value="${session.value.prompt}" rows=4 oninput=${updateSession}/>
         </div>
+      </fieldset>
 
+      <fieldset class="two">
         <div>
           <label for="user">User name</label>
           <input type="text" name="user" value="${session.value.user}" oninput=${updateSession} />
@@ -282,7 +343,9 @@
           <label for="bot">Bot name</label>
           <input type="text" name="char" value="${session.value.char}" oninput=${updateSession} />
         </div>
+      </fieldset>
 
+      <fieldset>
         <div>
           <label for="template">Prompt template</label>
           <textarea id="template" name="template" value="${session.value.template}" rows=4 oninput=${updateSession}/>
@@ -292,38 +355,44 @@
           <label for="template">Chat history template</label>
           <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
         </div>
-
-        <div>
-          <label for="temperature">Temperature</label>
-          <input type="range" id="temperature" min="0.0" max="1.0" step="0.01" name="temperature" value="${params.value.temperature}" oninput=${updateParamsFloat} />
-          <span>${params.value.temperature}</span>
-        </div>
-
-        <div>
-          <label for="nPredict">Predictions</label>
-          <input type="range" id="nPredict" min="1" max="2048" step="1" name="n_predict" value="${params.value.n_predict}" oninput=${updateParamsFloat} />
-          <span>${params.value.n_predict}</span>
-        </div>
-
-        <div>
-          <label for="repeat_penalty">Penalize repeat sequence</label>
-          <input type="range" id="repeat_penalty" min="0.0" max="2.0" step="0.01" name="repeat_penalty" value="${params.value.repeat_penalty}" oninput=${updateParamsFloat} />
-          <span>${params.value.repeat_penalty}</span>
-        </div>
-
-        <div>
-          <label for="repeat_last_n">Consider N tokens for penalize</label>
-          <input type="range" id="repeat_last_n" min="0.0" max="2048" name="repeat_last_n" value="${params.value.repeat_last_n}" oninput=${updateParamsFloat} />
-          <span>${params.value.repeat_last_n}</span>
-        </div>
-
       </fieldset>
 
+      <fieldset class="two">
+        ${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
+        ${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
+        ${FloatField({label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty})}
+        ${IntField({label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n})}
+        ${IntField({label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k})}
+        ${FloatField({label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p})}
+      </fieldset>
+      <details>
+        <summary>More options</summary>
+        <fieldset class="two">
+          ${FloatField({label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z})}
+          ${FloatField({label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p})}
+          ${FloatField({label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty})}
+          ${FloatField({label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty})}
+        </fieldset>
+        <hr />
+        <fieldset class="three">
+          <div>
+            <label><input type="radio" name="mirostat" value="0" checked=${params.value.mirostat == 0} oninput=${updateParamsInt} /> no Mirostat</label>
+            <label><input type="radio" name="mirostat" value="1" checked=${params.value.mirostat == 1} oninput=${updateParamsInt} /> Mirostat v1</label>
+            <label><input type="radio" name="mirostat" value="2" checked=${params.value.mirostat == 2} oninput=${updateParamsInt} /> Mirostat v2</label>
+          </div>
+          ${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
+          ${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
+        </fieldset>
+      </details>
     </form>
     `
   }
 
   // poor mans markdown replacement
   const Markdownish = (params) => {
     const md = params.text
+      .replace(/&/g, '&amp;')
+      .replace(/</g, '&lt;')
+      .replace(/>/g, '&gt;')
       .replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
       .replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
       .replace(/__(.*?)__/g, '<strong>$1</strong>')
@@ -609,6 +609,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    fprintf(stdout, "  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stdout, "  -c N, --ctx-size N        size of the prompt context (default: %d)\n", params.n_ctx);
    fprintf(stdout, "  -gqa N, --gqa N           grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
+    fprintf(stdout, "  -eps N, --rms-norm-eps N  rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
    fprintf(stdout, "  --rope-freq-base N        RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
    fprintf(stdout, "  --rope-freq-scale N       RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
    fprintf(stdout, "  -b N, --batch-size N      batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -734,6 +735,14 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        }
        params.n_gqa = std::stoi(argv[i]);
    }
+    else if (arg == "-eps" || arg == "--rms-norm-eps") {
+        if (++i >= argc)
+        {
+            invalid_param = true;
+            break;
+        }
+        params.rms_norm_eps = std::stof(argv[i]);
+    }
    else if (arg == "--rope-freq-base")
    {
        if (++i >= argc)
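The server accepts the same flag, so the LLaMA-v2 value can be passed analogously (hypothetical model path):

    ./server -m models/llama-2-70b.ggmlv3.bin -gqa 8 -eps 1e-5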
@@ -16,6 +16,8 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+static const float rms_norm_eps = 1e-6f;
+
 struct random_normal_distribution {
     std::mt19937 gen;
     std::normal_distribution<float> rd;
@@ -439,7 +441,7 @@ struct ggml_tensor * forward(
         // norm
         {
             // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
 
             // cur = attention_norm*cur
             cur = ggml_mul(ctx0,
@@ -562,7 +564,7 @@ struct ggml_tensor * forward(
         // norm
         {
             // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpFF);
+            cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
 
             // cur = ffn_norm*cur
             // cur shape [n_embd,N,1,1]
@@ -606,7 +608,7 @@ struct ggml_tensor * forward(
     {
 
         // inpL shape [n_embd,N,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
 
         // inpL = norm*inpL
         // inpL shape [n_embd,N,1,1]
@@ -694,7 +696,7 @@ struct ggml_tensor * forward_batch(
         // norm
         {
             // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
             assert_shape_2d(cur, n_embd, N*n_batch);
 
             // cur = attention_norm*cur
@@ -857,7 +859,7 @@ struct ggml_tensor * forward_batch(
         // norm
         {
             // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpFF);
+            cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
             assert_shape_2d(cur, n_embd, N*n_batch);
 
             // cur = ffn_norm*cur
@@ -910,7 +912,7 @@ struct ggml_tensor * forward_batch(
     {
 
         // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
         assert_shape_2d(inpL, n_embd, N*n_batch);
 
         // inpL = norm*inpL
@@ -979,7 +981,7 @@ struct ggml_tensor * forward_batch_wo_cache(
         // norm
         {
             // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
             assert_shape_2d(cur, n_embd, N*n_batch);
 
             // cur = attention_norm*cur
@@ -1085,7 +1087,7 @@ struct ggml_tensor * forward_batch_wo_cache(
         // norm
         {
             // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpFF);
+            cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
             assert_shape_2d(cur, n_embd, N*n_batch);
 
             // cur = ffn_norm*cur
@@ -1138,7 +1140,7 @@ struct ggml_tensor * forward_batch_wo_cache(
     {
 
         // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
         assert_shape_2d(inpL, n_embd, N*n_batch);
 
         // inpL = norm*inpL
@@ -1203,7 +1205,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
 
         // norm
         {
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
             assert_shape_2d(cur, n_embd, N*n_batch);
 
             // cur = attention_norm*cur
@@ -1267,7 +1269,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
         {
             // norm
             {
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                 assert_shape_2d(cur, n_embd, N*n_batch);
 
                 // cur = ffn_norm*cur
@@ -1311,7 +1313,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
     // norm
     {
 
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
         assert_shape_2d(inpL, n_embd, N*n_batch);
 
         // inpL = norm*inpL
@@ -1603,7 +1605,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
         struct my_llama_layer & layer = model->layers[il];
         // tensors with values necessary for backward pass are in persistent buf(-1)
         // other tensors with buf(0) and buf(1) are only temporary needed, and their memory reused after layer is completed.
-        use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm     (ctx0, cur));                    assert_shape_2d(t02, n_embd, N*n_batch);
+        use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm     (ctx0, cur, rms_norm_eps));      assert_shape_2d(t02, n_embd, N*n_batch);
         use_buf( 0); struct ggml_tensor * t03 = expand(gf, ggml_repeat       (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch);
         use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul          (ctx0, t02, t03));               assert_shape_2d(t04, n_embd, N*n_batch);
         use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat      (ctx0, layer.wq, t04));          assert_shape_2d(t05, n_embd, N*n_batch);
@@ -1623,7 +1625,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
         use_buf(-1); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d   (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch);
         use_buf( 0); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat      (ctx0, layer.wo, t19));          assert_shape_2d(t20, n_embd, N*n_batch);
         use_buf(-1); struct ggml_tensor * t21 = expand(gf, ggml_add          (ctx0, t20, cur));               assert_shape_2d(t21, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm     (ctx0, t21));                    assert_shape_2d(t22, n_embd, N*n_batch);
+        use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm     (ctx0, t21, rms_norm_eps));      assert_shape_2d(t22, n_embd, N*n_batch);
         use_buf( 0); struct ggml_tensor * t23 = expand(gf, ggml_repeat       (ctx0, layer.ffn_norm, t22));    assert_shape_2d(t23, n_embd, N*n_batch);
         use_buf(-1); struct ggml_tensor * t24 = expand(gf, ggml_mul          (ctx0, t23, t22));               assert_shape_2d(t24, n_embd, N*n_batch);
         use_buf(-1); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat      (ctx0, layer.w3, t24));          assert_shape_2d(t25, n_ff, N*n_batch);
@@ -1666,7 +1668,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
     }
     clr_buf(0);
     use_buf(0);
-    struct ggml_tensor * t31   = expand(gf, ggml_rms_norm (ctx0, cur));                  assert_shape_2d(t31, n_embd, N*n_batch);
+    struct ggml_tensor * t31   = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps));    assert_shape_2d(t31, n_embd, N*n_batch);
     struct ggml_tensor * t32   = expand(gf, ggml_repeat   (ctx0, model->norm, t31));     assert_shape_2d(t32, n_embd, N*n_batch);
     struct ggml_tensor * t33   = expand(gf, ggml_mul      (ctx0, t32, t31));             assert_shape_2d(t33, n_embd, N*n_batch);
     use_buf(-1);
ggml-cuda.cu — 13 changes
@@ -332,12 +332,10 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     }
 }
 
-static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
+static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
-    const float eps = 1e-6f;
-
     float tmp = 0.0f; // partial sum for thread in warp
 
     for (int col = tid; col < ncols; col += WARP_SIZE) {
@@ -2122,10 +2120,10 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
     norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
-static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     const dim3 block_dims(WARP_SIZE, 1, 1);
-    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
 }
 
 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
@@ -2868,8 +2866,11 @@ inline void ggml_cuda_op_rms_norm(
     const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
     // compute
-    rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+    rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
 
     (void) src1;
     (void) dst;
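For reference, the kernel computes y = x / sqrt(mean(x^2) + eps) over each row; the epsilon keeps the denominator away from zero, which is why models trained with a different value (1e-5 for LLaMA-v2, per the help text) are sensitive to it. A scalar sketch of the same math, for illustration only (not the CUDA kernel):

    #include <cmath>
    #include <cstddef>

    // y[i] = x[i] / sqrt(mean(x^2) + eps) for one row of length n
    void rms_norm_ref(const float * x, float * y, size_t n, float eps) {
        double sum = 0.0;
        for (size_t i = 0; i < n; ++i) {
            sum += (double) x[i] * x[i];   // accumulate the squares in double
        }
        const float scale = 1.0f / std::sqrt((float)(sum / n) + eps);
        for (size_t i = 0; i < n; ++i) {
            y[i] = x[i] * scale;
        }
    }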
@@ -812,7 +812,8 @@ void ggml_metal_graph_compute(
                             encoder = [command_buffer computeCommandEncoder];
                         }
 
-                        const float eps = 1e-6f;
+                        float eps;
+                        memcpy(&eps, dst->op_params, sizeof(float));
 
                         const int nth = 512;
ggml.c — 16 changes
@@ -5782,6 +5782,7 @@ struct ggml_tensor * ggml_norm_inplace(
 static struct ggml_tensor * ggml_rms_norm_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
+        float                 eps,
         bool                  inplace) {
     bool is_node = false;
 
@@ -5791,7 +5792,7 @@ static struct ggml_tensor * ggml_rms_norm_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    // TODO: maybe store epsilon here?
+    ggml_set_op_params(result, &eps, sizeof(eps));
 
     result->op   = GGML_OP_RMS_NORM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5802,14 +5803,16 @@ static struct ggml_tensor * ggml_rms_norm_impl(
 
 struct ggml_tensor * ggml_rms_norm(
         struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_rms_norm_impl(ctx, a, false);
+        struct ggml_tensor  * a,
+        float                 eps) {
+    return ggml_rms_norm_impl(ctx, a, eps, false);
 }
 
 struct ggml_tensor * ggml_rms_norm_inplace(
         struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_rms_norm_impl(ctx, a, true);
+        struct ggml_tensor  * a,
+        float                 eps) {
+    return ggml_rms_norm_impl(ctx, a, eps, true);
 }
 
 struct ggml_tensor * ggml_rms_norm_back(
@@ -10132,7 +10135,8 @@ static void ggml_compute_forward_rms_norm_f32(
 
     GGML_TENSOR_UNARY_OP_LOCALS;
 
-    const float eps = 1e-6f; // TODO: make this a parameter
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
 
     // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
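This is the mechanism the CUDA and Metal hunks rely on: ggml_rms_norm_impl serializes the epsilon into the node's op_params blob at graph-build time, and every backend deserializes it at compute time instead of hard-coding 1e-6f. The round trip, sketched with names from the hunks above:

    // graph build: eps is carried inside the node itself
    struct ggml_tensor * t = ggml_rms_norm(ctx0, x, 1e-5f);
    // internally: ggml_set_op_params(result, &eps, sizeof(eps));

    // compute time (any backend): read it back out of the node
    float eps;
    memcpy(&eps, t->op_params, sizeof(float));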
ggml.h — 7 changes
@@ -865,14 +865,17 @@ extern "C" {
 
     GGML_API struct ggml_tensor * ggml_rms_norm(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
+            float                 eps);
 
     GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
+            float                 eps);
 
     // a - x
     // b - dy
+    // TODO: update with configurable eps
     GGML_API struct ggml_tensor * ggml_rms_norm_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
llama.cpp — 20 changes
@@ -187,6 +187,7 @@ struct llama_hparams {
     // LLaMAv2
     // TODO: load from model data hparams
     float f_ffn_mult = 1.0f;
+    float f_rms_norm_eps = 1e-6f;
 
     float rope_freq_base  = 10000.0f;
     float rope_freq_scale = 1.0f;
@@ -870,6 +871,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_ctx                       =*/ 512,
         /*.n_batch                     =*/ 512,
         /*.n_gqa                       =*/ 1,
+        /*.rms_norm_eps                =*/ 1e-6f,
         /*.gpu_layers                  =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
@@ -1001,6 +1003,7 @@ static void llama_model_load_internal(
         int n_ctx,
         int n_batch,
         int n_gqa,
+        float rms_norm_eps,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
@@ -1026,6 +1029,9 @@ static void llama_model_load_internal(
 
     auto & hparams = model.hparams;
 
+    // TODO: read from file
+    hparams.f_rms_norm_eps = rms_norm_eps;
+
     {
         switch (hparams.n_layer) {
             case 26: model.type = e_model::MODEL_3B; break;
@@ -1079,6 +1085,7 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_layer    = %u\n", __func__, hparams.n_layer);
         fprintf(stderr, "%s: n_rot      = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
         fprintf(stderr, "%s: n_gqa      = %u\n", __func__, hparams.n_gqa());
+        fprintf(stderr, "%s: rnorm_eps  = %.1e\n", __func__, hparams.f_rms_norm_eps);
         fprintf(stderr, "%s: n_ff       = %u\n", __func__, n_ff);
         fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
         fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
@@ -1337,6 +1344,7 @@ static bool llama_model_load(
         int n_ctx,
         int n_batch,
         int n_gqa,
+        float rms_norm_eps,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
@@ -1350,7 +1358,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1403,10 +1411,12 @@ static bool llama_eval_internal(
     const int64_t n_vocab    = hparams.n_vocab;
     const int64_t n_embd_gqa = hparams.n_embd_gqa();
 
     LLAMA_ASSERT(n_embd_head == hparams.n_rot);
 
     const float freq_base  = hparams.rope_freq_base;
     const float freq_scale = hparams.rope_freq_scale;
+    const float rms_norm_eps = hparams.f_rms_norm_eps;
 
     const int n_gpu_layers = model.n_gpu_layers;
 
@@ -1486,7 +1496,7 @@ static bool llama_eval_internal(
 
         // norm
         {
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
             offload_func(cur);
             ggml_set_name(cur, "rms_norm_0");
 
@@ -1634,7 +1644,7 @@ static bool llama_eval_internal(
         {
             // norm
             {
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                 offload_func(cur);
                 ggml_set_name(cur, "rms_norm_1");
 
@@ -1687,7 +1697,7 @@ static bool llama_eval_internal(
 
     // norm
     {
-        cur = ggml_rms_norm(ctx0, inpL);
+        cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
         offload_func_nr(cur);
         ggml_set_name(cur, "rms_norm_2");
 
@@ -3091,7 +3101,7 @@ struct llama_model * llama_load_model_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.n_gpu_layers,
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
             params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
             memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
             params.progress_callback_user_data)) {
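From the public API, the new field is the only thing a caller needs to set; a minimal sketch (placeholder model path, error handling omitted):

    llama_context_params cparams = llama_context_default_params(); // rms_norm_eps = 1e-6f
    cparams.rms_norm_eps = 1e-5f; // the LLaMA-v2 value, per the help text

    llama_model * model = llama_load_model_from_file("models/llama-2-70b.ggmlv3.bin", cparams);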
llama.h — 1 change
@@ -87,6 +87,7 @@ extern "C" {
         int32_t n_ctx;        // text context
         int32_t n_batch;      // prompt processing batch size
         int32_t n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
+        float   rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu;     // the GPU that is used for scratch and small tensors