Merge branch 'master' into concedo_experimental

# Conflicts:
#	.gitignore
#	CMakeLists.txt
#	Makefile
#	README.md
#	flake.nix
#	ggml-cuda.cu
Commit 343ae756fa by Concedo, 2023-07-22 11:51:30 +08:00
19 changed files with 721 additions and 444 deletions

.gitignore (vendored): 14 changes

@@ -64,6 +64,20 @@ qnt-*.txt
 perf-*.txt
 examples/jeopardy/results.txt
+pyproject.toml
+poetry.lock
+poetry.toml
+# Test binaries
+tests/test-double-float
+tests/test-grad0
+tests/test-opt
+tests/test-quantize-fns
+tests/test-quantize-perf
+tests/test-sampling
+tests/test-tokenizer-0
 koboldcpp.so
 koboldcpp_failsafe.so
 koboldcpp_openblas.so


@@ -243,7 +243,7 @@ function gg_sum_open_llama_3b_v2 {
 if [ -z $GG_BUILD_LOW_PERF ]; then
 rm -rf ${SRC}/models-mnt
-mnt_models=$(realpath ${MNT}/models)
+mnt_models=${MNT}/models
 mkdir -p ${mnt_models}
 ln -sfn ${mnt_models} ${SRC}/models-mnt


@@ -2,21 +2,21 @@
 set -e
 AI_NAME="${AI_NAME:-Miku}"
-MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
+MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
 USER_NAME="${USER_NAME:-Anon}"
 # Uncomment and adjust to the number of CPU cores you want to use.
 #N_THREAD="${N_THREAD:-4}"
+CTX_SIZE="${CTX_SIZE:-4096}"
 N_PREDICTS="${N_PREDICTS:-4096}"
 GEN_OPTIONS=(--batch_size 1024
---ctx_size 2048
+--ctx_size "$CTX_SIZE"
 --keep -1
 --repeat_last_n 256
 --repeat_penalty 1.17647
---temp 0.7
---top_k 40
---top_p 0.5)
+--temp 0.6
+--mirostat 2)
 if [ -n "$N_THREAD" ]; then
 GEN_OPTIONS+=(--threads "$N_THREAD")
@@ -24,16 +24,17 @@ fi
 ./main "${GEN_OPTIONS[@]}" \
 --model "$MODEL" \
+--in-prefix " " \
+--in-suffix "${AI_NAME}:" \
 --n_predict "$N_PREDICTS" \
 --color --interactive \
 --reverse-prompt "${USER_NAME}:" \
---prompt "
-This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
+--prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
 ${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
 ${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
 ${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
 ${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
-The conversation is only between ${USER_NAME} and ${AI_NAME}
+The conversation is only between ${USER_NAME} and ${AI_NAME}.
 The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
 ${AI_NAME} can only communicate through text, so she can't send images or videos.


@@ -260,12 +260,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 break;
 }
 params.cfg_scale = std::stof(argv[i]);
-} else if (arg == "--cfg-smooth-factor") {
-if (++i >= argc) {
-invalid_param = true;
-break;
-}
-params.cfg_smooth_factor = std::stof(argv[i]);
 } else if (arg == "-b" || arg == "--batch-size") {
 if (++i >= argc) {
 invalid_param = true;
@@ -509,7 +503,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 fprintf(stderr, " --cfg-negative-prompt PROMPT \n");
 fprintf(stderr, " negative prompt to use for guidance. (default: empty)\n");
 fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-fprintf(stderr, " --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor);
 fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
 fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
 fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
@@ -586,7 +579,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 lparams.n_batch = params.n_batch;
 lparams.n_gpu_layers = params.n_gpu_layers;
 lparams.main_gpu = params.main_gpu;
-memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
+lparams.tensor_split = params.tensor_split;
 lparams.low_vram = params.low_vram;
 lparams.seed = params.seed;
 lparams.f16_kv = params.memory_f16;
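
Note: since lparams.tensor_split is now a borrowed pointer rather than a copied array, the floats it points at must stay alive until the context has been created. A minimal hedged sketch of the caller-side implication (the split values and field settings below are illustrative, not code from this commit):

// Hedged sketch: tensor_split is borrowed, not copied, so keep the backing
// array alive at least until context creation has finished.
// Pass nullptr to keep the backend's default split.
static float tensor_split[LLAMA_MAX_DEVICES] = { 3.0f, 1.0f };   // e.g. a 3:1 layer split

struct llama_context_params lparams = llama_context_default_params();
lparams.n_gpu_layers = 35;                // illustrative value
lparams.tensor_split = tensor_split;      // read during model load (ggml_cuda_set_tensor_split)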


@@ -55,7 +55,6 @@ struct gpt_params {
 // https://arxiv.org/abs/2306.17806
 std::string cfg_negative_prompt; // string to help guidance
 float cfg_scale = 1.f; // How strong is guidance
-float cfg_smooth_factor = 1.f; // Smooth factor between old and new logits
 std::string model = "models/7B/ggml-model.bin"; // model path
 std::string model_alias = "unknown"; // model alias


@@ -64,7 +64,7 @@ class MiniGPT4(Blip2Base):
 self.max_txt_len = max_txt_len
 self.end_sym = end_sym
 self.model = MyModel(["main", *args])
-# system promt
+# system prompt
 self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
 "You will be able to see the image once I provide it to you. Please answer my questions."
 "###")

examples/llama2-13b.sh (new executable file, 18 lines)

@@ -0,0 +1,18 @@
+#!/bin/bash
+#
+# Temporary script - will be removed in the future
+#
+cd `dirname $0`
+cd ..
+./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
+    --color \
+    --ctx_size 2048 \
+    -n -1 \
+    -ins -b 256 \
+    --top_k 10000 \
+    --temp 0.2 \
+    --repeat_penalty 1.1 \
+    -t 8

examples/llama2.sh (new executable file, 18 lines)

@@ -0,0 +1,18 @@
+#!/bin/bash
+#
+# Temporary script - will be removed in the future
+#
+cd `dirname $0`
+cd ..
+./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
+    --color \
+    --ctx_size 2048 \
+    -n -1 \
+    -ins -b 256 \
+    --top_k 10000 \
+    --temp 0.2 \
+    --repeat_penalty 1.1 \
+    -t 8


@@ -557,7 +557,7 @@ int main(int argc, char ** argv) {
 llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 if (ctx_guidance) {
-llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale, params.cfg_smooth_factor);
+llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale);
 }
 // Apply penalties
examples/make-ggml.py (new file, 92 lines)

@@ -0,0 +1,92 @@
+"""
+This script converts Hugging Face llama models to GGML and quantizes them.
+Usage:
+python make-ggml.py --model {model_dir_or_hf_repo_name} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
+Arguments:
+- --model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
+- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
+- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
+- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
+- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
+Quant types:
+- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
+- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
+- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
+- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
+- Q2_K: smallest, extreme quality loss - not recommended
+- Q3_K: alias for Q3_K_M
+- Q3_K_S: very small, very high quality loss
+- Q3_K_M: very small, very high quality loss
+- Q3_K_L: small, substantial quality loss
+- Q4_K: alias for Q4_K_M
+- Q4_K_S: small, significant quality loss
+- Q4_K_M: medium, balanced quality - recommended
+- Q5_K: alias for Q5_K_M
+- Q5_K_S: large, low quality loss - recommended
+- Q5_K_M: large, very low quality loss - recommended
+- Q6_K: very large, extremely low quality loss
+- Q8_0: very large, extremely low quality loss - not recommended
+- F16: extremely large, virtually no quality loss - not recommended
+- F32: absolutely huge, lossless - not recommended
+"""
+import subprocess
+subprocess.run(f"pip install huggingface-hub==0.16.4", shell=True, check=True)
+import argparse
+import os
+from huggingface_hub import snapshot_download
+def main(model, outname, outdir, quants, keep_fp16):
+    ggml_version = "v3"
+    if not os.path.isdir(model):
+        print(f"Model not found at {model}. Downloading...")
+        try:
+            if outname is None:
+                outname = model.split('/')[-1]
+            model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache')
+        except Exception as e:
+            raise Exception(f"Could not download the model: {e}")
+    if outdir is None:
+        outdir = f'../models/{outname}'
+    if not os.path.isfile(f"{model}/config.json"):
+        raise Exception(f"Could not find config.json in {model}")
+    os.makedirs(outdir, exist_ok=True)
+    print("Building llama.cpp")
+    subprocess.run(f"cd .. && make quantize", shell=True, check=True)
+    fp16 = f"{outdir}/{outname}.ggml{ggml_version}.fp16.bin"
+    print(f"Making unquantised GGML at {fp16}")
+    if not os.path.isfile(fp16):
+        subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
+    else:
+        print(f"Unquantised GGML already exists at: {fp16}")
+    print("Making quants")
+    for type in quants:
+        outfile = f"{outdir}/{outname}.ggml{ggml_version}.{type}.bin"
+        print(f"Making {type} : {outfile}")
+        subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
+    if not keep_fp16:
+        os.remove(fp16)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Convert/Quantize HF to GGML. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
+    parser.add_argument('--model', required=True, help='Downloaded model dir or Hugging Face model repo name')
+    parser.add_argument('--outname', default=None, help='Output model(s) name')
+    parser.add_argument('--outdir', default=None, help='Output directory')
+    parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
+    parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model', default=False)
+    args = parser.parse_args()
+    main(args.model, args.outname, args.outdir, args.quants, args.keep_fp16)


@@ -7,6 +7,9 @@ target_compile_definitions(${TARGET} PRIVATE
 SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+if (WIN32)
+    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
+endif()
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
 add_dependencies(${TARGET} BUILD_INFO)


@@ -1434,7 +1434,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
 gf->perf_time_us = 0;
 const auto & hparams = model->hparams;
-//const int n_ctx = hparams.n_ctx;
+const int n_ctx = hparams.n_ctx;
 const int n_vocab = hparams.n_vocab;
 const int n_embd = hparams.n_embd;
 const int n_layer = hparams.n_layer;
@@ -1863,10 +1863,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
 t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head);
 t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd);
 t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch);
-t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch);
+t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch);
 t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch);
 t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch);
-t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch);
+t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch);
 t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch);
 t04->grad = expand(gb, ggml_add_inplace(ctx0,
 ggml_add_inplace(ctx0,


@@ -2537,6 +2537,9 @@ void ggml_init_cublas() {
 }
 void ggml_cuda_set_tensor_split(const float * tensor_split) {
+if (tensor_split == nullptr) {
+    return;
+}
 bool all_zero = true;
 for (int i = 0; i < g_device_count; ++i) {
 if (tensor_split[i] != 0.0f) {
@@ -2972,19 +2975,18 @@ inline void ggml_cuda_op_rope(
 const int64_t ne00 = src0->ne[0];
 const int64_t i01_diff = i01_high - i01_low;
-float freq_base;
-float freq_scale;
 const int n_past = ((int32_t *) src1->data)[0];
 const int n_dims = ((int32_t *) src1->data)[1];
 const int mode = ((int32_t *) src1->data)[2];
 const int n_ctx = ((int32_t *) src1->data)[3];
+// RoPE alteration for extended context
+float freq_base, freq_scale;
 memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
 memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
 const float theta_scale = powf(freq_base, -2.0f/n_dims);
-const float p0 = ((mode & 1) == 0 ? n_past + i02 : i02);
-const float p = p0 * freq_scale;
+const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
 bool is_glm = mode & 4;


@@ -676,8 +676,8 @@ void ggml_metal_graph_compute(
 GGML_ASSERT(ne02 == 1);
 GGML_ASSERT(ne12 == 1);
-nth0 = 4;
-nth1 = 16;
+nth0 = 2;
+nth1 = 32;
 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
 } break;
 case GGML_TYPE_Q3_K:
@@ -685,8 +685,8 @@ void ggml_metal_graph_compute(
 GGML_ASSERT(ne02 == 1);
 GGML_ASSERT(ne12 == 1);
-nth0 = 4;
-nth1 = 16;
+nth0 = 2;
+nth1 = 32;
 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
 } break;
 case GGML_TYPE_Q4_K:
@@ -694,8 +694,8 @@ void ggml_metal_graph_compute(
 GGML_ASSERT(ne02 == 1);
 GGML_ASSERT(ne12 == 1);
-nth0 = 4;
-nth1 = 16;
+nth0 = 2;
+nth1 = 32;
 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
 } break;
 case GGML_TYPE_Q5_K:
@@ -703,8 +703,8 @@ void ggml_metal_graph_compute(
 GGML_ASSERT(ne02 == 1);
 GGML_ASSERT(ne12 == 1);
-nth0 = 4;
-nth1 = 16;
+nth0 = 2;
+nth1 = 32;
 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
 } break;
 case GGML_TYPE_Q6_K:
@@ -712,8 +712,8 @@ void ggml_metal_graph_compute(
 GGML_ASSERT(ne02 == 1);
 GGML_ASSERT(ne12 == 1);
-nth0 = 4;
-nth1 = 16;
+nth0 = 2;
+nth1 = 32;
 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
 } break;
 default:
@@ -739,16 +739,22 @@ void ggml_metal_graph_compute(
 [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
 [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
-if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
+if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
+    src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 }
-else if (src0t == GGML_TYPE_Q2_K ||
-         src0t == GGML_TYPE_Q3_K ||
-         src0t == GGML_TYPE_Q4_K ||
-         src0t == GGML_TYPE_Q5_K ||
-         src0t == GGML_TYPE_Q6_K) {
-[encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
-[encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+else if (src0t == GGML_TYPE_Q3_K) {
+#ifdef GGML_QKK_64
+[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+#else
+[encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+#endif
+}
+else if (src0t == GGML_TYPE_Q5_K) {
+[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+}
+else if (src0t == GGML_TYPE_Q6_K) {
+[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 } else {
 [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
 [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];

File diff suppressed because it is too large.

ggml.c: 26 changes

@@ -6957,9 +6957,9 @@ struct ggml_tensor * ggml_rope_impl(
 int n_past,
 int n_dims,
 int mode,
+int n_ctx,
 float freq_base,
 float freq_scale,
-int n_ctx,
 bool inplace) {
 GGML_ASSERT(n_past >= 0);
 bool is_node = false;
@@ -6998,7 +6998,7 @@ struct ggml_tensor * ggml_rope(
 int n_dims,
 int mode,
 int n_ctx) {
-return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, false);
+return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
 }
 struct ggml_tensor * ggml_rope_inplace(
@@ -7008,7 +7008,7 @@ struct ggml_tensor * ggml_rope_inplace(
 int n_dims,
 int mode,
 int n_ctx) {
-return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, true);
+return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
 }
 struct ggml_tensor * ggml_rope_custom_inplace(
@@ -7017,10 +7017,10 @@ struct ggml_tensor * ggml_rope_custom_inplace(
 int n_past,
 int n_dims,
 int mode,
+int n_ctx,
 float freq_base,
-float freq_scale,
-int n_ctx) {
-return ggml_rope_impl(ctx, a, n_past, n_dims, mode, freq_base, freq_scale, n_ctx, true);
+float freq_scale) {
+return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
 }
 // ggml_rope_back
@@ -7030,7 +7030,8 @@ struct ggml_tensor * ggml_rope_back(
 struct ggml_tensor * a,
 int n_past,
 int n_dims,
-int mode) {
+int mode,
+int n_ctx) {
 GGML_ASSERT(n_past >= 0);
 GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
@@ -7044,12 +7045,13 @@
 ggml_scratch_save(ctx);
-struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
 ggml_set_name(b, "n_past, n_dims, mode");
 ((int32_t *) b->data)[0] = n_past;
 ((int32_t *) b->data)[1] = n_dims;
 ((int32_t *) b->data)[2] = mode;
+((int32_t *) b->data)[3] = n_ctx;
 ggml_scratch_load(ctx);
@@ -12378,7 +12380,7 @@ static void ggml_compute_forward_rope_back_f32(
 const struct ggml_tensor * src1,
 struct ggml_tensor * dst) {
 assert(src1->type == GGML_TYPE_I32);
-assert(ggml_nelements(src1) == 3);
+assert(ggml_nelements(src1) == 4);
 if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
 return;
@@ -15741,13 +15743,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
 const int n_past = ((int32_t *) src1->data)[0];
 const int n_dims = ((int32_t *) src1->data)[1];
 const int mode = ((int32_t *) src1->data)[2];
+const int n_ctx = ((int32_t *) src1->data)[3];
 src0->grad = ggml_add_impl(ctx,
 src0->grad,
 ggml_rope_back(ctx,
 tensor->grad,
 n_past,
 n_dims,
-mode),
+mode,
+n_ctx),
 inplace);
 }
 if (src1->grad) {
@@ -15758,7 +15762,7 @@
 {
 if (src0->grad) {
 assert(src1->type == GGML_TYPE_I32);
-assert(ggml_nelements(src1) == 3);
+assert(ggml_nelements(src1) == 4);
 const int n_past = ((int32_t *) src1->data)[0];
 const int n_dims = ((int32_t *) src1->data)[1];
 const int mode = ((int32_t *) src1->data)[2];

ggml.h: 7 changes

@@ -1127,9 +1127,9 @@ extern "C" {
 int n_past,
 int n_dims,
 int mode,
+int n_ctx,
 float freq_base,
-float freq_scale,
-int n_ctx);
+float freq_scale);
 // rotary position embedding backward, i.e compute dx from dy
 // a - dy
@@ -1138,7 +1138,8 @@
 struct ggml_tensor * a,
 int n_past,
 int n_dims,
-int mode);
+int mode,
+int n_ctx);
 // alibi position embedding
 // in-place, returns view(a)
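
Note: the public RoPE entry points now take n_ctx immediately after mode, ahead of freq_base/freq_scale, and ggml_rope_back() gains a trailing n_ctx. A minimal hedged sketch of the new call shapes (tensor and parameter names here are placeholders, not code from this commit; n_ctx = 0 mirrors how llama.cpp itself calls the non-GLM path):

// Hedged sketch of the reordered arguments.
struct ggml_tensor * cur_rot = ggml_rope_custom_inplace(
    ctx0, cur,
    n_past, n_rot,
    /*mode      =*/ 0,
    /*n_ctx     =*/ 0,
    /*freq_base =*/ 10000.0f,
    /*freq_scale=*/ 1.0f);

struct ggml_tensor * dx = ggml_rope_back(ctx0, dy, n_past, n_rot, /*mode=*/ 0, /*n_ctx=*/ 0);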


@@ -850,7 +850,7 @@ struct llama_context_params llama_context_default_params() {
 /*.n_batch =*/ 512,
 /*.gpu_layers =*/ 0,
 /*.main_gpu =*/ 0,
-/*.tensor_split =*/ {0},
+/*.tensor_split =*/ nullptr,
 /*.rope_freq_base =*/ 10000.0f,
 /*.rope_freq_scale =*/ 1.0f,
 /*.progress_callback =*/ nullptr,
@@ -1290,7 +1290,7 @@ static bool llama_model_load(
 int n_batch,
 int n_gpu_layers,
 int main_gpu,
-float * tensor_split,
+const float * tensor_split,
 float rope_freq_base,
 float rope_freq_scale,
 bool low_vram,
@@ -1453,11 +1453,11 @@ static bool llama_eval_internal(
 offload_func_kq(tmpq);
 ggml_set_name(tmpq, "tmpq");
-struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
+struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
 offload_func_kq(Kcur);
 ggml_set_name(Kcur, "Kcur");
-struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
+struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
 offload_func_kq(Qcur);
 ggml_set_name(Qcur, "Qcur");
@@ -2219,8 +2219,7 @@ void llama_sample_classifier_free_guidance(
 struct llama_context * ctx,
 llama_token_data_array * candidates,
 struct llama_context * guidance_ctx,
-float scale,
-float smooth_factor) {
+float scale) {
 int64_t t_start_sample_us = ggml_time_us();
 assert(ctx);
@@ -2241,16 +2240,7 @@
 for (int i = 0; i < n_vocab; ++i) {
 float logit_guidance = logits_guidance[i];
 float logit_base = logits_base[i];
-logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
-}
-llama_log_softmax(logits_guidance, n_vocab);
-for (int i = 0; i < n_vocab; ++i) {
-float logit_base = logits_base[i];
-float logit_guidance = logits_guidance[i];
-candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
+candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
 }
 if (ctx) {


@@ -88,7 +88,8 @@ extern "C" {
 int32_t n_batch; // prompt processing batch size
 int32_t n_gpu_layers; // number of layers to store in VRAM
 int32_t main_gpu; // the GPU that is used for scratch and small tensors
-float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 // ref: https://github.com/ggerganov/llama.cpp/pull/2054
 float rope_freq_base; // RoPE base frequency
@@ -343,13 +344,11 @@
 /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
 /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
 /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-/// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
 LLAMA_API void llama_sample_classifier_free_guidance(
 struct llama_context * ctx,
 llama_token_data_array * candidates,
 struct llama_context * guidance_ctx,
-float scale,
-float smooth_factor);
+float scale);
 /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
 LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
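
Note: downstream callers of the classifier-free-guidance sampler only need to drop the trailing smooth-factor argument. A hedged sketch of the updated call site, mirroring the examples/main usage shown earlier in this diff (the surrounding variables are assumed context, not new code):

// Hedged sketch: CFG smoothing was removed, so only the guidance scale is passed now.
if (ctx_guidance) {
    // old signature: (..., params.cfg_scale, params.cfg_smooth_factor)
    llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale);
}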