Merge branch 'master' into HEAD
This commit is contained in:
commit
39d370452c
8 changed files with 117 additions and 35 deletions
|
@ -13,17 +13,31 @@ let package = Package(
|
||||||
products: [
|
products: [
|
||||||
.library(name: "llama", targets: ["llama"]),
|
.library(name: "llama", targets: ["llama"]),
|
||||||
],
|
],
|
||||||
dependencies: [
|
|
||||||
.package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
|
|
||||||
],
|
|
||||||
targets: [
|
targets: [
|
||||||
.target(
|
.target(
|
||||||
name: "llama",
|
name: "llama",
|
||||||
dependencies: ["ggml"],
|
|
||||||
path: ".",
|
path: ".",
|
||||||
exclude: ["ggml-metal.metal"],
|
exclude: [
|
||||||
|
"cmake",
|
||||||
|
"examples",
|
||||||
|
"scripts",
|
||||||
|
"models",
|
||||||
|
"tests",
|
||||||
|
"CMakeLists.txt",
|
||||||
|
"ggml-cuda.cu",
|
||||||
|
"ggml-cuda.h",
|
||||||
|
"Makefile"
|
||||||
|
],
|
||||||
sources: [
|
sources: [
|
||||||
|
"ggml.c",
|
||||||
"llama.cpp",
|
"llama.cpp",
|
||||||
|
"ggml-alloc.c",
|
||||||
|
"ggml-backend.c",
|
||||||
|
"ggml-quants.c",
|
||||||
|
"ggml-metal.m",
|
||||||
|
],
|
||||||
|
resources: [
|
||||||
|
.process("ggml-metal.metal")
|
||||||
],
|
],
|
||||||
publicHeadersPath: "spm-headers",
|
publicHeadersPath: "spm-headers",
|
||||||
cSettings: [
|
cSettings: [
|
||||||
|
|
46
ci/run.sh
46
ci/run.sh
|
@ -568,6 +568,50 @@ function gg_sum_open_llama_7b_v2 {
|
||||||
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
|
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# bge-small
|
||||||
|
|
||||||
|
function gg_run_embd_bge_small {
|
||||||
|
cd ${SRC}
|
||||||
|
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
|
||||||
|
|
||||||
|
path_models="../models-mnt/bge-small"
|
||||||
|
|
||||||
|
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
|
python3 ../convert-hf-to-gguf.py ${path_models}
|
||||||
|
|
||||||
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
|
||||||
|
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
|
|
||||||
|
(time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
|
||||||
|
set +e
|
||||||
|
}
|
||||||
|
|
||||||
|
function gg_sum_embd_bge_small {
|
||||||
|
gg_printf '### %s\n\n' "${ci}"
|
||||||
|
|
||||||
|
gg_printf 'BGE Small (BERT):\n'
|
||||||
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
|
}
|
||||||
|
|
||||||
## main
|
## main
|
||||||
|
|
||||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||||
|
@ -591,6 +635,8 @@ test $ret -eq 0 && gg_run ctest_debug
|
||||||
test $ret -eq 0 && gg_run ctest_release
|
test $ret -eq 0 && gg_run ctest_release
|
||||||
|
|
||||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||||
|
test $ret -eq 0 && gg_run embd_bge_small
|
||||||
|
|
||||||
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
||||||
if [ -z ${GG_BUILD_CUDA} ]; then
|
if [ -z ${GG_BUILD_CUDA} ]; then
|
||||||
test $ret -eq 0 && gg_run open_llama_3b_v2
|
test $ret -eq 0 && gg_run open_llama_3b_v2
|
||||||
|
|
|
@ -3819,15 +3819,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||||
/* Compute combined scale for the block */
|
/* Compute combined scale for the block */
|
||||||
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
|
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
|
||||||
|
|
||||||
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
__m256i qx = bytes_from_nibbles_32(x[i].qs);
|
||||||
|
|
||||||
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
|
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
|
||||||
const __m256i off = _mm256_set1_epi8( 8 );
|
const __m256i off = _mm256_set1_epi8( 8 );
|
||||||
bx = _mm256_sub_epi8( bx, off );
|
qx = _mm256_sub_epi8( qx, off );
|
||||||
|
|
||||||
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
__m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||||
|
|
||||||
const __m256 q = mul_sum_i8_pairs_float(bx, by);
|
const __m256 q = mul_sum_i8_pairs_float(qx, qy);
|
||||||
|
|
||||||
/* Multiply q with scale and accumulate */
|
/* Multiply q with scale and accumulate */
|
||||||
acc = _mm256_fmadd_ps( d, q, acc );
|
acc = _mm256_fmadd_ps( d, q, acc );
|
||||||
|
@ -4196,10 +4196,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||||
const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
|
const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
|
||||||
|
|
||||||
// Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
|
// Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
|
||||||
const __m256i bx = bytes_from_nibbles_32(x[i].qs);
|
const __m256i qx = bytes_from_nibbles_32(x[i].qs);
|
||||||
const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
|
const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[i].qs );
|
||||||
|
|
||||||
const __m256 xy = mul_sum_us8_pairs_float(bx, by);
|
const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
|
||||||
|
|
||||||
// Accumulate d0*d1*x*y
|
// Accumulate d0*d1*x*y
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
@ -4418,14 +4418,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||||
/* Compute combined scale for the block */
|
/* Compute combined scale for the block */
|
||||||
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
||||||
|
|
||||||
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
__m256i qx = bytes_from_nibbles_32(x[i].qs);
|
||||||
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
||||||
bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
|
bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
|
||||||
bx = _mm256_or_si256(bx, bxhi);
|
qx = _mm256_or_si256(qx, bxhi);
|
||||||
|
|
||||||
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
__m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||||
|
|
||||||
const __m256 q = mul_sum_i8_pairs_float(bx, by);
|
const __m256 q = mul_sum_i8_pairs_float(qx, qy);
|
||||||
|
|
||||||
/* Multiply q with scale and accumulate */
|
/* Multiply q with scale and accumulate */
|
||||||
acc = _mm256_fmadd_ps(d, q, acc);
|
acc = _mm256_fmadd_ps(d, q, acc);
|
||||||
|
@ -4722,15 +4722,15 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||||
|
|
||||||
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
||||||
|
|
||||||
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
__m256i qx = bytes_from_nibbles_32(x[i].qs);
|
||||||
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
||||||
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
|
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
|
||||||
bx = _mm256_or_si256(bx, bxhi);
|
qx = _mm256_or_si256(qx, bxhi);
|
||||||
|
|
||||||
const __m256 dy = _mm256_set1_ps(y[i].d);
|
const __m256 dy = _mm256_set1_ps(y[i].d);
|
||||||
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||||
|
|
||||||
const __m256 q = mul_sum_us8_pairs_float(bx, by);
|
const __m256 q = mul_sum_us8_pairs_float(qx, qy);
|
||||||
|
|
||||||
acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
|
acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
|
||||||
}
|
}
|
||||||
|
@ -4973,10 +4973,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||||
for (int i = 0; i < nb; ++i) {
|
for (int i = 0; i < nb; ++i) {
|
||||||
// Compute combined scale for the block
|
// Compute combined scale for the block
|
||||||
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
||||||
__m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
|
__m256i qx = _mm256_loadu_si256((const __m256i *)x[i].qs);
|
||||||
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
__m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||||
|
|
||||||
const __m256 q = mul_sum_i8_pairs_float(bx, by);
|
const __m256 q = mul_sum_i8_pairs_float(qx, qy);
|
||||||
|
|
||||||
// Multiply q with scale and accumulate
|
// Multiply q with scale and accumulate
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
|
22
llama.cpp
22
llama.cpp
|
@ -774,22 +774,37 @@ struct LLM_TN {
|
||||||
llm_arch arch;
|
llm_arch arch;
|
||||||
|
|
||||||
std::string operator()(llm_tensor tensor) const {
|
std::string operator()(llm_tensor tensor) const {
|
||||||
|
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
|
||||||
|
return "__missing__";
|
||||||
|
}
|
||||||
return LLM_TENSOR_NAMES[arch].at(tensor);
|
return LLM_TENSOR_NAMES[arch].at(tensor);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string operator()(llm_tensor tensor, const std::string & suffix) const {
|
std::string operator()(llm_tensor tensor, const std::string & suffix) const {
|
||||||
|
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
|
||||||
|
return "__missing__";
|
||||||
|
}
|
||||||
return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
|
return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string operator()(llm_tensor tensor, int bid) const {
|
std::string operator()(llm_tensor tensor, int bid) const {
|
||||||
|
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
|
||||||
|
return "__missing__";
|
||||||
|
}
|
||||||
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
|
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
|
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
|
||||||
|
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
|
||||||
|
return "__missing__";
|
||||||
|
}
|
||||||
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
|
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
|
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
|
||||||
|
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
|
||||||
|
return "__missing__";
|
||||||
|
}
|
||||||
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
|
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -10249,6 +10264,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||||
}
|
}
|
||||||
++qs.i_ffn_up;
|
++qs.i_ffn_up;
|
||||||
}
|
}
|
||||||
|
|
||||||
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||||
//}
|
//}
|
||||||
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
|
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
|
||||||
|
@ -10450,7 +10466,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
quantize &= !params->only_copy;
|
quantize &= !params->only_copy;
|
||||||
|
|
||||||
// do not quantize expert gating tensors
|
// do not quantize expert gating tensors
|
||||||
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
|
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
|
||||||
|
|
||||||
|
// do not quantize positional embeddings and token types (BERT)
|
||||||
|
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
|
||||||
|
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
|
||||||
|
|
||||||
enum ggml_type new_type;
|
enum ggml_type new_type;
|
||||||
void * new_data;
|
void * new_data;
|
||||||
|
|
1
spm-headers/ggml-alloc.h
Symbolic link
1
spm-headers/ggml-alloc.h
Symbolic link
|
@ -0,0 +1 @@
|
||||||
|
../ggml-alloc.h
|
1
spm-headers/ggml-backend.h
Symbolic link
1
spm-headers/ggml-backend.h
Symbolic link
|
@ -0,0 +1 @@
|
||||||
|
../ggml-backend.h
|
1
spm-headers/ggml.h
Symbolic link
1
spm-headers/ggml.h
Symbolic link
|
@ -0,0 +1 @@
|
||||||
|
../ggml.h
|
|
@ -2129,14 +2129,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||||
test_cases.emplace_back(new test_pad());
|
test_cases.emplace_back(new test_pad());
|
||||||
test_cases.emplace_back(new test_leaky_relu());
|
test_cases.emplace_back(new test_leaky_relu());
|
||||||
|
|
||||||
|
// these tests are disabled to save execution time, but they can be handy for debugging
|
||||||
|
#if 0
|
||||||
#if !defined(__SANITIZE_THREAD__)
|
#if !defined(__SANITIZE_THREAD__)
|
||||||
// FIXME: these tests use too much memory with thread sanitizer
|
// FIXME: these tests use too much memory with thread sanitizer
|
||||||
test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
|
test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
|
||||||
//test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
|
//test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// these tests are disabled to save execution time, but they can be handy for debugging
|
|
||||||
#if 0
|
|
||||||
test_cases.emplace_back(new test_llama(1));
|
test_cases.emplace_back(new test_llama(1));
|
||||||
test_cases.emplace_back(new test_llama(2));
|
test_cases.emplace_back(new test_llama(2));
|
||||||
test_cases.emplace_back(new test_falcon(1));
|
test_cases.emplace_back(new test_falcon(1));
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue