Merge branch 'ggerganov:master' into sgemm-avx

This commit is contained in:
Eve 2024-04-25 16:45:30 +00:00 committed by GitHub
commit 330b3bc5b5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 198 additions and 76 deletions

View file

@ -43,12 +43,6 @@ else()
set(LLAMA_METAL_DEFAULT OFF) set(LLAMA_METAL_DEFAULT OFF)
endif() endif()
if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
set(LLAMA_LLAMAFILE_DEFAULT OFF)
else()
set(LLAMA_LLAMAFILE_DEFAULT ON)
endif()
# general # general
option(BUILD_SHARED_LIBS "build shared libraries" OFF) option(BUILD_SHARED_LIBS "build shared libraries" OFF)
option(LLAMA_STATIC "llama: static link libraries" OFF) option(LLAMA_STATIC "llama: static link libraries" OFF)

View file

@ -93,6 +93,7 @@ Typically finetunes of the base models below are supported as well.
- [X] LLaMA 🦙 - [X] LLaMA 🦙
- [x] LLaMA 2 🦙🦙 - [x] LLaMA 2 🦙🦙
- [x] LLaMA 3 🦙🦙🦙
- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral) - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct) - [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
@ -119,8 +120,9 @@ Typically finetunes of the base models below are supported as well.
- [x] [CodeShell](https://github.com/WisdomShell/codeshell) - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
- [x] [Gemma](https://ai.google.dev/gemma) - [x] [Gemma](https://ai.google.dev/gemma)
- [x] [Mamba](https://github.com/state-spaces/mamba) - [x] [Mamba](https://github.com/state-spaces/mamba)
- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf)
- [x] [Xverse](https://huggingface.co/models?search=xverse) - [x] [Xverse](https://huggingface.co/models?search=xverse)
- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01) - [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r)
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B) - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo) - [x] [OLMo](https://allenai.org/olmo)
@ -135,6 +137,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V) - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM) - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL) - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
**HTTP server** **HTTP server**

View file

@ -160,7 +160,9 @@ function gg_run_test_scripts_debug {
set -e set -e
# TODO: too slow, run on dedicated node
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
#(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e set +e
} }
@ -184,6 +186,7 @@ function gg_run_test_scripts_release {
set -e set -e
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e set +e
} }

16
examples/gguf-split/tests.sh Normal file → Executable file
View file

@ -4,16 +4,16 @@ set -eu
if [ $# -lt 1 ] if [ $# -lt 1 ]
then then
echo "usage: $0 path_to_build_binary [path_to_temp_folder]" echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
echo "example: $0 ../../build/bin ../../tmp" echo "example: $0 ../../build/bin ../../tmp"
exit 1 exit 1
fi fi
if [ $# -gt 1 ] if [ $# -gt 1 ]
then then
TMP_DIR=$2 TMP_DIR=$2
else else
TMP_DIR=/tmp TMP_DIR=/tmp
fi fi
set -x set -x
@ -21,7 +21,7 @@ set -x
SPLIT=$1/gguf-split SPLIT=$1/gguf-split
MAIN=$1/main MAIN=$1/main
WORK_PATH=$TMP_DIR/gguf-split WORK_PATH=$TMP_DIR/gguf-split
CUR_DIR=$(pwd) ROOT_DIR=$(realpath $(dirname $0)/../../)
mkdir -p "$WORK_PATH" mkdir -p "$WORK_PATH"
@ -30,8 +30,8 @@ rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
# 1. Get a model # 1. Get a model
( (
cd $WORK_PATH cd $WORK_PATH
"$CUR_DIR"/../../scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf "$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
) )
echo PASS echo PASS

View file

@ -1325,7 +1325,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
} }
// Linear interpolation between two points // Linear interpolation between two points
inline float lerp(float s, float e, float t) { inline float clip_lerp(float s, float e, float t) {
return s + (e - s) * t; return s + (e - s) * t;
} }
// Bilinear resize function // Bilinear resize function
@ -1347,17 +1347,17 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta
float y_lerp = py - y_floor; float y_lerp = py - y_floor;
for (int c = 0; c < 3; c++) { for (int c = 0; c < 3; c++) {
float top = lerp( float top = clip_lerp(
static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]), static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
x_lerp x_lerp
); );
float bottom = lerp( float bottom = clip_lerp(
static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
x_lerp x_lerp
); );
dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp)); dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(clip_lerp(top, bottom, y_lerp));
} }
} }
} }

View file

@ -97,6 +97,7 @@ static void usage(const char * executable) {
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
printf(" --keep-split: will generate quatized model in the same shards as input");
printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
printf("Note: --include-weights and --exclude-weights cannot be used together\n"); printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@ -300,6 +301,8 @@ int main(int argc, char ** argv) {
} else { } else {
usage(argv[0]); usage(argv[0]);
} }
} else if (strcmp(argv[arg_idx], "--keep-split")) {
params.keep_split = true;
} else { } else {
usage(argv[0]); usage(argv[0]);
} }
@ -332,20 +335,28 @@ int main(int argc, char ** argv) {
std::string fname_out; std::string fname_out;
std::string ftype_str; std::string ftype_str;
std::string suffix = ".gguf";
if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
std::string fpath; std::string fpath;
const size_t pos = fname_inp.find_last_of("/\\"); const size_t pos = fname_inp.find_last_of("/\\");
if (pos != std::string::npos) { if (pos != std::string::npos) {
fpath = fname_inp.substr(0, pos + 1); fpath = fname_inp.substr(0, pos + 1);
} }
// export as [inp path]/ggml-model-[ftype].gguf
fname_out = fpath + "ggml-model-" + ftype_str + ".gguf"; // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
fname_out = fpath + "ggml-model-" + ftype_str;
if (!params.keep_split) {
fname_out += suffix;
}
arg_idx++; arg_idx++;
if (ftype_str == "COPY") { if (ftype_str == "COPY") {
params.only_copy = true; params.only_copy = true;
} }
} else { } else {
fname_out = argv[arg_idx]; fname_out = argv[arg_idx];
if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
}
arg_idx++; arg_idx++;
if (argc <= arg_idx) { if (argc <= arg_idx) {

View file

@ -0,0 +1,65 @@
#!/bin/bash
set -eu
if [ $# -lt 1 ]
then
echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
echo "example: $0 ../../build/bin ../../tmp"
exit 1
fi
if [ $# -gt 1 ]
then
TMP_DIR=$2
else
TMP_DIR=/tmp
fi
set -x
SPLIT=$1/gguf-split
QUANTIZE=$1/quantize
MAIN=$1/main
WORK_PATH=$TMP_DIR/quantize
ROOT_DIR=$(realpath $(dirname $0)/../../)
mkdir -p "$WORK_PATH"
# Clean up in case of previously failed test
rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
# 1. Get a model
(
cd $WORK_PATH
"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
)
echo PASS
# 2. Split model
$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
echo PASS
echo
# 3. Requant model with '--keep_split'
$QUANTIZE --allow-requantize --keep_split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
echo PASS
echo
# 3a. Test the requanted model is loading properly
$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32
echo PASS
echo
# 4. Requant mode without '--keep_split'
$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
echo PASS
echo
# 4b. Test the requanted model is loading properly
$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32
echo PASS
echo
# Clean up
rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf

View file

@ -4,9 +4,8 @@ set -eu
if [ $# -lt 1 ] if [ $# -lt 1 ]
then then
# Start @llama.cpp scenario # Start @llama.cpp scenario
behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
else else
behave "$@" behave "$@"
fi fi

View file

@ -11,6 +11,12 @@
#include <string.h> // memcpy #include <string.h> // memcpy
#include <math.h> // fabsf #include <math.h> // fabsf
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif

View file

@ -14,12 +14,6 @@
#include <stdlib.h> // for qsort #include <stdlib.h> // for qsort
#include <stdio.h> // for GGML_ASSERT #include <stdio.h> // for GGML_ASSERT
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define UNUSED GGML_UNUSED #define UNUSED GGML_UNUSED
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7 // some compilers don't provide _mm256_set_m128i, e.g. gcc 7

12
ggml.c
View file

@ -858,18 +858,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
// simd mappings // simd mappings
// //
#if defined(__ARM_NEON)
#if !defined(__aarch64__)
// 64-bit compatibility
inline static float vaddvq_f32(float32x4_t v) {
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
}
#endif
#endif
// we define a common set of C macros which map to specific intrinsics based on the current architecture // we define a common set of C macros which map to specific intrinsics based on the current architecture
// we then implement the fundamental computation operations below using only these macros // we then implement the fundamental computation operations below using only these macros
// adding support for new architectures requires to define the corresponding SIMD macros // adding support for new architectures requires to define the corresponding SIMD macros

124
llama.cpp
View file

@ -2999,9 +2999,13 @@ struct llama_model_loader {
ggml_tensor * tensor; ggml_tensor * tensor;
llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) { llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
const int tensor_idx = gguf_find_tensor(gguf_ctx, name); const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
}
} }
}; };
std::vector<llama_tensor_weight> weights; std::vector<llama_tensor_weight> weights;
@ -3040,15 +3044,15 @@ struct llama_model_loader {
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name)); llm_kv = LLM_KV(llm_arch_from_string(arch_name));
files.emplace_back(new llama_file(fname.c_str(), "rb"));
contexts.emplace_back(ctx);
// Save tensors data offset of the main file. // Save tensors data offset of the main file.
// For subsidiary files, `meta` tensor data offset must not be used, // For subsidiary files, `meta` tensor data offset must not be used,
// so we build a unified tensors index for weights. // so we build a unified tensors index for weights.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
weights.emplace_back(0, cur->name, meta, cur); weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
} }
files.emplace_back(new llama_file(fname.c_str(), "rb"));
contexts.emplace_back(ctx);
uint16_t n_split = 0; uint16_t n_split = 0;
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
@ -3082,13 +3086,14 @@ struct llama_model_loader {
throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path)); throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
} }
// Save tensors data offset info of the shard.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
weights.emplace_back(idx, cur->name, ctx_gguf, cur);
}
files.emplace_back(new llama_file(split_path, "rb")); files.emplace_back(new llama_file(split_path, "rb"));
contexts.emplace_back(ctx); contexts.emplace_back(ctx);
// Save tensors data offset info of the shard.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
}
gguf_free(ctx_gguf); gguf_free(ctx_gguf);
} }
@ -3297,6 +3302,10 @@ struct llama_model_loader {
return nullptr; return nullptr;
} }
const llama_tensor_weight * get_weight(int i) const {
return get_weight(get_tensor_name(i));
}
const llama_tensor_weight & require_weight(const char * name) const { const llama_tensor_weight & require_weight(const char * name) const {
const llama_tensor_weight * weight = get_weight(name); const llama_tensor_weight * weight = get_weight(name);
if (!weight) { if (!weight) {
@ -14528,26 +14537,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
std::vector<no_init<uint8_t>> work; std::vector<no_init<uint8_t>> work;
std::vector<no_init<float>> f32_conv_buf; std::vector<no_init<float>> f32_conv_buf;
uint16_t n_split = 1;
// Assume split index is continuous
if (params->keep_split) {
for (int i = 0; i < ml.n_tensors; ++i) {
n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
}
}
std::vector<gguf_context*> ctx_outs(n_split, NULL);
ctx_outs[0] = ctx_out;
// populate the original tensors so we get an initial meta data // populate the original tensors so we get an initial meta data
for (int i = 0; i < ml.n_tensors; ++i) { for (int i = 0; i < ml.n_tensors; ++i) {
const struct ggml_tensor * meta = ml.get_tensor_meta(i); auto weight = ml.get_weight(i);
gguf_add_tensor(ctx_out, meta); uint16_t i_split = params->keep_split ? weight->idx : 0;
struct ggml_tensor * tensor = weight->tensor;
if (ctx_outs[i_split] == NULL) {
ctx_outs[i_split] = gguf_init_empty();
}
gguf_add_tensor(ctx_outs[i_split], tensor);
} }
std::ofstream fout(fname_out, std::ios::binary); // Set split info if needed
fout.exceptions(std::ofstream::failbit); // fail fast on write errors if (n_split > 1) {
for (size_t i = 0; i < ctx_outs.size(); ++i) {
gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
}
}
const size_t meta_size = gguf_get_meta_size(ctx_out); int cur_split = -1;
std::ofstream fout;
auto close_ofstream = [&]() {
// Write metadata and close file handler
if (fout.is_open()) {
fout.seekp(0);
std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
gguf_get_meta_data(ctx_outs[cur_split], data.data());
fout.write((const char *) data.data(), data.size());
fout.close();
}
};
auto new_ofstream = [&](int index) {
cur_split = index;
GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
std::string fname = fname_out;
if (params->keep_split) {
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
fname = std::string(split_path);
}
LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size); fout = std::ofstream(fname, std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
// placeholder for the meta data const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
::zeros(fout, meta_size); // placeholder for the meta data
::zeros(fout, meta_size);
};
const auto tn = LLM_TN(model.arch); const auto tn = LLM_TN(model.arch);
new_ofstream(0);
for (int i = 0; i < ml.n_tensors; ++i) { for (int i = 0; i < ml.n_tensors; ++i) {
struct ggml_tensor * tensor = ml.get_tensor_meta(i); auto weight = ml.get_weight(i);
struct ggml_tensor * tensor = weight->tensor;
if (weight->idx != cur_split && params->keep_split) {
close_ofstream();
new_ofstream(weight->idx);
}
const std::string name = ggml_get_name(tensor); const std::string name = ggml_get_name(tensor);
@ -14702,26 +14759,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
total_size_new += new_size; total_size_new += new_size;
// update the gguf meta data as we go // update the gguf meta data as we go
gguf_set_tensor_type(ctx_out, name.c_str(), new_type); gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size); gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
// write tensor data + padding // write tensor data + padding
fout.write((const char *) new_data, new_size); fout.write((const char *) new_data, new_size);
zeros(fout, GGML_PAD(new_size, align) - new_size); zeros(fout, GGML_PAD(new_size, align) - new_size);
} }
close_ofstream();
// go back to beginning of file and write the updated meta data for (auto & c:ctx_outs) {
{ gguf_free(c);
fout.seekp(0);
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
gguf_get_meta_data(ctx_out, data.data());
fout.write((const char *) data.data(), data.size());
} }
fout.close();
gguf_free(ctx_out);
LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
@ -15077,6 +15126,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
/*.quantize_output_tensor =*/ true, /*.quantize_output_tensor =*/ true,
/*.only_copy =*/ false, /*.only_copy =*/ false,
/*.pure =*/ false, /*.pure =*/ false,
/*.keep_split =*/ false,
/*.imatrix =*/ nullptr, /*.imatrix =*/ nullptr,
/*.kv_overrides =*/ nullptr, /*.kv_overrides =*/ nullptr,
}; };
@ -16081,6 +16131,8 @@ struct llama_data_file_context : llama_data_context {
* *
*/ */
static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
llama_synchronize(ctx);
// copy rng // copy rng
{ {
std::ostringstream rng_ss; std::ostringstream rng_ss;
@ -16233,6 +16285,8 @@ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
// Sets the state reading from the specified source address // Sets the state reading from the specified source address
size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) { size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
llama_synchronize(ctx);
const uint8_t * inp = src; const uint8_t * inp = src;
// set rng // set rng
@ -16537,6 +16591,8 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
} }
static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) { static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
llama_synchronize(ctx);
const auto & kv_self = ctx->kv_self; const auto & kv_self = ctx->kv_self;
GGML_ASSERT(!kv_self.recurrent); // not implemented GGML_ASSERT(!kv_self.recurrent); // not implemented
@ -16654,6 +16710,8 @@ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_s
} }
size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) { size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
llama_synchronize(ctx);
auto & kv_self = ctx->kv_self; auto & kv_self = ctx->kv_self;
GGML_ASSERT(!kv_self.recurrent); // not implemented GGML_ASSERT(!kv_self.recurrent); // not implemented

View file

@ -288,6 +288,7 @@ extern "C" {
bool quantize_output_tensor; // quantize output.weight bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type bool pure; // quantize all tensors to the default type
bool keep_split; // quantize to the same number of shards
void * imatrix; // pointer to importance matrix data void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides void * kv_overrides; // pointer to vector containing overrides
} llama_model_quantize_params; } llama_model_quantize_params;