Fix memory bugs in loading code

This change hardens the C++ code that loads the GGML file format.
People download weights off the Internet to run inference on trained
models, and since weights are data rather than code (they contain no
graph definitions), it is reasonable to expect that loading them is
safe. This change therefore addresses many weaknesses in the previous
loading code that allowed untrustworthy weights to trigger undefined
behavior through unchecked reads and memory accesses. I haven't
investigated whether any of these weaknesses are actually exploitable,
but once this is merged it will be considerably harder for that to
happen, which should let our users share model files more freely and
safely.
Justine Tunney 2023-03-30 19:43:41 -07:00
parent ee0c40dd6d
commit fed6b5da76
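In outline, the hardening pattern is: every fin.read() goes through a
helper that checks the stream state and fails loudly on truncation or
I/O error, tensor dimension counts and ftype values are validated
before use, and offsets into the memory-mapped file are bounds-checked
against the mapping length. The snippet below is a minimal standalone
sketch of that checked-read idea, for orientation only; the actual
helpers added by this commit (read_impl(), check_ios_error(),
read_int32(), read_buf(), read_float()) appear in the diff that
follows.

// Sketch of the checked-read pattern (not the exact code in the diff):
// read a fixed number of bytes, then verify the stream before trusting
// the result, so a truncated or hostile file causes a clean failure
// instead of leaving values uninitialized.
#include <cerrno>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <istream>

static bool checked_read(std::istream &fin, char *buf, std::streamsize len,
                         const char *what) {
    fin.read(buf, len);
    if (fin.good()) return true;
    if (fin.eof()) {
        fprintf(stderr, "%s failed: unexpected end of file\n", what);
    } else {
        fprintf(stderr, "%s failed: %s\n", what, strerror(errno));
    }
    return false;
}

static bool checked_read_int32(std::istream &fin, int32_t *out) {
    return checked_read(fin, (char *)out, sizeof(*out), "read_int32()");
}

A caller then writes "if (!checked_read_int32(fin, &value)) return false;"
for each field, which is the shape the hunks below take with read_int32().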

llama.cpp (274 changed lines)

@@ -11,11 +11,15 @@
#include <regex>
#include <cassert>
#include <cstring>
#include <cerrno>
#include <climits>
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#else
#endif
#if !defined(_WIN32) || defined(_POSIX_MAPPED_FILES)
#include <sys/types.h>
#include <sys/mman.h>
#include <unistd.h>
@@ -36,7 +40,6 @@
} \
} while (0)
// determine number of model parts based on the dimension
static const std::unordered_map<int, int> LLAMA_N_PARTS = {
{ 4096, 1 },
@@ -156,8 +159,8 @@ struct llama_model {
std::vector<uint8_t> buf;
// model memory mapped file
void * mm_addr = NULL;
uint64_t mm_length = 0;
void *mm_addr = NULL;
int64_t mm_length = 0;
// tensors
int n_loaded;
@@ -303,11 +306,31 @@ struct llama_context_params llama_context_default_params() {
return result;
}
//
// error reporting
//
#ifdef _WIN32
static int WinStrerror(int err, char *buf, int size) {
return FormatMessageA(
FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
buf, size, NULL);
}
static void LogWindowsError(const char *file, int line, const char *thing) {
#define LogWindowsError(thing) LogWindowsError(__FILE__, __LINE__, thing)
char s[256];
int e = GetLastError();
WinStrerror(e, s, sizeof(s));
fprintf(stderr, "%s:%d: error[%#x]: %s failed: %s\n", file, line, e, thing, s);
}
#endif // _WIN32
//
// model loading
//
static void *mmap_file(const char *fname, uint64_t *mm_length) {
static void *mmap_file(const char *fname, int64_t *mm_length) {
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
HANDLE hFile = CreateFileA(fname,
GENERIC_READ,
@@ -316,17 +339,26 @@ static void *mmap_file(const char *fname, uint64_t *mm_length) {
OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
NULL);
if (hFile == INVALID_HANDLE_VALUE) return 0;
if (hFile == INVALID_HANDLE_VALUE) {
LogWindowsError("CreateFileA");
return 0;
}
LARGE_INTEGER fileSize;
fileSize.QuadPart = -1;
GetFileSizeEx(hFile, &fileSize);
int64_t length = fileSize.QuadPart;
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
CloseHandle(hFile);
if (!hMapping) return 0;
if (!hMapping) {
LogWindowsError("CreateFileMappingA");
return 0;
}
void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
CloseHandle(hMapping);
if (!addr) return 0;
if (!addr) {
LogWindowsError("MapViewOfFile");
return 0;
}
#else
int fd = open(fname, O_RDONLY);
if (fd == -1) return 0;
@@ -339,7 +371,7 @@ static void *mmap_file(const char *fname, uint64_t *mm_length) {
return addr;
}
static void munmap_file(void * addr, size_t length) {
static void munmap_file(void * addr, uint64_t length) {
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
UnmapViewOfFile(addr);
#else
@@ -359,6 +391,46 @@ static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
return false;
}
static bool check_n_dims(int32_t n_dims) {
if (n_dims == 1) return true;
if (n_dims == 2) return true;
fprintf(stderr,
"%s: unsupported number of dimensions in tensor: %" PRId32 "\n",
__func__, n_dims);
return false;
}
static bool check_ios_error(std::ios &ios, const char *thing) {
if (ios.good()) {
return true;
}
if (ios.bad()) {
fprintf(stderr, "%s failed: system error: %s\n", thing, strerror(errno));
} else if (ios.eof()) {
fprintf(stderr, "%s failed: unexpected end of file\n", thing);
} else if (ios.fail()) {
fprintf(stderr, "%s failed: illogical operation\n", thing);
}
return false;
}
static bool read_impl(std::istream &fin, char *buf, std::streamsize len, const char *thing) {
fin.read(buf, len);
return check_ios_error(fin, thing);
}
static bool read_buf(std::istream &fin, char *buf, std::streamsize len) {
return read_impl(fin, buf, len, "read_buf()");
}
static bool read_int32(std::istream &fin, int32_t *buf) {
return read_impl(fin, (char *)buf, sizeof(int32_t), "read_int32()");
}
static bool read_float(std::istream &fin, float *buf) {
return read_impl(fin, (char *)buf, sizeof(float), "read_float()");
}
static bool llama_model_load(
const std::string & fname,
llama_context & lctx,
@@ -385,13 +457,13 @@ static bool llama_model_load(
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
fin.seekg(0, fin.end);
const size_t file_size = fin.tellg();
const int64_t file_size = fin.tellg();
fin.seekg(0);
// verify magic
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
int32_t magic;
if (!read_int32(fin, &magic)) return false;
if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n",
__func__, fname.c_str());
@@ -401,9 +473,8 @@ static bool llama_model_load(
return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
}
uint32_t format_version;
fin.read((char *) &format_version, sizeof(format_version));
int32_t format_version;
if (!read_int32(fin, &format_version)) return false;
if (format_version != LLAMA_FILE_VERSION) {
fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
__func__, fname.c_str(), format_version, LLAMA_FILE_VERSION);
@@ -417,14 +488,13 @@ static bool llama_model_load(
{
auto & hparams = model.hparams;
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
//fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fin.read((char *) &hparams.f16, sizeof(hparams.f16));
if (!read_int32(fin, &hparams.n_vocab)) return false;
if (!read_int32(fin, &hparams.n_embd)) return false;
if (!read_int32(fin, &hparams.n_mult)) return false;
if (!read_int32(fin, &hparams.n_head)) return false;
if (!read_int32(fin, &hparams.n_layer)) return false;
if (!read_int32(fin, &hparams.n_rot)) return false;
if (!read_int32(fin, &hparams.f16)) return false;
hparams.n_ctx = n_ctx;
@@ -476,20 +546,20 @@ static bool llama_model_load(
std::vector<char> tmp(64);
for (int i = 0; i < model.hparams.n_vocab; i++) {
uint32_t len;
fin.read((char *) &len, sizeof(len));
int32_t len;
if (!read_int32(fin, &len)) return false;
word.resize(len);
if (len > 0) {
tmp.resize(len);
fin.read(tmp.data(), len);
if (!read_buf(fin, tmp.data(), len)) return false;
word.assign(tmp.data(), len);
} else {
word.clear();
}
float score;
fin.read((char *) &score, sizeof(score));
if (!read_float(fin, &score)) return false;
vocab.token_to_id[word] = i;
@@ -513,12 +583,11 @@ static bool llama_model_load(
case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
default:
{
fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
__func__, fname.c_str(), model.hparams.f16);
return false;
}
default: {
fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
__func__, fname.c_str(), model.hparams.f16);
return false;
}
}
// map model into memory
@@ -546,7 +615,7 @@ static bool llama_model_load(
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
// this is the total memory required to run the inference
const size_t mem_required =
const int64_t mem_required =
ctx_size +
model.mm_length +
MEM_REQ_SCRATCH0.at(model.type) +
@@ -554,7 +623,7 @@ static bool llama_model_load(
MEM_REQ_EVAL.at (model.type);
// this is the memory required by one llama_state
const size_t mem_required_state =
const int64_t mem_required_state =
scale*MEM_REQ_KV_SELF.at(model.type);
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
@@ -641,7 +710,7 @@ static bool llama_model_load(
// load weights
{
size_t total_size = 0;
int64_t total_size = 0;
model.n_loaded = 0;
while (true) {
@@ -649,23 +718,35 @@ static bool llama_model_load(
int32_t length;
int32_t ftype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
if (fin.eof()) {
break;
}
fin.read((char *)&n_dims, 4);
if (fin.eof()) break;
if (!fin.good()) return false;
if (!read_int32(fin, &length)) return false;
if (!read_int32(fin, &ftype)) return false;
int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
if (!check_n_dims(n_dims)) return false;
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
if (!read_int32(fin, &ne[i])) return false;
nelements *= ne[i];
}
switch (ftype) {
case 0: // f32
case 1: // f16
break;
case 2: // q4_0
case 3: // q4_1
LLAMA_ASSERT(ne[0] % 64 == 0);
break;
default:
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
return false;
}
std::string name(length, 0);
fin.read(&name[0], length);
if (!read_buf(fin, &name[0], length)) return false;
if (model.tensors.find(name.data()) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
@@ -688,23 +769,13 @@ static bool llama_model_load(
fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
}
switch (ftype) {
case 0: // f32
case 1: // f16
break;
case 2: // q4_0
case 3: // q4_1
assert(ne[0] % 64 == 0);
break;
default:
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
return false;
};
// load the tensor data into memory without copying or reading it
size_t offset = fin.tellg();
size_t tensor_data_size = ggml_nbytes(tensor);
int64_t offset = fin.tellg();
int64_t tensor_data_size = ggml_nbytes(tensor);
offset = (offset + 31) & -32;
LLAMA_ASSERT(0 <= offset && offset < model.mm_length);
LLAMA_ASSERT(0 <= tensor_data_size && tensor_data_size <= model.mm_length);
LLAMA_ASSERT(offset + tensor_data_size <= model.mm_length);
tensor->data = mm_addr + offset;
fin.seekg(offset + tensor_data_size);
total_size += tensor_data_size;
@@ -712,13 +783,11 @@ static bool llama_model_load(
// progress
if (progress_callback) {
double current_progress = size_t(fin.tellg()) / double(file_size);
double current_progress = int64_t(fin.tellg()) / double(file_size);
progress_callback(current_progress, progress_callback_user_data);
}
}
fin.close();
fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
if (model.n_loaded == 0) {
fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
@@ -1305,8 +1374,8 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
// verify magic
{
uint32_t magic;
finp.read((char *) &magic, sizeof(magic));
int32_t magic;
if (!read_int32(finp, &magic)) return false;
if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
__func__, fname_inp.c_str());
@@ -1318,9 +1387,8 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
fout.write((char *) &magic, sizeof(magic));
uint32_t format_version;
finp.read((char *) &format_version, sizeof(format_version));
int32_t format_version;
if (!read_int32(finp, &format_version)) return false;
if (format_version != LLAMA_FILE_VERSION) {
fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
__func__, fname_inp.c_str(), format_version, LLAMA_FILE_VERSION);
@@ -1334,14 +1402,14 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
// load hparams
{
finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
//finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
finp.read((char *) &hparams.f16, sizeof(hparams.f16));
if (!read_int32(finp, &hparams.n_vocab)) return false;
//if (!read_int32(finp, &hparams.n_ctx)) return false;
if (!read_int32(finp, &hparams.n_embd)) return false;
if (!read_int32(finp, &hparams.n_mult)) return false;
if (!read_int32(finp, &hparams.n_head)) return false;
if (!read_int32(finp, &hparams.n_layer)) return false;
if (!read_int32(finp, &hparams.n_rot)) return false;
if (!read_int32(finp, &hparams.f16)) return false;
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@@ -1374,16 +1442,16 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
std::vector<char> word(32);
vocab.id_to_token.resize(n_vocab);
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
finp.read ((char *) &len, sizeof(len));
int32_t len;
if (!read_int32(finp, &len)) return false;
fout.write((char *) &len, sizeof(len));
word.resize(len);
finp.read ((char *) &word[0], len);
if (!read_buf(finp, (char *)&word[0], len)) return false;
fout.write((char *) &word[0], len);
float score;
finp.read ((char *) &score, sizeof(score));
if (!read_float(finp, &score)) return false;
fout.write((char *) &score, sizeof(score));
vocab.token_to_id[word.data()] = i;
@@ -1412,33 +1480,33 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
int32_t length;
int32_t ftype;
finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
finp.read(reinterpret_cast<char *>(&length), sizeof(length));
finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
if (finp.eof()) {
break;
}
finp.read((char *)&n_dims, 4);
if (finp.eof()) break;
if (!finp.good()) return false;
if (!read_int32(finp, &length)) return false;
if (!read_int32(finp, &ftype)) return false;
int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
if (!check_n_dims(n_dims)) return false;
for (int i = 0; i < n_dims; ++i) {
finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
if (!read_int32(finp, &ne[i])) return false;
nelements *= ne[i];
}
std::string name(length, 0);
finp.read (&name[0], length);
if (!read_buf(finp, &name[0], length)) return false;
{
// ensure tensor data is aligned
uint64_t offset = finp.tellg();
int64_t offset = finp.tellg();
offset = (offset + 31) & -32;
finp.seekg(offset);
}
{
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
LLAMA_ASSERT(0 <= ftype && ftype < 4);
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
}
@@ -1466,14 +1534,14 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
if (ftype == 1) {
data_f16.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
if (!read_buf(finp, (char *)&data_f16[0], nelements * sizeof(ggml_fp16_t))) return false;
data_f32.resize(nelements);
for (int i = 0; i < nelements; ++i) {
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
}
} else {
data_f32.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
if (!read_buf(finp, (char *)&data_f32[0], nelements * sizeof(float))) return false;
}
ftype = itype;
@@ -1481,7 +1549,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
data_u8.resize(nelements*bpe);
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
if (!read_buf(finp, (char *)&data_u8[0], nelements * bpe)) return false;
}
fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
@@ -1508,18 +1576,14 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
switch (type) {
case GGML_TYPE_Q4_0:
{
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
break;
case GGML_TYPE_Q4_1:
{
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
break;
default:
{
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
return false;
}
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
return false;
}
fout.write(reinterpret_cast<char *>(work.data()), cur_size);
@@ -1563,7 +1627,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
finp.close();
fout.close();
return true;
return !fout.bad();
}
//