Merge 'origin/master' into clfixes

commit e4640eec70

10 changed files with 110 additions and 58 deletions
@@ -1,6 +1,7 @@
-#include <locale.h>
 #include "ggml.h"
 #include "build-info.h"
+
+#include <locale.h>
 #include <assert.h>
 #include <math.h>
 #include <cstring>
@@ -578,6 +578,37 @@ void console_set_color(console_state & con_st, console_color_t color) {
 }
 
 char32_t getchar32() {
+#if defined(_WIN32)
+    HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
+    wchar_t high_surrogate = 0;
+
+    while (true) {
+        INPUT_RECORD record;
+        DWORD count;
+        if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
+            return WEOF;
+        }
+
+        if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
+            wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
+            if (wc == 0) {
+                continue;
+            }
+
+            if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
+                high_surrogate = wc;
+                continue;
+            } else if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
+                if (high_surrogate != 0) { // Check if we have a high surrogate
+                    return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
+                }
+            }
+
+            high_surrogate = 0; // Reset the high surrogate
+            return static_cast<char32_t>(wc);
+        }
+    }
+#else
     wchar_t wc = getwchar();
     if (static_cast<wint_t>(wc) == WEOF) {
         return WEOF;
@@ -596,6 +627,7 @@ char32_t getchar32() {
 #endif
 
     return static_cast<char32_t>(wc);
+#endif
 }
 
 void pop_cursor(console_state & con_st) {
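Note: the new Windows branch of getchar32() reads console key events directly and reassembles UTF-16 surrogate pairs into a single code point via ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000. A minimal standalone sketch of that combining step follows; the helper name and the sample pair are illustrative, not part of the diff.

    #include <cassert>

    // Combine a UTF-16 high/low surrogate pair into a Unicode code point,
    // mirroring the expression used in the new getchar32() Windows path.
    static char32_t combine_surrogates(char16_t high, char16_t low) {
        return ((char32_t(high) - 0xD800) << 10) + (char32_t(low) - 0xDC00) + 0x10000;
    }

    int main() {
        // U+1F600 is encoded in UTF-16 as the surrogate pair 0xD83D 0xDE00.
        assert(combine_surrogates(0xD83D, 0xDE00) == 0x1F600);
        return 0;
    }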
@@ -31,6 +31,8 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
+    llama_init_backend();
+
     llama_context * ctx;
 
     // load the model
@@ -96,8 +96,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    // params.prompt = R"(// this function checks if the number n is prime
-    //bool is_prime(int n) {)";
+    llama_init_backend();
 
     llama_context * ctx;
     g_ctx = &ctx;
@@ -143,6 +143,8 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
+    llama_init_backend();
+
     llama_context * ctx;
 
     // load the model and apply lora adapter, if any
@@ -1,7 +1,7 @@
-#include "ggml.h"
-#include "llama.h"
 #include "build-info.h"
 
+#include "llama.h"
+
 #include <cstdio>
 #include <map>
 #include <string>
@@ -42,8 +42,6 @@ bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::st
 // ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
 int main(int argc, char ** argv) {
-    ggml_time_init();
-
     if (argc < 3) {
         fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
         for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
@@ -52,12 +50,7 @@ int main(int argc, char ** argv) {
             return 1;
         }
 
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
+    llama_init_backend();
 
     // parse command line arguments
     const std::string fname_inp = argv[1];
@@ -116,25 +109,25 @@ int main(int argc, char ** argv) {
     }
     fprintf(stderr, "\n");
 
-    const int64_t t_main_start_us = ggml_time_us();
+    const int64_t t_main_start_us = llama_time_us();
 
     int64_t t_quantize_us = 0;
 
     // load the model
     {
-        const int64_t t_start_us = ggml_time_us();
+        const int64_t t_start_us = llama_time_us();
 
         if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }
 
-        t_quantize_us = ggml_time_us() - t_start_us;
+        t_quantize_us = llama_time_us() - t_start_us;
     }
 
     // report timing
     {
-        const int64_t t_main_end_us = ggml_time_us();
+        const int64_t t_main_end_us = llama_time_us();
 
         printf("\n");
         printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
ggml.c (6 lines changed)

@@ -512,7 +512,7 @@ static inline int hsum_i32_4(const __m128i a) {
     return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
 }
 
-#if __AVX2__ || __AVX512F__
+#if defined(__AVX2__) || defined(__AVX512F__)
 // spread 32 bits to 32 bytes { 0x00, 0xFF }
 static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     uint32_t x32;
@@ -688,7 +688,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // __AVX__ || __AVX2__ || __AVX512F__
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 
-#if __ARM_NEON
+#if defined(__ARM_NEON)
 
 #if !defined(__aarch64__)
 
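Note: the ggml.c preprocessor tests are rewritten from the form #if __AVX2__ to #if defined(__AVX2__). Evaluating an identifier that was never defined inside #if silently substitutes 0, and compilers can warn about it (for example GCC/Clang with -Wundef); the defined() form makes the feature test explicit. A small illustration with an arbitrary macro name:

    // With -Wundef enabled, this warns if FEATURE_X was never defined,
    // because the undefined identifier is silently replaced by 0.
    #if FEATURE_X
    void feature_x_impl(void);
    #endif

    // The explicit form never evaluates an undefined identifier.
    #if defined(FEATURE_X)
    void feature_x_impl(void);
    #endif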
@@ -2481,7 +2481,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
             sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
         }
 
-        sumf += (GGML_FP16_TO_FP32(x[i]).d*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
     }
 
     *s = sumf;
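Note: the change in ggml_vec_dot_q4_1_q8_1 fixes a misplaced parenthesis in the scalar path: the fp16-to-fp32 conversion must be applied to the scale field x[i].d, not to the whole block x[i]. Per block the accumulation is d_x*d_y*sumi + m_x*s_y, where sumi is the integer dot product of the quantized values, d and m are the Q4_1 scale and minimum, and s_y is the precomputed d_y*sum(q_y) of the Q8_1 block. A simplified sketch of that per-block math; the struct layouts here are illustrative (real ggml blocks store the scales as fp16 and pack two 4-bit quants per byte):

    #include <cstdint>

    struct block_q4_1_f { float d; float m; uint8_t q[32]; }; // value_j = d*q[j] + m
    struct block_q8_1_f { float d; float s; int8_t  q[32]; }; // s = d * sum(q)

    static float dot_one_block(const block_q4_1_f & x, const block_q8_1_f & y) {
        int sumi = 0;
        for (int j = 0; j < 32; ++j) {
            sumi += x.q[j] * y.q[j];
        }
        // sum_j (d_x*q_x[j] + m_x) * (d_y*q_y[j]) = d_x*d_y*sumi + m_x*s_y
        return (x.d * y.d) * sumi + x.m * y.s;
    }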
llama-util.h (40 lines changed)

@@ -101,12 +101,12 @@ struct llama_file {
         LLAMA_ASSERT(ret == 0); // same
     }
 
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
+        std::size_t ret = std::fread(ptr, len, 1, fp);
         if (ferror(fp)) {
             throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
@@ -127,12 +127,12 @@ struct llama_file {
         return std::string(chars.data(), len);
     }
 
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
+        size_t ret = std::fwrite(ptr, len, 1, fp);
         if (ret != 1) {
             throw std::runtime_error(format("write error: %s", strerror(errno)));
         }
@@ -267,9 +267,9 @@ struct llama_mlock {
         }
     }
 
-    void init(void * addr) {
-        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
-        this->addr = addr;
+    void init(void * ptr) {
+        LLAMA_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
     }
 
     void grow_to(size_t target_size) {
@@ -340,14 +340,14 @@ struct llama_mlock {
         return (size_t) si.dwPageSize;
     }
 
-    bool raw_lock(void * addr, size_t size) {
+    bool raw_lock(void * ptr, size_t len) {
         for (int tries = 1; ; tries++) {
-            if (VirtualLock(addr, size)) {
+            if (VirtualLock(ptr, len)) {
                 return true;
             }
             if (tries == 2) {
                 fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                    size, this->size, llama_format_win_err(GetLastError()).c_str());
+                    len, size, llama_format_win_err(GetLastError()).c_str());
                 return false;
             }
 
@@ -363,7 +363,7 @@ struct llama_mlock {
             // is equal to the number of pages in its minimum working set minus
             // a small overhead."
             // Hopefully a megabyte is enough overhead:
-            size_t increment = size + 1048576;
+            size_t increment = len + 1048576;
             // The minimum must be <= the maximum, so we need to increase both:
             min_ws_size += increment;
             max_ws_size += increment;
@@ -375,8 +375,8 @@ struct llama_mlock {
         }
     }
 
-    void raw_unlock(void * addr, size_t size) {
-        if (!VirtualUnlock(addr, size)) {
+    void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
             fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                 llama_format_win_err(GetLastError()).c_str());
         }
@@ -388,12 +388,12 @@ struct llama_mlock {
         return (size_t) 65536;
     }
 
-    bool raw_lock(const void * addr, size_t size) {
+    bool raw_lock(const void * addr, size_t len) {
         fprintf(stderr, "warning: mlock not supported on this system\n");
         return false;
     }
 
-    void raw_unlock(const void * addr, size_t size) {}
+    void raw_unlock(const void * addr, size_t len) {}
 #endif
 };
 
@@ -404,10 +404,10 @@ struct llama_buffer {
 
     llama_buffer() = default;
 
-    void resize(size_t size) {
+    void resize(size_t len) {
         delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
+        addr = new uint8_t[len];
+        size = len;
     }
 
     ~llama_buffer() {
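Note: the llama-util.h changes rename function parameters (addr to ptr, size to len) so they no longer shadow the addr and size members of llama_mlock and llama_buffer, which is why the old code needed this-> qualifiers. A minimal illustration of the pitfall being removed, using a hypothetical class rather than the real ones:

    #include <cstddef>

    struct buffer {
        unsigned char * addr = nullptr;
        size_t size = 0;

        // The parameter named 'size' shadows the member: every unqualified use
        // refers to the parameter, so the member is only reachable as this->size.
        void resize_shadowed(size_t size) {
            this->size = size; // forgetting 'this->' would silently self-assign
        }

        // Renamed parameter: no shadowing, no this-> needed, and -Wshadow stays quiet.
        void resize(size_t len) {
            size = len;
        }
    };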
llama.cpp (26 lines changed)

@@ -45,6 +45,7 @@ enum e_model {
     MODEL_65B,
 };
 
+
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
@@ -110,7 +111,7 @@ struct llama_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
     bool operator!=(const llama_hparams & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams));
+        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
     }
 };
 
@@ -502,7 +503,7 @@ struct llama_file_loader {
 
         if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
             // skip to the next multiple of 32 bytes
-            file.seek(-file.tell() & 31, SEEK_CUR);
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
         }
         shard.file_idx = file_idx;
         shard.file_off = file.tell();
@@ -577,7 +578,7 @@ struct llama_file_saver {
         file.write_u32(new_type);
         file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
         file.write_raw(tensor.name.data(), tensor.name.size());
-        file.seek(-file.tell() & 31, SEEK_CUR);
+        file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
         LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
         file.write_raw(new_data, new_size);
     }
@@ -838,6 +839,21 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
+void llama_init_backend() {
+    ggml_time_init();
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+}
+
+int64_t llama_time_us() {
+    return ggml_time_us();
+}
+
 //
 // model loading
 //
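Note: the new llama_init_backend() wraps ggml_time_init() and the one-off f16 table setup that quantize.cpp previously open-coded, and llama_time_us() wraps ggml_time_us(); the examples above are updated to call them. A minimal sketch of the intended call order, assuming the pre-existing llama_context_default_params(), llama_init_from_file() and llama_free() API from llama.h, which is not part of this diff:

    #include "llama.h"

    #include <cstdint>
    #include <cstdio>

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s model.bin\n", argv[0]);
            return 1;
        }

        llama_init_backend();                       // once, at program start

        const int64_t t_start_us = llama_time_us(); // timing via the new wrapper

        llama_context_params params = llama_context_default_params();
        llama_context * ctx = llama_init_from_file(argv[1], params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model '%s'\n", argv[1]);
            return 1;
        }

        // ... tokenize, evaluate and sample here ...

        fprintf(stderr, "total time = %8.2f ms\n", (llama_time_us() - t_start_us) / 1000.0);
        llama_free(ctx);
        return 0;
    }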
@@ -2618,8 +2634,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 }
 
 // Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
-    const uint8_t * inp = src;
+size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+    uint8_t * inp = src;
 
     // set rng
     {
llama.h (11 lines changed)

@@ -79,7 +79,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
         // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
@@ -90,6 +90,13 @@ extern "C" {
     LLAMA_API bool llama_mmap_supported();
     LLAMA_API bool llama_mlock_supported();
 
+    // TODO: not great API - very likely to change
+    // Initialize the llama + ggml backend
+    // Call once at the start of the program
+    LLAMA_API void llama_init_backend();
+
+    LLAMA_API int64_t llama_time_us();
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
@@ -138,7 +145,7 @@ extern "C" {
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
 
     // Save/load session file
     LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);