Merge 'origin/master' into hipblas

Commit 68e79cc134

9 changed files with 1018 additions and 954 deletions
@@ -16,6 +16,7 @@ Command line options:
 - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
+- `--numa`: Attempt optimizations that help on some NUMA systems.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
 - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
File diff suppressed because it is too large
@@ -167,7 +167,7 @@
       mirostat: 0, // 0/1/2
       mirostat_tau: 5, // target entropy
       mirostat_eta: 0.1, // learning rate
-      grammar: null,
+      grammar: '',
     })

 const llamaStats = signal(null)
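For context on the `mirostat_*` parameters above: Mirostat keeps the observed surprise of sampled tokens near the target `mirostat_tau` by adjusting an internal cutoff `mu` after every token, at learning rate `mirostat_eta`. A minimal sketch of that feedback step, assuming surprise is measured in bits as in llama.cpp's sampler; the helper name is illustrative, not part of this diff:

```cpp
#include <math.h>

// Sketch of the Mirostat feedback update that mirostat_tau and mirostat_eta
// configure (mirostat_update_mu is a hypothetical helper, not from the diff):
// the surprise of each sampled token is pulled toward the target tau by
// adjusting the dynamic cutoff mu at learning rate eta.
static float mirostat_update_mu(float mu, float token_prob, float tau, float eta) {
    const float observed_surprise = -log2f(token_prob); // surprise of the token just sampled, in bits
    const float error = observed_surprise - tau;        // deviation from the target entropy
    return mu - eta * error;                            // feedback step toward the target
}
```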
@@ -15,6 +15,7 @@
 #include "index.html.hpp"
 #include "index.js.hpp"
 #include "completion.js.hpp"
+#include "json-schema-to-grammar.mjs.hpp"

 #ifndef SERVER_VERBOSE
 #define SERVER_VERBOSE 1
@@ -666,6 +667,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     {
         fprintf(stdout, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+    fprintf(stdout, "  --numa                attempt optimizations that help on some NUMA systems\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stdout, "  -ngl N, --n-gpu-layers N\n");
     fprintf(stdout, "                        number of layers to store in VRAM\n");
@@ -940,6 +942,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         {
             params.use_mmap = false;
         }
+        else if (arg == "--numa")
+        {
+            params.numa = true;
+        }
         else if (arg == "--embedding")
         {
             params.embedding = true;
@@ -1213,6 +1219,12 @@ int main(int argc, char **argv)
                 res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript");
                 return false; });

+    // this is only called if no index.html is found in the public --path
+    svr.Get("/json-schema-to-grammar.mjs", [](const Request &, Response &res)
+            {
+                res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript");
+                return false; });
+
     svr.Post("/completion", [&llama](const Request &req, Response &res)
             {
                 auto lock = llama.lock();
ggml-cuda.cu (94 changed lines)
@@ -1854,7 +1854,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
 }

 // contiguous u/y values
-// also used for q5_K
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
     const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
@@ -1864,19 +1863,18 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     float sumf_m = 0.0f;

 #pragma unroll
-    for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) {
+    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
         int sumi_d = 0;

 #pragma unroll
-        for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) {
-            sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product
-            sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
         }

-        const float2 ds8f = __half22float2(ds8[i0 / 4]);
+        const float2 ds8f = __half22float2(ds8[i]);

-        sumf_d += ds8f.x * (sc[i0/4] * sumi_d);
-        sumf_m += ds8f.y * m[i0/4]; // sum of q8_1 block * q4_K min val
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
     }

     const float2 dm4f = __half22float2(dm4);
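The rewritten inner loop above extracts one 4-bit nibble plane per outer iteration (`(v[j] >> (4*i)) & 0x0F0F0F0F`) and hands it to `__dp4a`. For reference, a sketch of that intrinsic's semantics; the helper below is illustrative and not part of the diff:

```cuda
// Reference model of what __dp4a(a, b, c) computes in the loops above: each
// 32-bit operand is treated as four packed signed 8-bit lanes, multiplied
// lane-wise, and accumulated into c in a single instruction.
static __device__ __host__ int dp4a_reference(int a, int b, int c) {
    const signed char * va = (const signed char *) &a; // four 8-bit lanes of a
    const signed char * vb = (const signed char *) &b; // four 8-bit lanes of b
    for (int k = 0; k < 4; ++k) {
        c += va[k] * vb[k];
    }
    return c;
}
```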
@@ -1893,7 +1891,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
 #define VDR_Q5_K_Q8_1_MMQ 8

 // contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
     const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
@@ -1930,6 +1928,40 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i]);
+
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
 #define VDR_Q6_K_Q8_1_MMVQ 1
 #define VDR_Q6_K_Q8_1_MMQ 8
@@ -2925,18 +2957,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
-        v[l + 0] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
-        v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
-    }
-
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

     const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
-    return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
+    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
@@ -2983,7 +3008,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
         u[2*i+1] = q8[4];
     }

-    return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
+    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);

 #else
@@ -3126,7 +3151,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(

     const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
     const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
-    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
+    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -3402,7 +3428,11 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
 #define MMQ_Y_Q4_1_PASCAL 64
 #define NWARPS_Q4_1_PASCAL 8

-template <bool need_check> static __global__ void mul_mat_q4_1(
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q4_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
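The q4_1 kernel above, and the q3_K, q4_K, and q6_K kernels in the hunks that follow, all gain the same conditional `__launch_bounds__` annotation: on pre-Turing targets it declares the maximum block size and a minimum of 2 resident blocks per SM, letting the compiler cap register usage accordingly. A stripped-down sketch of the pattern, with placeholder names; the constants stand in for the ones in ggml-cuda.cu and their values here are illustrative:

```cuda
#define WARP_SIZE 32   // placeholder, mirrors ggml-cuda.cu's constant
#define NWARPS    8    // placeholder for the per-kernel NWARPS_*_PASCAL value
#define CC_TURING 700  // placeholder compute-capability threshold

template <bool need_check> static __global__ void
#if __CUDA_ARCH__ < CC_TURING
    // max WARP_SIZE*NWARPS threads per block, at least 2 blocks per SM:
    // constrains register allocation on register-starved Pascal parts
    __launch_bounds__(WARP_SIZE*NWARPS, 2)
#endif // __CUDA_ARCH__ < CC_TURING
    example_kernel(float * dst) {
    if (need_check && dst == nullptr) {
        return; // out-of-bounds guard, compiled in only when requested
    }
    dst[threadIdx.x] = 0.0f;
}
```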
@@ -3572,7 +3602,11 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
 #define MMQ_Y_Q3_K_PASCAL 64
 #define NWARPS_Q3_K_PASCAL 8

-template <bool need_check> static __global__ void mul_mat_q3_K(
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q3_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3602,11 +3636,15 @@ template <bool need_check> static __global__ void mul_mat_q3_K(
 #define MMQ_X_Q4_K_AMPERE 64
 #define MMQ_Y_Q4_K_AMPERE 128
 #define NWARPS_Q4_K_AMPERE 4
-#define MMQ_X_Q4_K_PASCAL 32
+#define MMQ_X_Q4_K_PASCAL 64
 #define MMQ_Y_Q4_K_PASCAL 64
 #define NWARPS_Q4_K_PASCAL 8

-template <bool need_check> static __global__ void mul_mat_q4_K(
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q4_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3670,11 +3708,15 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
 #define MMQ_X_Q6_K_AMPERE 64
 #define MMQ_Y_Q6_K_AMPERE 64
 #define NWARPS_Q6_K_AMPERE 4
-#define MMQ_X_Q6_K_PASCAL 32
+#define MMQ_X_Q6_K_PASCAL 64
 #define MMQ_Y_Q6_K_PASCAL 64
 #define NWARPS_Q6_K_PASCAL 8

-template <bool need_check> static __global__ void mul_mat_q6_K(
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q6_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -126,7 +126,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
         if (error) {
             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
-            exit(1);
+            return NULL;
         }
     }
 #else

@@ -144,7 +144,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
         if (error) {
             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
-            exit(1);
+            return NULL;
         }

 #ifdef GGML_QKK_64

@@ -156,7 +156,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #endif
         if (error) {
             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
-            exit(1);
+            return NULL;
         }
     }
 #endif
@@ -3337,6 +3337,12 @@ struct llama_context * llama_new_context_with_model(
         // this allocates all Metal resources and memory buffers
         ctx->ctx_metal = ggml_metal_init(1);

+        if (!ctx->ctx_metal) {
+            LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
+            llama_free(ctx);
+            return NULL;
+        }
+
         void * data_ptr = NULL;
         size_t data_size = 0;
llama.h (2 changed lines)
@@ -97,7 +97,7 @@ extern "C" {
    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
    // if it exists.
    // It might not exist for progress report where '.' is output repeatedly.
-    typedef void (*llama_log_callback)(llama_log_level level, const char * text, void * user_data);
+    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);

     struct llama_context_params {
         uint32_t seed; // RNG seed, -1 for random
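Adding the `enum` keyword makes the typedef valid as plain C, not just C++. A minimal sketch of a conforming callback; registration via `llama_log_set` is assumed from the surrounding header, which this hunk does not show:

```cpp
#include <stdio.h>
#include "llama.h"

// Sketch of a callback matching the updated typedef. As the header comment
// notes, text usually ends in '\n' (but may not, e.g. for progress dots),
// so it is printed as-is here.
static void my_log_callback(enum llama_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fputs(text, stderr);
}

// Assumed registration entry point, not shown in this diff:
// llama_log_set(my_log_callback, NULL);
```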
scripts/get-wikitext-2.sh (new file, 3 lines)
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip