merging, do not use
This commit is contained in:
commit
a8958f6b76
11 changed files with 18708 additions and 3228 deletions
14
ggml-cuda.cu
14
ggml-cuda.cu
|
@ -42,19 +42,19 @@ typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y,
|
||||||
#define QK4_0 32
|
#define QK4_0 32
|
||||||
#define QR4_0 2
|
#define QR4_0 2
|
||||||
typedef struct {
|
typedef struct {
|
||||||
float d; // delta
|
half d; // delta
|
||||||
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
||||||
} block_q4_0;
|
} block_q4_0;
|
||||||
static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
||||||
|
|
||||||
#define QK4_1 32
|
#define QK4_1 32
|
||||||
#define QR4_1 2
|
#define QR4_1 2
|
||||||
typedef struct {
|
typedef struct {
|
||||||
float d; // delta
|
half d; // delta
|
||||||
float m; // min
|
half m; // min
|
||||||
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
||||||
} block_q4_1;
|
} block_q4_1;
|
||||||
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
||||||
|
|
||||||
#define QK5_0 32
|
#define QK5_0 32
|
||||||
#define QR5_0 2
|
#define QR5_0 2
|
||||||
|
@ -78,10 +78,10 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
|
||||||
#define QK8_0 32
|
#define QK8_0 32
|
||||||
#define QR8_0 1
|
#define QR8_0 1
|
||||||
typedef struct {
|
typedef struct {
|
||||||
float d; // delta
|
half d; // delta
|
||||||
int8_t qs[QK8_0]; // quants
|
int8_t qs[QK8_0]; // quants
|
||||||
} block_q8_0;
|
} block_q8_0;
|
||||||
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
|
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
|
||||||
|
|
||||||
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
||||||
#define CUDA_DMMV_BLOCK_SIZE 32 // dmmv = dequantize_mul_mat_vec
|
#define CUDA_DMMV_BLOCK_SIZE 32 // dmmv = dequantize_mul_mat_vec
|
||||||
|
|
|
@ -813,7 +813,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
|
||||||
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
||||||
src1->type == GGML_TYPE_F32 &&
|
src1->type == GGML_TYPE_F32 &&
|
||||||
dst->type == GGML_TYPE_F32 &&
|
dst->type == GGML_TYPE_F32 &&
|
||||||
((GetQuantsUnshuffled() && GetGPULayers()>0 && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
|
((GetQuantsUnshuffled() && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
2
ggml.h
2
ggml.h
|
@ -1094,8 +1094,6 @@ extern "C" {
|
||||||
|
|
||||||
void SetQuantsUnshuffled(bool unshuffled);
|
void SetQuantsUnshuffled(bool unshuffled);
|
||||||
bool GetQuantsUnshuffled();
|
bool GetQuantsUnshuffled();
|
||||||
void SetGPULayers(bool layers);
|
|
||||||
bool GetGPULayers();
|
|
||||||
|
|
||||||
GGML_API int ggml_cpu_has_avx (void);
|
GGML_API int ggml_cpu_has_avx (void);
|
||||||
GGML_API int ggml_cpu_has_avx2 (void);
|
GGML_API int ggml_cpu_has_avx2 (void);
|
||||||
|
|
|
@ -20,8 +20,8 @@
|
||||||
#include "gptj_v2.cpp"
|
#include "gptj_v2.cpp"
|
||||||
#include "gpt2_v1.cpp"
|
#include "gpt2_v1.cpp"
|
||||||
#include "gpt2_v2.cpp"
|
#include "gpt2_v2.cpp"
|
||||||
#include "rwkv.cpp"
|
#include "rwkv_v2.cpp"
|
||||||
#include "neox.cpp"
|
#include "neox_v2.cpp"
|
||||||
|
|
||||||
|
|
||||||
//return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
|
//return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
|
||||||
|
|
19
llama.cpp
19
llama.cpp
|
@ -412,6 +412,7 @@ enum llama_file_version {
|
||||||
LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
|
LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
|
||||||
LLAMA_FILE_VERSION_GGJT_V1, // added padding
|
LLAMA_FILE_VERSION_GGJT_V1, // added padding
|
||||||
LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
|
LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
|
||||||
|
LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_file_loader {
|
struct llama_file_loader {
|
||||||
|
@ -444,6 +445,8 @@ struct llama_file_loader {
|
||||||
file_version = LLAMA_FILE_VERSION_GGJT_V1;
|
file_version = LLAMA_FILE_VERSION_GGJT_V1;
|
||||||
} else if (magic == 'ggjt' && version == 2) {
|
} else if (magic == 'ggjt' && version == 2) {
|
||||||
file_version = LLAMA_FILE_VERSION_GGJT_V2;
|
file_version = LLAMA_FILE_VERSION_GGJT_V2;
|
||||||
|
} else if (magic == 'ggjt' && version == 3) {
|
||||||
|
file_version = LLAMA_FILE_VERSION_GGJT_V3;
|
||||||
} else {
|
} else {
|
||||||
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
||||||
magic, version);
|
magic, version);
|
||||||
|
@ -861,7 +864,8 @@ static const char *llama_file_version_name(llama_file_version version) {
|
||||||
case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
|
case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
|
||||||
case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
|
case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
|
||||||
case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
|
case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
|
||||||
case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
|
case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
|
||||||
|
case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
|
||||||
}
|
}
|
||||||
|
|
||||||
return "unknown";
|
return "unknown";
|
||||||
|
@ -946,12 +950,19 @@ static void llama_model_load_internal(
|
||||||
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
|
if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
|
||||||
if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
|
if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
|
||||||
hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
|
hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
|
||||||
hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
|
hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
|
||||||
printf("\nLegacy LLAMA GGJT compatability changes triggered.\n");
|
printf("\nLegacy LLAMA GGJT v1 compatability changes triggered.\n");
|
||||||
//throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
|
||||||
|
if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
|
||||||
|
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
|
||||||
|
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
|
||||||
|
printf("\nLegacy LLAMA GGJT v2 compatability changes triggered.\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
2
llama.h
2
llama.h
|
@ -19,7 +19,7 @@
|
||||||
# define LLAMA_API
|
# define LLAMA_API
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define LLAMA_FILE_VERSION 2
|
#define LLAMA_FILE_VERSION 3
|
||||||
#define LLAMA_FILE_MAGIC 'ggjt'
|
#define LLAMA_FILE_MAGIC 'ggjt'
|
||||||
#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
|
#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
|
||||||
#define LLAMA_SESSION_MAGIC 'ggsn'
|
#define LLAMA_SESSION_MAGIC 'ggsn'
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
|
|
||||||
#include "otherarch.h"
|
#include "otherarch.h"
|
||||||
|
|
||||||
#include "rwkv.h"
|
#include "rwkv_v2.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
Loading…
Add table
Add a link
Reference in a new issue