merging, do not use

Commit a8958f6b76 by Concedo, 2023-05-20 15:12:31 +08:00
11 changed files with 18708 additions and 3228 deletions

ggml-cuda.cu

@@ -42,19 +42,19 @@ typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y,
 #define QK4_0 32
 #define QR4_0 2
 typedef struct {
-    float   d;              // delta
+    half    d;              // delta
     uint8_t qs[QK4_0 / 2];  // nibbles / quants
 } block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
+static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");

 #define QK4_1 32
 #define QR4_1 2
 typedef struct {
-    float   d;              // delta
-    float   m;              // min
+    half    d;              // delta
+    half    m;              // min
     uint8_t qs[QK4_1 / 2];  // nibbles / quants
 } block_q4_1;
-static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
+static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");

 #define QK5_0 32
 #define QR5_0 2

@@ -78,10 +78,10 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
 #define QK8_0 32
 #define QR8_0 1
 typedef struct {
-    float   d;              // delta
+    half    d;              // delta
     int8_t  qs[QK8_0];      // quants
 } block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
+static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");

 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 #define CUDA_DMMV_BLOCK_SIZE 32 // dmmv = dequantize_mul_mat_vec
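The substance of this hunk: each quantization block's scale (and q4_1's min) is now stored as a 2-byte half instead of a 4-byte float, so q4_0 blocks shrink from 20 to 18 bytes, q4_1 from 24 to 20, and q8_0 from 36 to 34 per 32 weights. A minimal host-side sketch of that arithmetic, using uint16_t as a stand-in for half/ggml_fp16_t (the typedef matches ggml.h; everything else is illustrative):

#include <cstdint>

typedef uint16_t ggml_fp16_t;   // 2-byte half, as typedef'd in ggml.h

#define QK4_0 32

typedef struct {
    ggml_fp16_t d;              // delta: was 4 bytes (float), now 2 (half)
    uint8_t     qs[QK4_0 / 2];  // 16 bytes of packed 4-bit quants
} block_q4_0;

// 2 + 16 = 18 bytes per 32 weights, down from 4 + 16 = 20,
// i.e. 4.5 effective bits per weight instead of 5.0.
static_assert(sizeof(block_q4_0) == 18, "q4_0 block should be 18 bytes after the fp16 change");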


@@ -813,7 +813,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
         dst->type == GGML_TYPE_F32 &&
-        ((GetQuantsUnshuffled() && GetGPULayers()>0 && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
+        ((GetQuantsUnshuffled() && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
         return true;
     }
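The only change in this hunk is dropping the GetGPULayers()>0 term (the getter/setter pair is deleted from ggml.h below), so large enough unshuffled quantized matrices take the OpenCL path even when no layers are offloaded. Restated as a self-contained predicate, assuming the type checks above already passed (the helper name and signature are illustrative, not koboldcpp's actual code):

#include <cstdint>

// Sketch: when the OpenCL mul_mat path is taken after this change.
static bool can_use_opencl_mul_mat(bool quants_unshuffled, bool src0_on_cl_backend,
                                   int64_t ne0, int64_t ne1, int64_t ne10) {
    // GPU layer count is no longer consulted; size and shuffle state alone decide.
    return (quants_unshuffled && ne0 >= 32 && ne1 >= 32 && ne10 >= 32)
        || src0_on_cl_backend;
}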

ggml.c: 3290 changes (diff suppressed because it is too large)

ggml.h: 2 changes

@@ -1094,8 +1094,6 @@ extern "C" {
     void SetQuantsUnshuffled(bool unshuffled);
     bool GetQuantsUnshuffled();
-    void SetGPULayers(bool layers);
-    bool GetGPULayers();

     GGML_API int ggml_cpu_has_avx        (void);
     GGML_API int ggml_cpu_has_avx2       (void);

ggml_v2.c: 18581 changes (diff suppressed because it is too large)

gpttype_adapter.cpp

@@ -20,8 +20,8 @@
 #include "gptj_v2.cpp"
 #include "gpt2_v1.cpp"
 #include "gpt2_v2.cpp"
-#include "rwkv.cpp"
-#include "neox.cpp"
+#include "rwkv_v2.cpp"
+#include "neox_v2.cpp"

 //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)

llama.cpp

@@ -412,6 +412,7 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
     LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+    LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
 };

 struct llama_file_loader {
@@ -444,6 +445,8 @@ struct llama_file_loader {
             file_version = LLAMA_FILE_VERSION_GGJT_V1;
         } else if (magic == 'ggjt' && version == 2) {
             file_version = LLAMA_FILE_VERSION_GGJT_V2;
+        } else if (magic == 'ggjt' && version == 3) {
+            file_version = LLAMA_FILE_VERSION_GGJT_V3;
         } else {
             throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
                          magic, version);
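For context, a GGJT file opens with an 8-byte header: a 4-byte magic (the multi-character constant 'ggjt', i.e. 0x67676a74) followed by a 4-byte version, which is what the branches above dispatch on. A standalone sketch of that check, assuming a little-endian host as ggml does (illustrative; the real loader reads through its llama_file wrapper):

#include <cstdint>
#include <cstdio>

// Returns the GGJT version (1, 2, or, after this commit, 3), or -1 on mismatch.
static int read_ggjt_version(std::FILE * f) {
    uint32_t magic = 0, version = 0;
    if (std::fread(&magic,   sizeof(magic),   1, f) != 1) return -1;
    if (std::fread(&version, sizeof(version), 1, f) != 1) return -1;
    if (magic != 0x67676a74) return -1;  // 'ggjt'
    return (int) version;
}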
@@ -861,7 +864,8 @@ static const char *llama_file_version_name(llama_file_version version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
         case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
-        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
     }

     return "unknown";
@@ -946,12 +950,19 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }

-    if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            printf("\nLegacy LLAMA GGJT compatability changes triggered.\n");
-            //throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+            printf("\nLegacy LLAMA GGJT v1 compatability changes triggered.\n");
+        }
+    }
+
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+        if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+            printf("\nLegacy LLAMA GGJT v2 compatability changes triggered.\n");
         }
     }
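The loader now distinguishes two legacy tiers instead of one: anything older than GGJT v2 triggers the v1 compatibility path for quantized ftypes other than Q8_0, and anything older than GGJT v3 additionally triggers the v2 path for Q4_0, Q4_1 and Q8_0 (the formats whose block layout changed in this commit). A condensed restatement, with the ftype tests folded into booleans (illustrative, not the actual function; note the two checks run independently, exactly as above):

#include <cstdio>

enum file_version { GGML, GGMF_V1, GGJT_V1, GGJT_V2, GGJT_V3 };  // ordered, so '<' works

static void report_legacy_paths(file_version ver, bool f32_or_f16, bool q8_0, bool q4_0_or_q4_1) {
    if (ver < GGJT_V2 && !f32_or_f16 && !q8_0) {
        std::printf("GGJT v1 compatibility path\n");  // pre-#1405 quant formats
    }
    if (ver < GGJT_V3 && (q4_0_or_q4_1 || q8_0)) {
        std::printf("GGJT v2 compatibility path\n");  // pre-#1508 Q4/Q8 block layout
    }
}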

llama.h

@@ -19,7 +19,7 @@
 # define LLAMA_API
 #endif

-#define LLAMA_FILE_VERSION 2
+#define LLAMA_FILE_VERSION 3
 #define LLAMA_FILE_MAGIC 'ggjt'
 #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
 #define LLAMA_SESSION_MAGIC 'ggsn'
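Bumping LLAMA_FILE_VERSION to 3 means every file written from here on carries a v3 header, which readers built before this commit will reject with the "unknown (magic, version) combination" error above. The writer side of the header, as a sketch mirroring the reader shown earlier (little-endian host assumed):

#include <cstdint>
#include <cstdio>

// The first 8 bytes of a GGJT v3 file.
static void write_ggjt_header(std::FILE * f) {
    const uint32_t magic   = 0x67676a74;  // LLAMA_FILE_MAGIC 'ggjt'
    const uint32_t version = 3;           // LLAMA_FILE_VERSION after this commit
    std::fwrite(&magic,   sizeof(magic),   1, f);
    std::fwrite(&version, sizeof(version), 1, f);
}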

rwkv_v2.cpp

@@ -3,7 +3,7 @@
 #include "otherarch.h"

-#include "rwkv.h"
+#include "rwkv_v2.h"

 #include "ggml.h"

 #include <string>