llm : add Falcon support (#2717)
* llama : refactor GGUF constants into static maps * llama : check if model architecture is known * llama : refactor llama_model_load_internal() * gguf : add KV constant maps * llm : read arch-specific KVs * convert : add dummy scores + types * falcon : load tensor data (CPU only) * llama : fix loading progress bar * llama : add arch member to llama_model * falcon : CPU inference working * falcon : support non-40B models * falcon : minor * llama : minor updates ggml-ci * convert-falcon-hf-to-gguf.py : fix special token mapping * llama.cpp : llama default UNK token = id 0 * llama.cpp : fix bpe tokenizer * llama.cpp : fix the fix of bpe tokenizer * ggml : pass eps to ggml_norm * metal : implement RoPE (mode = 2) + avoid ggml_repeat * ggml : ggml_repeat always creates new tensor * falcon : copy-paste self-attention from LLaMA * metal : print extra compute pipeline info * falcon : minor changes (still chasing the Metal problem) * llama.cpp : fix linefeed token * metal : fix GELU kernel numerical stability by using precise::tanh * metal : temporary workaround for the concurrency optimization bug * falcon : add CUDA offloading (#2739) * llama : better model naming and size reporting * llama : prep new tokenizer support * llama : advanced BPE tokenizer based on ggllm.cpp imlpementation * llama : remove oboslete comment ggml-ci * common : remove obsolete BPE API + disable test-tokenizer-1 * llama : revert BPE special-case in llama_byte_to_token() * cuda : add TODOs for RoPE NeoX implementation * llama : default special tokens based on vocab type * perplexity : add log for start of tokenization --------- Co-authored-by: klosax <131523366+klosax@users.noreply.github.com> Co-authored-by: slaren <slarengh@gmail.com>
This commit is contained in:
parent
a192860cfe
commit
cf658adc83
18 changed files with 1596 additions and 668 deletions
29
ggml-cuda.cu
29
ggml-cuda.cu
|
@ -3907,6 +3907,29 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
|
|||
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
|
||||
// TODO: this implementation is wrong!
|
||||
//static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
|
||||
// const float p_delta, const int p_delta_rows, const float theta_scale) {
|
||||
// const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||
//
|
||||
// if (col >= ncols) {
|
||||
// return;
|
||||
// }
|
||||
//
|
||||
// const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
// const int i = row*ncols + col/2;
|
||||
//
|
||||
// const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
|
||||
// const float sin_theta = sinf(theta);
|
||||
// const float cos_theta = cosf(theta);
|
||||
//
|
||||
// const float x0 = x[i + 0];
|
||||
// const float x1 = x[i + ncols/2];
|
||||
//
|
||||
// dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||
// dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
|
||||
//}
|
||||
|
||||
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
|
||||
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
const int half_n_dims = ncols/4;
|
||||
|
@ -5515,7 +5538,8 @@ inline void ggml_cuda_op_rope(
|
|||
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
|
||||
const bool is_glm = mode & 4;
|
||||
const bool is_neox = mode & 2;
|
||||
const bool is_glm = mode & 4;
|
||||
|
||||
// compute
|
||||
if (is_glm) {
|
||||
|
@ -5523,6 +5547,9 @@ inline void ggml_cuda_op_rope(
|
|||
const float id_p = min(p, n_ctx - 2.f);
|
||||
const float block_p = max(p - (n_ctx - 2.f), 0.f);
|
||||
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
|
||||
} else if (is_neox) {
|
||||
GGML_ASSERT(false && "RoPE NeoX not implemented yet");
|
||||
#pragma message("TODO: implement RoPE NeoX for CUDA")
|
||||
} else {
|
||||
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
||||
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue