Merge branch 'master' into feat/dockerize

commit 50fa1a006e
Bernat Vadell, 2023-03-17 09:02:11 +01:00
3 changed files with 130 additions and 43 deletions


@@ -219,7 +219,3 @@ https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b0
 - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
 - Clean-up any trailing whitespaces, use 4 spaces indentation, brackets on same line, `void * ptr`, `int & a`
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
-
-### Misc
-
-- Practice your C++ typing skills: https://typing-battles.ggerganov.com

ggml.c (149 changed lines)

@@ -607,10 +607,11 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
     assert(k % QK == 0);

     const int nb = k / QK;
+    const size_t bs = 2*sizeof(float) + QK/2;

-    float * restrict pm = (float *) (y);
-    float * restrict pd = (float *) (pm + nb);
-    uint8_t * restrict pb = (uint8_t *) (pd + nb);
+    uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
+    uint8_t * restrict pm = ((uint8_t *)y + 0*bs + sizeof(float));
+    uint8_t * restrict pb = ((uint8_t *)y + 0*bs + 2*sizeof(float));

     uint8_t pp[QK/2];
@@ -627,8 +628,10 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
         const float d = (max - min) / ((1 << 4) - 1);
         const float id = d ? 1.0f/d : 0.0f;

-        pm[i] = min;
-        pd[i] = d;
+        *(float *)pm = min;
+        *(float *)pd = d;
+        pm += bs;
+        pd += bs;

         for (int l = 0; l < QK; l += 2) {
             const float v0 = (x[i*QK + l + 0] - min)*id;
@@ -643,7 +646,8 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
             pp[l/2] = vi0 | (vi1 << 4);
         }

-        memcpy(pb + i*QK/2, pp, sizeof(pp));
+        memcpy(pb, pp, sizeof(pp));
+        pb += bs;
     }
 }
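
Note: the change above switches quantize_row_q4_1 from a planar row layout (all nb mins first, then all nb deltas, then all packed nibbles) to one self-contained record per block of bs = 2*sizeof(float) + QK/2 bytes: the delta d, the min m, then QK/2 bytes holding two 4-bit quants each. Viewed as a struct, the new block looks roughly like this (illustrative only; the commit itself addresses the fields with raw byte offsets, not a struct):

    #include <stdint.h>

    #define QK 32

    typedef struct {
        float   d;          // delta (scale): d = (max - min) / 15
        float   m;          // min of the block
        uint8_t qs[QK/2];   // QK quants, two 4-bit values per byte (even element in the low nibble)
    } q4_1_block;           // 2*sizeof(float) + QK/2 = 24 bytes when QK == 32, matching bs above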
@@ -687,16 +691,17 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) {
     assert(k % QK == 0);

     const int nb = k / QK;
+    const size_t bs = 2*sizeof(float) + QK/2;

-    const float * restrict pm = (const float *) (x);
-    const float * restrict pd = (const float *) (pm + nb);
-    const uint8_t * restrict pb = (const uint8_t *) (pd + nb);
+    const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
+    const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float));
+    const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));

     for (int i = 0; i < nb; i++) {
-        const float m = pm[i];
-        const float d = pd[i];
+        const float d = *(const float *) (pd + i*bs);
+        const float m = *(const float *) (pm + i*bs);

-        const uint8_t * restrict pp = pb + i*QK/2;
+        const uint8_t * restrict pp = pb + i*bs;

         for (int l = 0; l < QK; l += 2) {
             const uint8_t vi = pp[l/2];
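
Note: dequantize_row_q4_1 keeps the same arithmetic and only changes where d, m and the packed quants are read from; since quantization stores (x - min)/d, each element is reconstructed as d*vi + m with vi in [0, 15]. A minimal scalar sketch of decoding one block from the interleaved layout (the helper name and the memcpy-based loads are illustrative, assuming QK == 32):

    #include <stdint.h>
    #include <string.h>

    // Illustrative sketch, not from the commit: decode block i of a q4_1 row
    // laid out as [d (float)][m (float)][QK/2 packed nibbles] per block.
    static void q4_1_dequantize_block(const void * row, int i, float * out) {
        const size_t bs = 2*sizeof(float) + 32/2;
        const uint8_t * block = (const uint8_t *) row + (size_t)i*bs;

        float d, m;
        memcpy(&d, block,                 sizeof(float));  // scale
        memcpy(&m, block + sizeof(float), sizeof(float));  // min

        const uint8_t * qs = block + 2*sizeof(float);
        for (int l = 0; l < 32; l += 2) {
            const uint8_t vi = qs[l/2];
            out[l + 0] = d*(vi & 0x0F) + m;   // low nibble: even element
            out[l + 1] = d*(vi >>   4) + m;   // high nibble: odd element
        }
    }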
@@ -1584,28 +1589,109 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
 inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
     const int nb = n / QK;

-    const float * restrict pm0 = (const float *) x;
-    const float * restrict pm1 = (const float *) y;
+    const size_t bs = 2*sizeof(float) + QK/2;

-    const float * restrict pd0 = (const float *) (pm0 + nb);
-    const float * restrict pd1 = (const float *) (pm1 + nb);
+    const uint8_t * restrict pd0 = ((const uint8_t *)x + 0*bs);
+    const uint8_t * restrict pd1 = ((const uint8_t *)y + 0*bs);

-    const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb);
-    const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb);
+    const uint8_t * restrict pm0 = ((const uint8_t *)x + 0*bs + sizeof(float));
+    const uint8_t * restrict pm1 = ((const uint8_t *)y + 0*bs + sizeof(float));
+
+    const uint8_t * restrict pb0 = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
+    const uint8_t * restrict pb1 = ((const uint8_t *)y + 0*bs + 2*sizeof(float));

     float sumf = 0.0;

-#if 1
+#if defined(__AVX2__)
+#if QK == 32
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+    // Accumulator for constant offsets
+    float acc_offset = 0.0f;
+
+    // Main loop
+    for (int i = 0; i < nb; ++i) {
+        const float * m0 = (const float *) (pm0 + i*bs);
+        const float * m1 = (const float *) (pm1 + i*bs);
+
+        const float * d0 = (const float *) (pd0 + i*bs);
+        const float * d1 = (const float *) (pd1 + i*bs);
+
+        const uint8_t * restrict p0 = pb0 + i*bs;
+        const uint8_t * restrict p1 = pb1 + i*bs;
+
+        const __m256 d0v = _mm256_broadcast_ss( d0 );
+        const __m256 d1v = _mm256_broadcast_ss( d1 );
+        const __m256 m0v = _mm256_broadcast_ss( m0 );
+        const __m256 m1v = _mm256_broadcast_ss( m1 );
+
+        // Compute combined scale for the block
+        const __m256 scale_01 = _mm256_mul_ps( d0v, d1v );
+
+        // Compute cross scales for the block
+        const __m256 scale_0 = _mm256_mul_ps( d0v, m1v );
+        const __m256 scale_1 = _mm256_mul_ps( m0v, d1v );
+        const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0b10101010 );
+
+        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
+        __m256i bx = bytesFromNibbles( p0 );
+        __m256i by = bytesFromNibbles( p1 );
+
+        // Now we have a vector with bytes in [ 0 .. 15 ] interval.
+
+        // Sign-extend first 16 signed bytes into int16_t
+        __m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) );
+        __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );
+        // Compute products of int16_t integers, add pairwise
+        __m256i i32 = _mm256_madd_epi16( x16, y16 );
+
+        // Sign-extend last 16 signed bytes into int16_t vectors
+        __m256i x16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) );
+        __m256i y16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) );
+        // Accumulate products of int16_t integers
+        i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16_h, y16_h ) );
+
+        // compute sums of unsigned bytes in bx, by in blocks of 8.
+        // This results in a layout like X100 0000 X200 0000 X300 0000 X400 0000,
+        // which we then interleave as X100 Y100 X200 Y200 X300 Y300 X400 Y400.
+        // so if we then cast to 8 singles, we get 8 floats like [ x0_7, y0_7, x8_15, y8_15, x16_23, y16_23, x24_31, y24_31 ]
+        __m256i xsumi = _mm256_sad_epu8( bx, _mm256_setzero_si256() );
+        __m256i ysumi = _mm256_sad_epu8( by, _mm256_setzero_si256() );
+        __m256i sumsi = _mm256_or_si256( xsumi, _mm256_slli_si256( ysumi, 4 ) );
+        __m256  sums  = _mm256_cvtepi32_ps( sumsi );
+
+        // Convert int32_t to float
+        __m256 p = _mm256_cvtepi32_ps( i32 );
+        // Apply the scale, and accumulate
+        // acc += d0*d1*x*y + d0*m1*x + d1*m0*y
+        acc = _mm256_fmadd_ps( scale_01, p, acc );
+        acc = _mm256_fmadd_ps( cross_scales, sums, acc );
+        // acc_offset += m0*m1 (for each entry in the block)
+        acc_offset += (*m0)*(*m1);
+    }
+
+    // Return horizontal sum of the acc vector
+    __m128 res = _mm256_extractf128_ps( acc, 1 );
+    res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
+    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
+    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
+
+    sumf = _mm_cvtss_f32( res ) + acc_offset * QK;
+#else
+#error "not implemented for QK"
+#endif
+#else
     // scalar
     for (int i = 0; i < nb; i++) {
-        const float m0 = pm0[i];
-        const float m1 = pm1[i];
+        const float m0 = *(const float *) (pm0 + i*bs);
+        const float m1 = *(const float *) (pm1 + i*bs);

-        const float d0 = pd0[i];
-        const float d1 = pd1[i];
+        const float d0 = *(const float *) (pd0 + i*bs);
+        const float d1 = *(const float *) (pd1 + i*bs);

-        const uint8_t * restrict p0 = pb0 + i*QK/2;
-        const uint8_t * restrict p1 = pb1 + i*QK/2;
+        const uint8_t * restrict p0 = pb0 + i*bs;
+        const uint8_t * restrict p1 = pb1 + i*bs;

         for (int j = 0; j < QK/2; j++) {
             const uint8_t v0 = p0[j];
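
Note: the new AVX2 path relies on the algebraic expansion of the q4_1 dot product. With x_l = d0*q0_l + m0 and y_l = d1*q1_l + m1, the per-block sum of x_l*y_l expands to d0*d1*sum(q0*q1) + d0*m1*sum(q0) + d1*m0*sum(q1) + QK*m0*m1. That is exactly how the work is split above: scale_01*p covers the first term, cross_scales*sums covers the two cross terms (the x and y byte sums interleaved across the float lanes), and acc_offset*QK covers the constant term. A scalar sketch of the same per-block value (assuming QK == 32 and quants already unpacked to 0..15; the helper is illustrative, not part of the commit):

    #include <stdint.h>

    static float q4_1_block_dot(float d0, float m0, const uint8_t * q0,
                                float d1, float m1, const uint8_t * q1) {
        int sum_xy = 0, sum_x = 0, sum_y = 0;
        for (int l = 0; l < 32; ++l) {
            sum_xy += q0[l]*q1[l];
            sum_x  += q0[l];
            sum_y  += q1[l];
        }
        // (d0*q0 + m0) . (d1*q1 + m1), expanded over the 32 elements of one block
        return d0*d1*sum_xy + d0*m1*sum_x + d1*m0*sum_y + 32*m0*m1;
    }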
@@ -1839,16 +1925,17 @@ inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * res
     assert(n % QK == 0);

     const int nb = n / QK;
+    const size_t bs = 2*sizeof(float) + QK/2;

-    const float * restrict pm = (const float *) (x);
-    const float * restrict pd = (const float *) (pm + nb);
-    const uint8_t * restrict pb = (const uint8_t *) (pd + nb);
+    const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
+    const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float));
+    const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));

     for (int i = 0; i < nb; i++) {
-        const float m = pm[i];
-        const float d = pd[i];
+        const float d = *(const float *) (pd + i*bs);
+        const float m = *(const float *) (pm + i*bs);

-        const uint8_t * restrict pp = pb + i*QK/2;
+        const uint8_t * restrict pp = pb + i*bs;

         for (int l = 0; l < QK; l += 2) {
             const uint8_t vi = pp[l/2];


@@ -489,7 +489,8 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t
 size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
     const int nb = k / qk;
-    const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2);
+    const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2);
+    const size_t row_size = nb*bs;

     assert(k % qk == 0);
@@ -499,9 +500,9 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
     char * pdst = (char *) dst;

     for (int j = 0; j < n; j += k) {
-        float * pm = (float *) (pdst + (j/k)*row_size);
-        float * pd = (float *) (pm + nb);
-        uint8_t * pb = (uint8_t *) (pd + nb);
+        uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
+        uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
+        uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));

         //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
@@ -519,8 +520,10 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
             const float d = (max - min) / ((1 << 4) - 1);
             const float id = d ? 1.0f/d : 0.0f;

-            pm[i] = min;
-            pd[i] = d;
+            *(float *) pd = d;
+            *(float *) pm = min;
+            pd += bs;
+            pm += bs;

             for (int l = 0; l < qk; l += 2) {
                 const float v0 = (src[j + i*qk + l + 0] - min)*id;
@@ -538,7 +541,8 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
                 pp[l/2] = vi0 | (vi1 << 4);
             }

-            memcpy(pb + i*qk/2, pp, pp_size);
+            memcpy(pb, pp, pp_size);
+            pb += bs;
         }
     }
 }
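
Note: the same block size bs now also drives the row stride in ggml_quantize_q4_1: a row of k floats becomes nb = k/qk blocks of bs bytes each, so row_size = nb*bs bytes per quantized row. A small sketch of the resulting destination-buffer math (the helper name is illustrative, and the rows-times-row_size total is an assumption about how the caller sizes dst, not taken from the patch):

    #include <stddef.h>

    // Illustrative only: bytes needed for quantizing n floats arranged as rows
    // of k elements, with qk elements per q4_1 block.
    static size_t q4_1_quantized_size(int n, int k, int qk) {
        const int    nb       = k / qk;                           // blocks per row
        const size_t bs       = 2*sizeof(float) + (size_t)qk/2;   // bytes per block
        const size_t row_size = (size_t)nb*bs;                    // bytes per quantized row
        return (size_t)(n/k)*row_size;                            // total for all n/k rows
    }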