Merge branch 'master' into concedo
# Conflicts: # CMakeLists.txt
This commit is contained in:
commit
9ab6e87b58
6 changed files with 176 additions and 18 deletions
|
@ -21,7 +21,7 @@ extern "C"
|
|||
{
|
||||
std::string model = inputs.model_filename;
|
||||
file_format = check_file_format(model.c_str());
|
||||
printf("\nIdentified as LLAMA model: (ver %d)\n", file_format);
|
||||
printf("\n---\nIdentified as LLAMA model: (ver %d)\nAttempting to Load...\n---\n", file_format);
|
||||
|
||||
return llama_load_model(inputs, file_format);
|
||||
}
|
||||
|
|
192
ggml.c
192
ggml.c
|
@ -461,6 +461,39 @@ static inline __m128i packNibbles( __m256i bytes )
|
|||
__m128i r1 = _mm256_extracti128_si256( bytes, 1 );
|
||||
return _mm_packus_epi16( r0, r1 );
|
||||
}
|
||||
#elif __AVX__
|
||||
static inline __m128i bytesFromNibbles( const uint8_t* rsi )
|
||||
{
|
||||
// Load 8 bytes from memory
|
||||
__m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
|
||||
|
||||
// Expand bytes into uint16_t values
|
||||
__m128i bytes = _mm_cvtepu8_epi16( tmp );
|
||||
|
||||
// Unpack values into individual bytes
|
||||
const __m128i lowMask = _mm_set1_epi8( 0xF );
|
||||
__m128i high = _mm_andnot_si128( lowMask, bytes );
|
||||
__m128i low = _mm_and_si128( lowMask, bytes );
|
||||
high = _mm_slli_epi16( high, 4 );
|
||||
bytes = _mm_or_si128( low, high );
|
||||
return bytes;
|
||||
}
|
||||
|
||||
static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
|
||||
{
|
||||
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
|
||||
const __m128i lowByte = _mm_set1_epi16( 0xFF );
|
||||
__m128i high = _mm_andnot_si128( lowByte, bytes1 );
|
||||
__m128i low = _mm_and_si128( lowByte, bytes1 );
|
||||
high = _mm_srli_epi16( high, 4 );
|
||||
bytes1 = _mm_or_si128( low, high );
|
||||
high = _mm_andnot_si128( lowByte, bytes2 );
|
||||
low = _mm_and_si128( lowByte, bytes2 );
|
||||
high = _mm_srli_epi16( high, 4 );
|
||||
bytes2 = _mm_or_si128( low, high );
|
||||
|
||||
return _mm_packus_epi16( bytes1, bytes2);
|
||||
}
|
||||
#endif
|
||||
|
||||
// method 5
|
||||
|
@ -509,8 +542,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
|
|||
const uint8_t vi0 = (int8_t)roundf(v0) + 8;
|
||||
const uint8_t vi1 = (int8_t)roundf(v1) + 8;
|
||||
|
||||
assert(vi0 >= 0 && vi0 < 16);
|
||||
assert(vi1 >= 0 && vi1 < 16);
|
||||
assert(vi0 < 16);
|
||||
assert(vi1 < 16);
|
||||
|
||||
pp[l/2] = vi0 | (vi1 << 4);
|
||||
}
|
||||
|
@ -660,6 +693,80 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
|
|||
__m128i res = packNibbles( i0 );
|
||||
_mm_storeu_si128( ( __m128i* )y[i].qs, res );
|
||||
}
|
||||
#elif defined(__AVX__)
|
||||
for (int i = 0; i < nb; i++) {
|
||||
// Load elements into 4 AVX vectors
|
||||
__m256 v0 = _mm256_loadu_ps( x );
|
||||
__m256 v1 = _mm256_loadu_ps( x + 8 );
|
||||
__m256 v2 = _mm256_loadu_ps( x + 16 );
|
||||
__m256 v3 = _mm256_loadu_ps( x + 24 );
|
||||
x += 32;
|
||||
|
||||
// Compute max(abs(e)) for the block
|
||||
const __m256 signBit = _mm256_set1_ps( -0.0f );
|
||||
__m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
|
||||
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
|
||||
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
|
||||
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
|
||||
|
||||
__m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
|
||||
max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
|
||||
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
|
||||
const float maxScalar = _mm_cvtss_f32( max4 );
|
||||
|
||||
// Quantize these floats
|
||||
const float d = maxScalar / 7.0f;
|
||||
y[i].d = d;
|
||||
const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f;
|
||||
const __m256 mul = _mm256_set1_ps( id );
|
||||
|
||||
// Apply the multiplier
|
||||
v0 = _mm256_mul_ps( v0, mul );
|
||||
v1 = _mm256_mul_ps( v1, mul );
|
||||
v2 = _mm256_mul_ps( v2, mul );
|
||||
v3 = _mm256_mul_ps( v3, mul );
|
||||
|
||||
// Round to nearest integer
|
||||
v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
|
||||
v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
|
||||
v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
|
||||
v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
|
||||
|
||||
// Convert floats to integers
|
||||
__m256i i0 = _mm256_cvtps_epi32( v0 );
|
||||
__m256i i1 = _mm256_cvtps_epi32( v1 );
|
||||
__m256i i2 = _mm256_cvtps_epi32( v2 );
|
||||
__m256i i3 = _mm256_cvtps_epi32( v3 );
|
||||
|
||||
// Since we don't have in AVX some necessary functions,
|
||||
// we split the registers in half and call AVX2 analogs from SSE
|
||||
__m128i ni0 = _mm256_castsi256_si128( i0 );
|
||||
__m128i ni1 = _mm256_extractf128_si256( i0, 1);
|
||||
__m128i ni2 = _mm256_castsi256_si128( i1 );
|
||||
__m128i ni3 = _mm256_extractf128_si256( i1, 1);
|
||||
__m128i ni4 = _mm256_castsi256_si128( i2 );
|
||||
__m128i ni5 = _mm256_extractf128_si256( i2, 1);
|
||||
__m128i ni6 = _mm256_castsi256_si128( i3 );
|
||||
__m128i ni7 = _mm256_extractf128_si256( i3, 1);
|
||||
|
||||
// Convert int32 to int16
|
||||
ni0 = _mm_packs_epi32( ni0, ni1 );
|
||||
ni2 = _mm_packs_epi32( ni2, ni3 );
|
||||
ni4 = _mm_packs_epi32( ni4, ni5 );
|
||||
ni6 = _mm_packs_epi32( ni6, ni7 );
|
||||
// Convert int16 to int8
|
||||
ni0 = _mm_packs_epi16( ni0, ni2 );
|
||||
ni4 = _mm_packs_epi16( ni4, ni6 );
|
||||
|
||||
// Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ]
|
||||
const __m128i off = _mm_set1_epi8( 8);
|
||||
ni0 = _mm_add_epi8( ni0, off );
|
||||
ni4 = _mm_add_epi8( ni4, off );
|
||||
|
||||
// Compress the vector into 4 bit/value, and store
|
||||
__m128i res = packNibbles( ni0, ni4 );
|
||||
_mm_storeu_si128( ( __m128i* )y[i].qs, res );
|
||||
}
|
||||
#elif defined(__wasm_simd128__)
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
@ -730,8 +837,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
|
|||
const uint8_t vi0 = roundf(v0);
|
||||
const uint8_t vi1 = roundf(v1);
|
||||
|
||||
assert(vi0 >= 0 && vi0 < 16);
|
||||
assert(vi1 >= 0 && vi1 < 16);
|
||||
assert(vi0 < 16);
|
||||
assert(vi1 < 16);
|
||||
|
||||
pp[l/2] = vi0 | (vi1 << 4);
|
||||
}
|
||||
|
@ -1726,7 +1833,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
|
|||
const block_q4_0 * restrict x = vx;
|
||||
const block_q4_0 * restrict y = vy;
|
||||
|
||||
ggml_float sumf = 0.0;
|
||||
float sumf = 0.0;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
float sum0 = 0.0f;
|
||||
|
@ -1821,7 +1928,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
|
|||
#endif
|
||||
}
|
||||
|
||||
sumf = (ggml_float)(sum0 + sum1);
|
||||
sumf = sum0 + sum1;
|
||||
#elif defined(__AVX512F__)
|
||||
// Initialize accumulator with zeros
|
||||
__m512 acc0 = _mm512_setzero_ps();
|
||||
|
@ -1855,6 +1962,10 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
|
|||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
// Main loop
|
||||
// TODO: figure a way to do this in a portable way
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC unroll 16
|
||||
#endif
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
// Compute combined scale for the block
|
||||
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
|
||||
|
@ -1868,20 +1979,21 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
|
|||
bx = _mm256_sub_epi8( bx, off );
|
||||
by = _mm256_sub_epi8( by, off );
|
||||
|
||||
// Sign-extend first 16 signed bytes into int16_t
|
||||
__m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) );
|
||||
__m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );
|
||||
// Compute products of int16_t integers, add pairwise
|
||||
__m256i i32 = _mm256_madd_epi16( x16, y16 );
|
||||
// Get absolute values of x vectors
|
||||
const __m256i ax = _mm256_sign_epi8(bx, bx);
|
||||
|
||||
// Sign-extend last 16 signed bytes into int16_t vectors
|
||||
x16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) );
|
||||
y16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) );
|
||||
// Accumulate products of int16_t integers
|
||||
i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16, y16 ) );
|
||||
// Sign the values of the y vectors
|
||||
const __m256i sy = _mm256_sign_epi8(by, bx);
|
||||
|
||||
// Perform multiplication and create 16-bit values
|
||||
const __m256i dot = _mm256_maddubs_epi16(ax, sy);
|
||||
|
||||
const __m256i ones = _mm256_set1_epi16(1);
|
||||
const __m256i i32 = _mm256_madd_epi16(ones, dot);
|
||||
|
||||
// Convert int32_t to float
|
||||
__m256 p = _mm256_cvtepi32_ps( i32 );
|
||||
const __m256 p = _mm256_cvtepi32_ps( i32 );
|
||||
|
||||
// Apply the scale, and accumulate
|
||||
acc = _mm256_fmadd_ps( d, p, acc );
|
||||
}
|
||||
|
@ -1892,6 +2004,52 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
|
|||
res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
|
||||
res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
|
||||
|
||||
sumf = _mm_cvtss_f32( res );
|
||||
#elif defined(__AVX__)
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
// Main loop
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
// Compute combined scale for the block
|
||||
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
|
||||
|
||||
__m128i i32[2];
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
// Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes
|
||||
__m128i bx = bytesFromNibbles( x[i].qs + 8*j );
|
||||
__m128i by = bytesFromNibbles( y[i].qs + 8*j );
|
||||
|
||||
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
|
||||
const __m128i off = _mm_set1_epi8( 8 );
|
||||
bx = _mm_sub_epi8( bx, off );
|
||||
by = _mm_sub_epi8( by, off );
|
||||
|
||||
// Get absolute values of x vectors
|
||||
const __m128i ax = _mm_sign_epi8(bx, bx);
|
||||
|
||||
// Sign the values of the y vectors
|
||||
const __m128i sy = _mm_sign_epi8(by, bx);
|
||||
|
||||
// Perform multiplication and create 16-bit values
|
||||
const __m128i dot = _mm_maddubs_epi16(ax, sy);
|
||||
|
||||
const __m128i ones = _mm_set1_epi16(1);
|
||||
i32[j] = _mm_madd_epi16(ones, dot);
|
||||
}
|
||||
|
||||
// Convert int32_t to float
|
||||
__m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] ));
|
||||
// Apply the scale, and accumulate
|
||||
acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
|
||||
}
|
||||
|
||||
// Return horizontal sum of the acc vector
|
||||
__m128 res = _mm256_extractf128_ps( acc, 1 );
|
||||
res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
|
||||
res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
|
||||
res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
|
||||
|
||||
sumf = _mm_cvtss_f32( res );
|
||||
#elif defined(__wasm_simd128__)
|
||||
// wasm simd
|
||||
|
|
BIN
llamacpp.dll
BIN
llamacpp.dll
Binary file not shown.
Binary file not shown.
BIN
main.exe
BIN
main.exe
Binary file not shown.
BIN
quantize.exe
BIN
quantize.exe
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue