From b883473a6cbb82e7f60c26105b9aa59569d8dc57 Mon Sep 17 00:00:00 2001
From: Paul Kulchenko
Date: Tue, 21 Sep 2021 22:51:13 -0700
Subject: [PATCH] Remove optimized blamka rounds from Argon2

The optimized blamka rounds occasionally crash with SIGSEGV for me, so
the reference implementation is used instead.
---
 third_party/argon2/blamka-round-opt.h | 467 --------------------------
 third_party/argon2/opt.c              | 283 ----------------
 2 files changed, 750 deletions(-)
 delete mode 100644 third_party/argon2/blamka-round-opt.h
 delete mode 100644 third_party/argon2/opt.c

diff --git a/third_party/argon2/blamka-round-opt.h b/third_party/argon2/blamka-round-opt.h
deleted file mode 100644
index 9c01e5625..000000000
--- a/third_party/argon2/blamka-round-opt.h
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- * Argon2 reference source code package - reference C implementations
- *
- * Copyright 2015
- * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
- *
- * You may use this work under the terms of a Creative Commons CC0 1.0
- * License/Waiver or the Apache Public License 2.0, at your option. The terms of
- * these licenses can be found at:
- *
- * - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0
- * - Apache 2.0 : https://www.apache.org/licenses/LICENSE-2.0
- *
- * You should have received a copy of both of these licenses along with this
- * software. If not, they may be obtained at the above URLs.
- */
-
-#ifndef BLAKE_ROUND_MKA_OPT_H
-#define BLAKE_ROUND_MKA_OPT_H
-
-#include "third_party/argon2/blake2-impl.h"
-
-#include
-#if defined(__SSSE3__)
-#include /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
-#endif
-
-#if !defined(__AVX512F__)
-#if !defined(__AVX2__)
-#if !defined(__XOP__)
-#if defined(__SSSE3__)
-#define r16 \
-  (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
-#define r24 \
-  (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
-#define _mm_roti_epi64(x, c) \
-  (-(c) == 32) \
-      ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
-      : (-(c) == 24) \
-            ? _mm_shuffle_epi8((x), r24) \
-            : (-(c) == 16) \
-                  ? _mm_shuffle_epi8((x), r16) \
-                  : (-(c) == 63) \
-                        ?
_mm_xor_si128(_mm_srli_epi64((x), -(c)), \ - _mm_add_epi64((x), (x))) \ - : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ - _mm_slli_epi64((x), 64 - (-(c)))) -#else /* defined(__SSE2__) */ -#define _mm_roti_epi64(r, c) \ - _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c)))) -#endif -#else -#endif - -static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { - const __m128i z = _mm_mul_epu32(x, y); - return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z)); -} - -#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - A0 = fBlaMka(A0, B0); \ - A1 = fBlaMka(A1, B1); \ - \ - D0 = _mm_xor_si128(D0, A0); \ - D1 = _mm_xor_si128(D1, A1); \ - \ - D0 = _mm_roti_epi64(D0, -32); \ - D1 = _mm_roti_epi64(D1, -32); \ - \ - C0 = fBlaMka(C0, D0); \ - C1 = fBlaMka(C1, D1); \ - \ - B0 = _mm_xor_si128(B0, C0); \ - B1 = _mm_xor_si128(B1, C1); \ - \ - B0 = _mm_roti_epi64(B0, -24); \ - B1 = _mm_roti_epi64(B1, -24); \ - } while ((void)0, 0) - -#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - A0 = fBlaMka(A0, B0); \ - A1 = fBlaMka(A1, B1); \ - \ - D0 = _mm_xor_si128(D0, A0); \ - D1 = _mm_xor_si128(D1, A1); \ - \ - D0 = _mm_roti_epi64(D0, -16); \ - D1 = _mm_roti_epi64(D1, -16); \ - \ - C0 = fBlaMka(C0, D0); \ - C1 = fBlaMka(C1, D1); \ - \ - B0 = _mm_xor_si128(B0, C0); \ - B1 = _mm_xor_si128(B1, C1); \ - \ - B0 = _mm_roti_epi64(B0, -63); \ - B1 = _mm_roti_epi64(B1, -63); \ - } while ((void)0, 0) - -#if defined(__SSSE3__) -#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - __m128i t0 = _mm_alignr_epi8(B1, B0, 8); \ - __m128i t1 = _mm_alignr_epi8(B0, B1, 8); \ - B0 = t0; \ - B1 = t1; \ - \ - t0 = C0; \ - C0 = C1; \ - C1 = t0; \ - \ - t0 = _mm_alignr_epi8(D1, D0, 8); \ - t1 = _mm_alignr_epi8(D0, D1, 8); \ - D0 = t1; \ - D1 = t0; \ - } while ((void)0, 0) - -#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - __m128i t0 = _mm_alignr_epi8(B0, B1, 8); \ - __m128i t1 = _mm_alignr_epi8(B1, B0, 8); \ - B0 = t0; \ - B1 = t1; \ - \ - t0 = C0; \ - C0 = C1; \ - C1 = t0; \ - \ - t0 = _mm_alignr_epi8(D0, D1, 8); \ - t1 = _mm_alignr_epi8(D1, D0, 8); \ - D0 = t1; \ - D1 = t0; \ - } while ((void)0, 0) -#else /* SSE2 */ -#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - __m128i t0 = D0; \ - __m128i t1 = B0; \ - D0 = C0; \ - C0 = C1; \ - C1 = D0; \ - D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \ - D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \ - B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \ - B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \ - } while ((void)0, 0) - -#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - __m128i t0, t1; \ - t0 = C0; \ - C0 = C1; \ - C1 = t0; \ - t0 = B0; \ - t1 = D0; \ - B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \ - B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \ - D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \ - D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \ - } while ((void)0, 0) -#endif - -#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \ - do { \ - G1(A0, B0, C0, D0, A1, B1, C1, D1); \ - G2(A0, B0, C0, D0, A1, B1, C1, D1); \ - \ - DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ - \ - G1(A0, B0, C0, D0, A1, B1, C1, D1); \ - G2(A0, B0, C0, D0, A1, B1, C1, D1); \ - \ - UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ - } while ((void)0, 0) -#else /* __AVX2__ */ - -#include - -#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)) -#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 
7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)) -#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)) -#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x))) - -#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - do { \ - __m256i ml = _mm256_mul_epu32(A0, B0); \ - ml = _mm256_add_epi64(ml, ml); \ - A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \ - D0 = _mm256_xor_si256(D0, A0); \ - D0 = rotr32(D0); \ - \ - ml = _mm256_mul_epu32(C0, D0); \ - ml = _mm256_add_epi64(ml, ml); \ - C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \ - \ - B0 = _mm256_xor_si256(B0, C0); \ - B0 = rotr24(B0); \ - \ - ml = _mm256_mul_epu32(A1, B1); \ - ml = _mm256_add_epi64(ml, ml); \ - A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \ - D1 = _mm256_xor_si256(D1, A1); \ - D1 = rotr32(D1); \ - \ - ml = _mm256_mul_epu32(C1, D1); \ - ml = _mm256_add_epi64(ml, ml); \ - C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \ - \ - B1 = _mm256_xor_si256(B1, C1); \ - B1 = rotr24(B1); \ - } while((void)0, 0); - -#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - do { \ - __m256i ml = _mm256_mul_epu32(A0, B0); \ - ml = _mm256_add_epi64(ml, ml); \ - A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \ - D0 = _mm256_xor_si256(D0, A0); \ - D0 = rotr16(D0); \ - \ - ml = _mm256_mul_epu32(C0, D0); \ - ml = _mm256_add_epi64(ml, ml); \ - C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \ - B0 = _mm256_xor_si256(B0, C0); \ - B0 = rotr63(B0); \ - \ - ml = _mm256_mul_epu32(A1, B1); \ - ml = _mm256_add_epi64(ml, ml); \ - A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \ - D1 = _mm256_xor_si256(D1, A1); \ - D1 = rotr16(D1); \ - \ - ml = _mm256_mul_epu32(C1, D1); \ - ml = _mm256_add_epi64(ml, ml); \ - C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \ - B1 = _mm256_xor_si256(B1, C1); \ - B1 = rotr63(B1); \ - } while((void)0, 0); - -#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \ - C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ - D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \ - \ - B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \ - C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ - D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \ - } while((void)0, 0); - -#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ - do { \ - __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \ - __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \ - B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ - B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ - \ - tmp1 = C0; \ - C0 = C1; \ - C1 = tmp1; \ - \ - tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \ - tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \ - D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ - D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ - } while(0); - -#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \ - C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ - D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \ - \ - B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \ - C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ - D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); 
\ - } while((void)0, 0); - -#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ - do { \ - __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \ - __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \ - B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ - B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ - \ - tmp1 = C0; \ - C0 = C1; \ - C1 = tmp1; \ - \ - tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \ - tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \ - D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ - D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ - } while((void)0, 0); - -#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \ - do{ \ - G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - \ - DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ - \ - G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - \ - UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ - } while((void)0, 0); - -#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \ - do{ \ - G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - \ - DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ - \ - G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - \ - UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ - } while((void)0, 0); - -#endif /* __AVX2__ */ - -#else /* __AVX512F__ */ - -#error __AVX512F__ Not supported - -#define ror64(x, n) _mm512_ror_epi64((x), (n)) - -static __m512i muladd(__m512i x, __m512i y) -{ - __m512i z = _mm512_mul_epu32(x, y); - return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z)); -} - -#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - A0 = muladd(A0, B0); \ - A1 = muladd(A1, B1); \ -\ - D0 = _mm512_xor_si512(D0, A0); \ - D1 = _mm512_xor_si512(D1, A1); \ -\ - D0 = ror64(D0, 32); \ - D1 = ror64(D1, 32); \ -\ - C0 = muladd(C0, D0); \ - C1 = muladd(C1, D1); \ -\ - B0 = _mm512_xor_si512(B0, C0); \ - B1 = _mm512_xor_si512(B1, C1); \ -\ - B0 = ror64(B0, 24); \ - B1 = ror64(B1, 24); \ - } while ((void)0, 0) - -#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - A0 = muladd(A0, B0); \ - A1 = muladd(A1, B1); \ -\ - D0 = _mm512_xor_si512(D0, A0); \ - D1 = _mm512_xor_si512(D1, A1); \ -\ - D0 = ror64(D0, 16); \ - D1 = ror64(D1, 16); \ -\ - C0 = muladd(C0, D0); \ - C1 = muladd(C1, D1); \ -\ - B0 = _mm512_xor_si512(B0, C0); \ - B1 = _mm512_xor_si512(B1, C1); \ -\ - B0 = ror64(B0, 63); \ - B1 = ror64(B1, 63); \ - } while ((void)0, 0) - -#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \ - B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \ -\ - C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ - C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ -\ - D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \ - D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \ - } while ((void)0, 0) - -#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \ - B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \ -\ - C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ - C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ -\ - D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \ - D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \ - } while ((void)0, 0) - -#define BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - G1(A0, B0, C0, 
D0, A1, B1, C1, D1); \ - G2(A0, B0, C0, D0, A1, B1, C1, D1); \ -\ - DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ -\ - G1(A0, B0, C0, D0, A1, B1, C1, D1); \ - G2(A0, B0, C0, D0, A1, B1, C1, D1); \ -\ - UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ - } while ((void)0, 0) - -#define SWAP_HALVES(A0, A1) \ - do { \ - __m512i t0, t1; \ - t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \ - t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \ - A0 = t0; \ - A1 = t1; \ - } while((void)0, 0) - -#define SWAP_QUARTERS(A0, A1) \ - do { \ - SWAP_HALVES(A0, A1); \ - A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \ - A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \ - } while((void)0, 0) - -#define UNSWAP_QUARTERS(A0, A1) \ - do { \ - A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \ - A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \ - SWAP_HALVES(A0, A1); \ - } while((void)0, 0) - -#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \ - do { \ - SWAP_HALVES(A0, B0); \ - SWAP_HALVES(C0, D0); \ - SWAP_HALVES(A1, B1); \ - SWAP_HALVES(C1, D1); \ - BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \ - SWAP_HALVES(A0, B0); \ - SWAP_HALVES(C0, D0); \ - SWAP_HALVES(A1, B1); \ - SWAP_HALVES(C1, D1); \ - } while ((void)0, 0) - -#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \ - do { \ - SWAP_QUARTERS(A0, A1); \ - SWAP_QUARTERS(B0, B1); \ - SWAP_QUARTERS(C0, C1); \ - SWAP_QUARTERS(D0, D1); \ - BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \ - UNSWAP_QUARTERS(A0, A1); \ - UNSWAP_QUARTERS(B0, B1); \ - UNSWAP_QUARTERS(C0, C1); \ - UNSWAP_QUARTERS(D0, D1); \ - } while ((void)0, 0) - -#endif /* __AVX512F__ */ -#endif /* BLAKE_ROUND_MKA_OPT_H */ diff --git a/third_party/argon2/opt.c b/third_party/argon2/opt.c deleted file mode 100644 index a069e55f2..000000000 --- a/third_party/argon2/opt.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Argon2 reference source code package - reference C implementations - * - * Copyright 2015 - * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves - * - * You may use this work under the terms of a Creative Commons CC0 1.0 - * License/Waiver or the Apache Public License 2.0, at your option. The terms of - * these licenses can be found at: - * - * - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0 - * - Apache 2.0 : https://www.apache.org/licenses/LICENSE-2.0 - * - * You should have received a copy of both of these licenses along with this - * software. If not, they may be obtained at the above URLs. - */ - -#include -#include -#include - -#include "third_party/argon2/argon2.h" -#include "third_party/argon2/core.h" - -#include "third_party/argon2/blake2.h" -#include "third_party/argon2/blamka-round-opt.h" - -/* - * Function fills a new memory block and optionally XORs the old block over the new one. - * Memory must be initialized. - * @param state Pointer to the just produced block. Content will be updated(!) - * @param ref_block Pointer to the reference block - * @param next_block Pointer to the block to be XORed over. 
May coincide with @ref_block - * @param with_xor Whether to XOR into the new block (1) or just overwrite (0) - * @pre all block pointers must be valid - */ -#if defined(__AVX512F__) -static void fill_block(__m512i *state, const block *ref_block, - block *next_block, int with_xor) { - __m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK]; - unsigned int i; - - if (with_xor) { - for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) { - state[i] = _mm512_xor_si512( - state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i)); - block_XY[i] = _mm512_xor_si512( - state[i], _mm512_loadu_si512((const __m512i *)next_block->v + i)); - } - } else { - for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) { - block_XY[i] = state[i] = _mm512_xor_si512( - state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i)); - } - } - - for (i = 0; i < 2; ++i) { - BLAKE2_ROUND_1( - state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3], - state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]); - } - - for (i = 0; i < 2; ++i) { - BLAKE2_ROUND_2( - state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i], - state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]); - } - - for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) { - state[i] = _mm512_xor_si512(state[i], block_XY[i]); - _mm512_storeu_si512((__m512i *)next_block->v + i, state[i]); - } -} -#elif defined(__AVX2__) -static void fill_block(__m256i *state, const block *ref_block, - block *next_block, int with_xor) { - __m256i block_XY[ARGON2_HWORDS_IN_BLOCK]; - unsigned int i; - - if (with_xor) { - for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) { - state[i] = _mm256_xor_si256( - state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i)); - block_XY[i] = _mm256_xor_si256( - state[i], _mm256_loadu_si256((const __m256i *)next_block->v + i)); - } - } else { - for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) { - block_XY[i] = state[i] = _mm256_xor_si256( - state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i)); - } - } - - for (i = 0; i < 4; ++i) { - BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5], - state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]); - } - - for (i = 0; i < 4; ++i) { - BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i], - state[16 + i], state[20 + i], state[24 + i], state[28 + i]); - } - - for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) { - state[i] = _mm256_xor_si256(state[i], block_XY[i]); - _mm256_storeu_si256((__m256i *)next_block->v + i, state[i]); - } -} -#else -static void fill_block(__m128i *state, const block *ref_block, - block *next_block, int with_xor) { - __m128i block_XY[ARGON2_OWORDS_IN_BLOCK]; - unsigned int i; - - if (with_xor) { - for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { - state[i] = _mm_xor_si128( - state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i)); - block_XY[i] = _mm_xor_si128( - state[i], _mm_loadu_si128((const __m128i *)next_block->v + i)); - } - } else { - for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { - block_XY[i] = state[i] = _mm_xor_si128( - state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i)); - } - } - - for (i = 0; i < 8; ++i) { - BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], - state[8 * i + 3], state[8 * i + 4], state[8 * i + 5], - state[8 * i + 6], state[8 * i + 7]); - } - - for (i = 0; i < 8; ++i) { - BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], - state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i], 
- state[8 * 6 + i], state[8 * 7 + i]); - } - - for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { - state[i] = _mm_xor_si128(state[i], block_XY[i]); - _mm_storeu_si128((__m128i *)next_block->v + i, state[i]); - } -} -#endif - -static void next_addresses(block *address_block, block *input_block) { - /*Temporary zero-initialized blocks*/ -#if defined(__AVX512F__) - __m512i zero_block[ARGON2_512BIT_WORDS_IN_BLOCK]; - __m512i zero2_block[ARGON2_512BIT_WORDS_IN_BLOCK]; -#elif defined(__AVX2__) - __m256i zero_block[ARGON2_HWORDS_IN_BLOCK]; - __m256i zero2_block[ARGON2_HWORDS_IN_BLOCK]; -#else - __m128i zero_block[ARGON2_OWORDS_IN_BLOCK]; - __m128i zero2_block[ARGON2_OWORDS_IN_BLOCK]; -#endif - - memset(zero_block, 0, sizeof(zero_block)); - memset(zero2_block, 0, sizeof(zero2_block)); - - /*Increasing index counter*/ - input_block->v[6]++; - - /*First iteration of G*/ - fill_block(zero_block, input_block, address_block, 0); - - /*Second iteration of G*/ - fill_block(zero2_block, address_block, address_block, 0); -} - -void fill_segment(const argon2_instance_t *instance, - argon2_position_t position) { - block *ref_block = NULL, *curr_block = NULL; - block address_block, input_block; - uint64_t pseudo_rand, ref_index, ref_lane; - uint32_t prev_offset, curr_offset; - uint32_t starting_index, i; -#if defined(__AVX512F__) - __m512i state[ARGON2_512BIT_WORDS_IN_BLOCK]; -#elif defined(__AVX2__) - __m256i state[ARGON2_HWORDS_IN_BLOCK]; -#else - __m128i state[ARGON2_OWORDS_IN_BLOCK]; -#endif - int data_independent_addressing; - - if (instance == NULL) { - return; - } - - data_independent_addressing = - (instance->type == Argon2_i) || - (instance->type == Argon2_id && (position.pass == 0) && - (position.slice < ARGON2_SYNC_POINTS / 2)); - - if (data_independent_addressing) { - init_block_value(&input_block, 0); - - input_block.v[0] = position.pass; - input_block.v[1] = position.lane; - input_block.v[2] = position.slice; - input_block.v[3] = instance->memory_blocks; - input_block.v[4] = instance->passes; - input_block.v[5] = instance->type; - } - - starting_index = 0; - - if ((0 == position.pass) && (0 == position.slice)) { - starting_index = 2; /* we have already generated the first two blocks */ - - /* Don't forget to generate the first block of addresses: */ - if (data_independent_addressing) { - next_addresses(&address_block, &input_block); - } - } - - /* Offset of the current block */ - curr_offset = position.lane * instance->lane_length + - position.slice * instance->segment_length + starting_index; - - if (0 == curr_offset % instance->lane_length) { - /* Last block in this lane */ - prev_offset = curr_offset + instance->lane_length - 1; - } else { - /* Previous block */ - prev_offset = curr_offset - 1; - } - - memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE); - - for (i = starting_index; i < instance->segment_length; - ++i, ++curr_offset, ++prev_offset) { - /*1.1 Rotating prev_offset if needed */ - if (curr_offset % instance->lane_length == 1) { - prev_offset = curr_offset - 1; - } - - /* 1.2 Computing the index of the reference block */ - /* 1.2.1 Taking pseudo-random value from the previous block */ - if (data_independent_addressing) { - if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) { - next_addresses(&address_block, &input_block); - } - pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK]; - } else { - pseudo_rand = instance->memory[prev_offset].v[0]; - } - - /* 1.2.2 Computing the lane of the reference block */ - ref_lane = ((pseudo_rand >> 32)) % instance->lanes; - - if 
((position.pass == 0) && (position.slice == 0)) {
-            /* Can not reference other lanes yet */
-            ref_lane = position.lane;
-        }
-
-        /* 1.2.3 Computing the number of possible reference block within the
-         * lane.
-         */
-        position.index = i;
-        ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
-                                ref_lane == position.lane);
-
-        /* 2 Creating a new block */
-        ref_block =
-            instance->memory + instance->lane_length * ref_lane + ref_index;
-        curr_block = instance->memory + curr_offset;
-        if (ARGON2_VERSION_10 == instance->version) {
-            /* version 1.2.1 and earlier: overwrite, not XOR */
-            fill_block(state, ref_block, curr_block, 0);
-        } else {
-            if(0 == position.pass) {
-                fill_block(state, ref_block, curr_block, 0);
-            } else {
-                fill_block(state, ref_block, curr_block, 1);
-            }
-        }
-    }
-}