#include <botan/argon2.h>

#include <immintrin.h>

namespace Botan {

namespace {
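
// Wraps a single AVX2 register, viewed as four 64-bit little-endian words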
class SIMD_4x64 final {
   public:
      SIMD_4x64& operator=(const SIMD_4x64& other) = default;
      SIMD_4x64(const SIMD_4x64& other) = default;

      SIMD_4x64& operator=(SIMD_4x64&& other) = default;
      SIMD_4x64(SIMD_4x64&& other) = default;

      ~SIMD_4x64() = default;
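
      // Default constructor: the register starts out zeroed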
      SIMD_4x64() { m_simd = _mm256_setzero_si256(); }
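
      // Load four words from two separate 16-byte locations: inl fills the
      // low 128 bits, inh the high 128 bits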
      static BOTAN_FUNC_ISA("avx2") SIMD_4x64 load_le2(const void* inl, const void* inh) {
         return SIMD_4x64(
            _mm256_loadu2_m128i(reinterpret_cast<const __m128i*>(inh), reinterpret_cast<const __m128i*>(inl)));
      }

      static BOTAN_FUNC_ISA("avx2") SIMD_4x64 load_le(const void* in) {
         return SIMD_4x64(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in)));
      }
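
      // Store the four words back to memory, little-endian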
      void store_le(uint64_t out[4]) const { this->store_le(reinterpret_cast<uint8_t*>(out)); }

      BOTAN_FUNC_ISA("avx2") void store_le(uint8_t out[]) const {
         _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_simd);
      }
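
      // Scatter-store: the low 128 bits go to outl, the high 128 bits to outh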
      BOTAN_FUNC_ISA("avx2") void store_le2(void* outl, void* outh) {
         _mm256_storeu2_m128i(reinterpret_cast<__m128i*>(outh), reinterpret_cast<__m128i*>(outl), m_simd);
      }

      SIMD_4x64 operator+(const SIMD_4x64& other) const {
         SIMD_4x64 retval(*this);
         retval += other;
         return retval;
      }

      SIMD_4x64 operator^(const SIMD_4x64& other) const {
         SIMD_4x64 retval(*this);
         retval ^= other;
         return retval;
      }

      BOTAN_FUNC_ISA("avx2") void operator+=(const SIMD_4x64& other) {
         m_simd = _mm256_add_epi64(m_simd, other.m_simd);
      }

      BOTAN_FUNC_ISA("avx2") void operator^=(const SIMD_4x64& other) {
         m_simd = _mm256_xor_si256(m_simd, other.m_simd);
      }
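
      // Rotate each 64-bit word right by ROT bits. Rotations by a multiple of
      // eight are done with a byte shuffle, which is cheaper than the generic
      // two-shifts-plus-OR sequence.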
      template <size_t ROT>
      BOTAN_FUNC_ISA("avx2")
      SIMD_4x64 rotr() const
         requires(ROT > 0 && ROT < 64)
      {
         if constexpr(ROT == 16) {
            auto tab = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9,
                                        2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, tab));
         } else if constexpr(ROT == 24) {
            auto tab = _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10,
                                        3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, tab));
         } else if constexpr(ROT == 32) {
            auto tab = _mm256_setr_epi8(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11,
                                        4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11);
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, tab));
         } else {
            return SIMD_4x64(_mm256_or_si256(_mm256_srli_epi64(m_simd, static_cast<int>(ROT)),
                                             _mm256_slli_epi64(m_simd, static_cast<int>(64 - ROT))));
         }
      }

      template <size_t ROT>
      SIMD_4x64 rotl() const {
         return this->rotr<64 - ROT>();
      }
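
      // Argon2's BlaMka multiplication: computes 2 * lo32(x) * lo32(y) in each
      // 64-bit word (_mm256_mul_epu32 multiplies the low halves of each word)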
      static BOTAN_FUNC_ISA("avx2") SIMD_4x64 mul2_32(SIMD_4x64 x, SIMD_4x64 y) {
         const __m256i m = _mm256_mul_epu32(x.m_simd, y.m_simd);
         return SIMD_4x64(_mm256_add_epi64(m, m));
      }
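
      // Permute the four 64-bit words; each 2-bit field of CTRL selects a source word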
      template <uint8_t CTRL>
      static BOTAN_FUNC_ISA("avx2") SIMD_4x64 permute_4x64(SIMD_4x64 x) {
         return SIMD_4x64(_mm256_permute4x64_epi64(x.m_simd, CTRL));
      }
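
      // Rotate the words of B, C and D so the next blamka_G works on the
      // diagonals of the 4x4 state: B <- (1,2,3,0), C <- (2,3,0,1), D <- (3,0,1,2)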
      static void twist(SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
         B = SIMD_4x64::permute_4x64<0b00'11'10'01>(B);
         C = SIMD_4x64::permute_4x64<0b01'00'11'10>(C);
         D = SIMD_4x64::permute_4x64<0b10'01'00'11>(D);
      }
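
      // Inverse of twist()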
      static void untwist(SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
         B = SIMD_4x64::permute_4x64<0b10'01'00'11>(B);
         C = SIMD_4x64::permute_4x64<0b01'00'11'10>(C);
         D = SIMD_4x64::permute_4x64<0b00'11'10'01>(D);
      }

   private:
      explicit BOTAN_FUNC_ISA("avx2") SIMD_4x64(__m256i x) : m_simd(x) {}

      __m256i m_simd;
};
BOTAN_FORCE_INLINE void blamka_G(SIMD_4x64& A, SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
   A += B + SIMD_4x64::mul2_32(A, B);
   D ^= A;
   D = D.rotr<32>();

   C += D + SIMD_4x64::mul2_32(C, D);
   B ^= C;
   B = B.rotr<24>();

   A += B + SIMD_4x64::mul2_32(A, B);
   D ^= A;
   D = D.rotr<16>();

   C += D + SIMD_4x64::mul2_32(C, D);
   B ^= C;
   B = B.rotr<63>();
}
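
// One full BlaMka round: G down the columns, then (after twisting the state)
// G across the diagonals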
BOTAN_FORCE_INLINE void blamka_R(SIMD_4x64& A, SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
   blamka_G(A, B, C, D);

   SIMD_4x64::twist(B, C, D);
   blamka_G(A, B, C, D);
   SIMD_4x64::untwist(B, C, D);
}

}  // namespace
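
// Argon2's permutation over a 1 KiB block (128 64-bit words, viewed as an
// 8x16 matrix): BlaMka rounds over the rows, then over the columns, then the
// permuted result is XORed back into N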
void Argon2::blamka_avx2(uint64_t N[128], uint64_t T[128]) {
   for(size_t i = 0; i != 8; ++i) {
      SIMD_4x64 A = SIMD_4x64::load_le(&N[16 * i + 4 * 0]);
      SIMD_4x64 B = SIMD_4x64::load_le(&N[16 * i + 4 * 1]);
      SIMD_4x64 C = SIMD_4x64::load_le(&N[16 * i + 4 * 2]);
      SIMD_4x64 D = SIMD_4x64::load_le(&N[16 * i + 4 * 3]);

      blamka_R(A, B, C, D);

      A.store_le(&T[16 * i + 4 * 0]);
      B.store_le(&T[16 * i + 4 * 1]);
      C.store_le(&T[16 * i + 4 * 2]);
      D.store_le(&T[16 * i + 4 * 3]);
   }
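
   // Column pass: each register gathers two adjacent words from each of two
   // rows (16 words apart), so A..D together cover one two-word-wide column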
   for(size_t i = 0; i != 8; ++i) {
      SIMD_4x64 A = SIMD_4x64::load_le2(&T[2 * i + 32 * 0], &T[2 * i + 32 * 0 + 16]);
      SIMD_4x64 B = SIMD_4x64::load_le2(&T[2 * i + 32 * 1], &T[2 * i + 32 * 1 + 16]);
      SIMD_4x64 C = SIMD_4x64::load_le2(&T[2 * i + 32 * 2], &T[2 * i + 32 * 2 + 16]);
      SIMD_4x64 D = SIMD_4x64::load_le2(&T[2 * i + 32 * 3], &T[2 * i + 32 * 3 + 16]);

      blamka_R(A, B, C, D);

      A.store_le2(&T[2 * i + 32 * 0], &T[2 * i + 32 * 0 + 16]);
      B.store_le2(&T[2 * i + 32 * 1], &T[2 * i + 32 * 1 + 16]);
      C.store_le2(&T[2 * i + 32 * 2], &T[2 * i + 32 * 2 + 16]);
      D.store_le2(&T[2 * i + 32 * 3], &T[2 * i + 32 * 3 + 16]);
   }
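
   // Finally XOR the permuted block T back into the input block N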
   for(size_t i = 0; i != 128 / 8; ++i) {
      SIMD_4x64 n0 = SIMD_4x64::load_le(&N[8 * i]);
      SIMD_4x64 n1 = SIMD_4x64::load_le(&N[8 * i + 4]);
      SIMD_4x64 t0 = SIMD_4x64::load_le(&T[8 * i]);
      SIMD_4x64 t1 = SIMD_4x64::load_le(&T[8 * i + 4]);

      n0 ^= t0;
      n1 ^= t1;

      n0.store_le(&N[8 * i]);
      n1.store_le(&N[8 * i + 4]);
   }
}

}  // namespace Botan