#include <botan/argon2.h>

#include <immintrin.h>

namespace Botan {

namespace {

class SIMD_4x64 final
   {
   public:
      SIMD_4x64& operator=(const SIMD_4x64& other) = default;
      SIMD_4x64(const SIMD_4x64& other) = default;

      SIMD_4x64& operator=(SIMD_4x64&& other) = default;
      SIMD_4x64(SIMD_4x64&& other) = default;

      ~SIMD_4x64() = default;
      // Default constructor: zero initialized
      BOTAN_FUNC_ISA("avx2") SIMD_4x64()
         {
         m_simd = _mm256_setzero_si256();
         }
      // Load two 128-bit halves from two (possibly unaligned) pointers
      static BOTAN_FUNC_ISA("avx2") SIMD_4x64 load_le2(const void* inl, const void* inh)
         {
         return SIMD_4x64(_mm256_loadu2_m128i(
            reinterpret_cast<const __m128i*>(inl),
            reinterpret_cast<const __m128i*>(inh)));
         }
      static BOTAN_FUNC_ISA("avx2") SIMD_4x64 load_le(const void* in)
         {
         return SIMD_4x64(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in)));
         }
      void store_le(uint64_t out[4]) const
         {
         this->store_le(reinterpret_cast<uint8_t*>(out));
         }
      BOTAN_FUNC_ISA("avx2") void store_le(uint8_t out[]) const
         {
         _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_simd);
         }
      // Store the two 128-bit halves to two (possibly unaligned) pointers
      BOTAN_FUNC_ISA("avx2") void store_le2(void* outh, void* outl)
         {
         _mm256_storeu2_m128i(
            reinterpret_cast<__m128i*>(outh),
            reinterpret_cast<__m128i*>(outl),
            m_simd);
         }
      SIMD_4x64 BOTAN_FUNC_ISA("avx2") operator+(const SIMD_4x64& other) const
         {
         SIMD_4x64 retval(*this);
         retval += other;
         return retval;
         }
      SIMD_4x64 BOTAN_FUNC_ISA("avx2") operator^(const SIMD_4x64& other) const
         {
         SIMD_4x64 retval(*this);
         retval ^= other;
         return retval;
         }
      BOTAN_FUNC_ISA("avx2") void operator+=(const SIMD_4x64& other)
         {
         m_simd = _mm256_add_epi64(m_simd, other.m_simd);
         }
      BOTAN_FUNC_ISA("avx2") void operator^=(const SIMD_4x64& other)
         {
         m_simd = _mm256_xor_si256(m_simd, other.m_simd);
         }
      // Rotate each 64-bit lane right by ROT bits
      template<size_t ROT>
      BOTAN_FUNC_ISA("avx2") SIMD_4x64 rotr() const
         {
         static_assert(ROT > 0 && ROT < 64, "Invalid rotation constant");

         // Rotations by a multiple of 8 bits are byte permutations, so they can be
         // done with a single vpshufb; other amounts use two shifts plus an OR
         if constexpr(ROT == 16)
            {
            auto tab = _mm256_setr_epi8(
               2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9,
               2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, tab));
            }
         else if constexpr(ROT == 24)
            {
            auto tab = _mm256_setr_epi8(
               3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10,
               3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, tab));
            }
         else if constexpr(ROT == 32)
            {
            auto tab = _mm256_setr_epi8(
               4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11,
               4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11);
            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, tab));
            }
         else
            {
            return SIMD_4x64(_mm256_or_si256(_mm256_srli_epi64(m_simd, static_cast<int>(ROT)),
                                             _mm256_slli_epi64(m_simd, static_cast<int>(64-ROT))));
            }
         }
      // Rotate each 64-bit lane left by ROT bits
      template<size_t ROT>
      BOTAN_FUNC_ISA("avx2") SIMD_4x64 rotl() const
         {
         return this->rotr<64-ROT>();
         }
      // Argon2's BlaMka multiplication: per 64-bit lane, 2 * low32(x) * low32(y)
      static BOTAN_FUNC_ISA("avx2") SIMD_4x64 mul2_32(SIMD_4x64 x, SIMD_4x64 y)
         {
         const __m256i m = _mm256_mul_epu32(x.m_simd, y.m_simd);
         return SIMD_4x64(_mm256_add_epi64(m, m));
         }
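
      // Reference-only scalar equivalent of mul2_32, added here as an illustration
      // (not part of the original file): the BlaMka "multiply then double" step
      // that replaces the plain addition used by BLAKE2b's G function
      static uint64_t mul2_32_scalar(uint64_t x, uint64_t y)
         {
         const uint64_t m = (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
         return m + m;
         }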
      template<uint8_t CTRL>
      static BOTAN_FUNC_ISA("avx2") SIMD_4x64 permute_4x64(SIMD_4x64 x)
         {
         return SIMD_4x64(_mm256_permute4x64_epi64(x.m_simd, CTRL));
         }
      // Rotate the B/C/D rows so the diagonal step of the round can reuse the
      // column-oriented G function (BLAKE2-style diagonalization)
      static BOTAN_FUNC_ISA("avx2") void twist(SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D)
         {
         B = SIMD_4x64::permute_4x64<0b00'11'10'01>(B);
         C = SIMD_4x64::permute_4x64<0b01'00'11'10>(C);
         D = SIMD_4x64::permute_4x64<0b10'01'00'11>(D);
         }
      // Inverse of twist
      static BOTAN_FUNC_ISA("avx2") void untwist(SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D)
         {
         B = SIMD_4x64::permute_4x64<0b10'01'00'11>(B);
         C = SIMD_4x64::permute_4x64<0b01'00'11'10>(C);
         D = SIMD_4x64::permute_4x64<0b00'11'10'01>(D);
         }
      explicit BOTAN_FUNC_ISA("avx2") SIMD_4x64(__m256i x) : m_simd(x) {}

   private:
      __m256i m_simd;
   };
BOTAN_FORCE_INLINE BOTAN_FUNC_ISA("avx2")
void blamka_G(SIMD_4x64& A, SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D)
   {
   // The BlaMka G function (Argon2's variant of BLAKE2b's G), applied to four
   // columns in parallel; the rotation amounts 32/24/16/63 are as in the Argon2 spec
   A += B + SIMD_4x64::mul2_32(A, B);
   D ^= A;
   D = D.rotr<32>();

   C += D + SIMD_4x64::mul2_32(C, D);
   B ^= C;
   B = B.rotr<24>();

   A += B + SIMD_4x64::mul2_32(A, B);
   D ^= A;
   D = D.rotr<16>();

   C += D + SIMD_4x64::mul2_32(C, D);
   B ^= C;
   B = B.rotr<63>();
   }
BOTAN_FORCE_INLINE BOTAN_FUNC_ISA("avx2")
void blamka_R(SIMD_4x64& A, SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D)
   {
   // Column step, then diagonal step (via twist/untwist of the B/C/D rows)
   blamka_G(A, B, C, D);

   SIMD_4x64::twist(B, C, D);
   blamka_G(A, B, C, D);
   SIMD_4x64::untwist(B, C, D);
   }
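
// Reference-only scalar version of the BlaMka G function, following the Argon2
// specification (RFC 9106); added here for exposition and not part of the
// original file. The AVX2 blamka_G above performs this same sequence on four
// columns at once.
[[maybe_unused]] void blamka_G_scalar(uint64_t& a, uint64_t& b, uint64_t& c, uint64_t& d)
   {
   auto mul2_32 = [](uint64_t x, uint64_t y) -> uint64_t
      {
      return 2 * ((x & 0xFFFFFFFF) * (y & 0xFFFFFFFF));
      };
   auto rotr64 = [](uint64_t x, size_t r) -> uint64_t
      {
      return (x >> r) | (x << (64 - r));
      };

   a += b + mul2_32(a, b);
   d = rotr64(d ^ a, 32);
   c += d + mul2_32(c, d);
   b = rotr64(b ^ c, 24);
   a += b + mul2_32(a, b);
   d = rotr64(d ^ a, 16);
   c += d + mul2_32(c, d);
   b = rotr64(b ^ c, 63);
   }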
}

BOTAN_FUNC_ISA("avx2")
void Argon2::blamka_avx2(uint64_t N[128], uint64_t T[128])
   {
   // Treat the 1024-byte block as an 8x8 matrix of 16-byte registers.
   // First pass: apply the BlaMka round to each row of N, writing the result to T
   for(size_t i = 0; i != 8; ++i)
      {
      SIMD_4x64 A = SIMD_4x64::load_le(&N[16*i + 4*0]);
      SIMD_4x64 B = SIMD_4x64::load_le(&N[16*i + 4*1]);
      SIMD_4x64 C = SIMD_4x64::load_le(&N[16*i + 4*2]);
      SIMD_4x64 D = SIMD_4x64::load_le(&N[16*i + 4*3]);

      blamka_R(A, B, C, D);

      A.store_le(&T[16*i + 4*0]);
      B.store_le(&T[16*i + 4*1]);
      C.store_le(&T[16*i + 4*2]);
      D.store_le(&T[16*i + 4*3]);
      }
   // Second pass: apply the round to each column of T, gathering the two
   // 16-byte halves of each register with load_le2/store_le2
   for(size_t i = 0; i != 8; ++i)
      {
      SIMD_4x64 A = SIMD_4x64::load_le2(&T[2*i + 32*0], &T[2*i + 32*0 + 16]);
      SIMD_4x64 B = SIMD_4x64::load_le2(&T[2*i + 32*1], &T[2*i + 32*1 + 16]);
      SIMD_4x64 C = SIMD_4x64::load_le2(&T[2*i + 32*2], &T[2*i + 32*2 + 16]);
      SIMD_4x64 D = SIMD_4x64::load_le2(&T[2*i + 32*3], &T[2*i + 32*3 + 16]);

      blamka_R(A, B, C, D);

      A.store_le2(&T[2*i + 32*0], &T[2*i + 32*0 + 16]);
      B.store_le2(&T[2*i + 32*1], &T[2*i + 32*1 + 16]);
      C.store_le2(&T[2*i + 32*2], &T[2*i + 32*2 + 16]);
      D.store_le2(&T[2*i + 32*3], &T[2*i + 32*3 + 16]);
      }
   // Finally XOR the fully permuted state T back into N
   for(size_t i = 0; i != 128 / 8; ++i)
      {
      SIMD_4x64 n0 = SIMD_4x64::load_le(&N[8*i]);
      SIMD_4x64 n1 = SIMD_4x64::load_le(&N[8*i+4]);
      SIMD_4x64 t0 = SIMD_4x64::load_le(&T[8*i]);
      SIMD_4x64 t1 = SIMD_4x64::load_le(&T[8*i+4]);

      n0 ^= t0;
      n1 ^= t1;
      n0.store_le(&N[8*i]);
      n1.store_le(&N[8*i+4]);
      }
   }

}
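
/*
* Usage sketch (an assumption for illustration, not from this file): the Argon2
* block compression would select this routine only when AVX2 is available at
* runtime, falling back to a portable implementation otherwise. The names
* CPUID::has_avx2 and blamka_generic below are illustrative assumptions.
*
*   void Argon2::blamka(uint64_t N[128], uint64_t T[128])
*      {
*   #if defined(BOTAN_HAS_AVX2)
*      if(CPUID::has_avx2())
*         { return blamka_avx2(N, T); }
*   #endif
*      blamka_generic(N, T);
*      }
*/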