#include <botan/argon2.h>

#include <immintrin.h>

namespace Botan {

namespace {

class SIMD_4x64 final {
   public:
      SIMD_4x64& operator=(const SIMD_4x64& other) = default;
      SIMD_4x64(const SIMD_4x64& other) = default;

      SIMD_4x64& operator=(SIMD_4x64&& other) = default;
      SIMD_4x64(SIMD_4x64&& other) = default;

      ~SIMD_4x64() = default;

      BOTAN_FUNC_ISA("avx2") SIMD_4x64() { m_simd = _mm256_setzero_si256(); }

      static BOTAN_FUNC_ISA("avx2") SIMD_4x64 load_le2(const void* inl, const void* inh) {
         return SIMD_4x64(
            _mm256_loadu2_m128i(reinterpret_cast<const __m128i*>(inl), reinterpret_cast<const __m128i*>(inh)));
      }

      static BOTAN_FUNC_ISA("avx2") SIMD_4x64 load_le(const void* in) {
         return SIMD_4x64(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in)));
      }

      void store_le(uint64_t out[4]) const { this->store_le(reinterpret_cast<uint8_t*>(out)); }

      BOTAN_FUNC_ISA("avx2") void store_le(uint8_t out[]) const {
         _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_simd);
      }

      BOTAN_FUNC_ISA("avx2") void store_le2(void* outh, void* outl) {
         _mm256_storeu2_m128i(reinterpret_cast<__m128i*>(outh), reinterpret_cast<__m128i*>(outl), m_simd);
      }

      SIMD_4x64 operator+(const SIMD_4x64& other) const {
         SIMD_4x64 retval(*this);
         retval += other;
         return retval;
      }

      SIMD_4x64 operator^(const SIMD_4x64& other) const {
         SIMD_4x64 retval(*this);
         retval ^= other;
         return retval;
      }

      BOTAN_FUNC_ISA("avx2") void operator+=(const SIMD_4x64& other) { m_simd = _mm256_add_epi64(m_simd, other.m_simd); }

      BOTAN_FUNC_ISA("avx2") void operator^=(const SIMD_4x64& other) { m_simd = _mm256_xor_si256(m_simd, other.m_simd); }
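
      // Rotations by 16, 24 and 32 bits are implemented below as a per-lane
      // byte shuffle (_mm256_shuffle_epi8); any other amount falls back to
      // the generic shift/shift/or sequence.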
      template <size_t ROT>
      BOTAN_FUNC_ISA("avx2") SIMD_4x64 rotr() const
         requires(ROT > 0 && ROT < 64)
      {
         if constexpr(ROT == 16) {
            auto shuf_rot_16 =
               _mm256_set_epi64x(0x09080f0e0d0c0b0a, 0x0100070605040302, 0x09080f0e0d0c0b0a, 0x0100070605040302);

            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_16));
         } else if constexpr(ROT == 24) {
            auto shuf_rot_24 =
               _mm256_set_epi64x(0x0a09080f0e0d0c0b, 0x0201000706050403, 0x0a09080f0e0d0c0b, 0x0201000706050403);

            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_24));
         } else if constexpr(ROT == 32) {
            auto shuf_rot_32 =
               _mm256_set_epi64x(0x0b0a09080f0e0d0c, 0x0302010007060504, 0x0b0a09080f0e0d0c, 0x0302010007060504);

            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_32));
         } else {
            return SIMD_4x64(_mm256_or_si256(_mm256_srli_epi64(m_simd, static_cast<int>(ROT)),
                                             _mm256_slli_epi64(m_simd, static_cast<int>(64 - ROT))));
         }
      }

      template <size_t ROT>
      SIMD_4x64 rotl() const {
         return this->rotr<64 - ROT>();
      }
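
      // Argon2-specific operation: _mm256_mul_epu32 multiplies the low 32
      // bits of each 64-bit lane, so mul2_32(x, y) yields
      // 2 * lo32(x) * lo32(y) per lane, the multiplicative term of the
      // BlaMka function.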
      static BOTAN_FUNC_ISA("avx2") SIMD_4x64 mul2_32(SIMD_4x64 x, SIMD_4x64 y) {
         const __m256i m = _mm256_mul_epu32(x.m_simd, y.m_simd);
         return SIMD_4x64(_mm256_add_epi64(m, m));
      }

      template <uint8_t CTRL>
      static BOTAN_FUNC_ISA("avx2") SIMD_4x64 permute_4x64(SIMD_4x64 x) {
         return SIMD_4x64(_mm256_permute4x64_epi64(x.m_simd, CTRL));
      }
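
      // twist rotates the 64-bit lanes of B, C and D by 1, 2 and 3 positions
      // respectively, so that the following blamka_G call mixes diagonals
      // rather than columns; untwist undoes the rotation. This mirrors the
      // column/diagonal structure of BLAKE2b's round function.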
      static void twist(SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
         B = SIMD_4x64::permute_4x64<0b00'11'10'01>(B);
         C = SIMD_4x64::permute_4x64<0b01'00'11'10>(C);
         D = SIMD_4x64::permute_4x64<0b10'01'00'11>(D);
      }

      static void untwist(SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
         B = SIMD_4x64::permute_4x64<0b10'01'00'11>(B);
         C = SIMD_4x64::permute_4x64<0b01'00'11'10>(C);
         D = SIMD_4x64::permute_4x64<0b00'11'10'01>(D);
      }

      explicit BOTAN_FUNC_ISA("avx2") SIMD_4x64(__m256i x) : m_simd(x) {}

   private:
      __m256i m_simd;
};
BOTAN_FORCE_INLINE void blamka_G(SIMD_4x64& A, SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
   A += B + SIMD_4x64::mul2_32(A, B);
   D ^= A;
   D = D.rotr<32>();

   C += D + SIMD_4x64::mul2_32(C, D);
   B ^= C;
   B = B.rotr<24>();

   A += B + SIMD_4x64::mul2_32(A, B);
   D ^= A;
   D = D.rotr<16>();

   C += D + SIMD_4x64::mul2_32(C, D);
   B ^= C;
   B = B.rotr<63>();
}
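
// For reference, the scalar form of the mixing step used above. fblamka_ref
// is only an illustrative sketch of Argon2's BlaMka primitive
// a + b + 2 * lo32(a) * lo32(b); nothing else in this file uses it.
[[maybe_unused]] inline uint64_t fblamka_ref(uint64_t a, uint64_t b) {
   const uint64_t a_lo = a & 0xFFFFFFFF;
   const uint64_t b_lo = b & 0xFFFFFFFF;
   return a + b + 2 * a_lo * b_lo;
}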

BOTAN_FORCE_INLINE void blamka_R(SIMD_4x64& A, SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
   blamka_G(A, B, C, D);

   SIMD_4x64::twist(B, C, D);
   blamka_G(A, B, C, D);
   SIMD_4x64::untwist(B, C, D);
}

}  // namespace

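// Core of Argon2's compression function on a 1 KiB (128 x 64-bit word)
// block: apply the permutation row-wise from N into the scratch block T,
// then column-wise over T, and finally XOR T back into N.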
void Argon2::blamka_avx2(uint64_t N[128], uint64_t T[128]) {
   for(size_t i = 0; i != 8; ++i) {
      SIMD_4x64 A = SIMD_4x64::load_le(&N[16 * i + 4 * 0]);
      SIMD_4x64 B = SIMD_4x64::load_le(&N[16 * i + 4 * 1]);
      SIMD_4x64 C = SIMD_4x64::load_le(&N[16 * i + 4 * 2]);
      SIMD_4x64 D = SIMD_4x64::load_le(&N[16 * i + 4 * 3]);

      blamka_R(A, B, C, D);

      A.store_le(&T[16 * i + 4 * 0]);
      B.store_le(&T[16 * i + 4 * 1]);
      C.store_le(&T[16 * i + 4 * 2]);
      D.store_le(&T[16 * i + 4 * 3]);
   }
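
   // Second pass: regroup T so that each register holds pairs of words taken
   // 16 words apart, applying blamka_R across the columns of the block.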
   for(size_t i = 0; i != 8; ++i) {
      SIMD_4x64 A = SIMD_4x64::load_le2(&T[2 * i + 32 * 0], &T[2 * i + 32 * 0 + 16]);
      SIMD_4x64 B = SIMD_4x64::load_le2(&T[2 * i + 32 * 1], &T[2 * i + 32 * 1 + 16]);
      SIMD_4x64 C = SIMD_4x64::load_le2(&T[2 * i + 32 * 2], &T[2 * i + 32 * 2 + 16]);
      SIMD_4x64 D = SIMD_4x64::load_le2(&T[2 * i + 32 * 3], &T[2 * i + 32 * 3 + 16]);

      blamka_R(A, B, C, D);

      A.store_le2(&T[2 * i + 32 * 0], &T[2 * i + 32 * 0 + 16]);
      B.store_le2(&T[2 * i + 32 * 1], &T[2 * i + 32 * 1 + 16]);
      C.store_le2(&T[2 * i + 32 * 2], &T[2 * i + 32 * 2 + 16]);
      D.store_le2(&T[2 * i + 32 * 3], &T[2 * i + 32 * 3 + 16]);
   }
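
   // Finally XOR the permuted scratch block back into N, eight words at a time.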
   for(size_t i = 0; i != 128 / 8; ++i) {
      SIMD_4x64 n0 = SIMD_4x64::load_le(&N[8 * i]);
      SIMD_4x64 n1 = SIMD_4x64::load_le(&N[8 * i + 4]);
      SIMD_4x64 t0 = SIMD_4x64::load_le(&T[8 * i]);
      SIMD_4x64 t1 = SIMD_4x64::load_le(&T[8 * i + 4]);

      n0 ^= t0;
      n1 ^= t1;

      n0.store_le(&N[8 * i]);
      n1.store_le(&N[8 * i + 4]);
   }
}

}  // namespace Botan