#include <botan/argon2.h>

#include <botan/compiler.h>
#include <immintrin.h>

namespace Botan {

namespace {
class SIMD_4x64 final {
   public:
      SIMD_4x64& operator=(const SIMD_4x64& other) = default;
      SIMD_4x64(const SIMD_4x64& other) = default;

      SIMD_4x64& operator=(SIMD_4x64&& other) = default;
      SIMD_4x64(SIMD_4x64&& other) = default;

      ~SIMD_4x64() = default;
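      // Default constructed value is all zero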
      BOTAN_FUNC_ISA("avx2") SIMD_4x64() { m_simd = _mm256_setzero_si256(); }
      // Load two 128-bit halves, from inl (low) and inh (high)
      static BOTAN_FUNC_ISA("avx2") SIMD_4x64 load_le2(const void* inl, const void* inh) {
         // Note: _mm256_loadu2_m128i takes the high-half address first
         return SIMD_4x64(
            _mm256_loadu2_m128i(reinterpret_cast<const __m128i*>(inh), reinterpret_cast<const __m128i*>(inl)));
      }
      static BOTAN_FUNC_ISA("avx2") SIMD_4x64 load_le(const void* in) {
         return SIMD_4x64(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in)));
      }
      void store_le(uint64_t out[4]) const { this->store_le(reinterpret_cast<uint8_t*>(out)); }

      BOTAN_FUNC_ISA("avx2") void store_le(uint8_t out[]) const {
         _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_simd);
      }
      // Store the low and high 128-bit halves to outl and outh respectively
      BOTAN_FUNC_ISA("avx2") void store_le2(void* outl, void* outh) const {
         _mm256_storeu2_m128i(reinterpret_cast<__m128i*>(outh), reinterpret_cast<__m128i*>(outl), m_simd);
      }
      SIMD_4x64 operator+(const SIMD_4x64& other) const {
         SIMD_4x64 retval(*this);
         retval += other;
         return retval;
      }
      SIMD_4x64 operator^(const SIMD_4x64& other) const {
         SIMD_4x64 retval(*this);
         retval ^= other;
         return retval;
      }
      BOTAN_FUNC_ISA("avx2") void operator+=(const SIMD_4x64& other) {
         m_simd = _mm256_add_epi64(m_simd, other.m_simd);
      }
      BOTAN_FUNC_ISA("avx2") void operator^=(const SIMD_4x64& other) {
         m_simd = _mm256_xor_si256(m_simd, other.m_simd);
      }
      template <size_t ROT>
      BOTAN_FUNC_ISA("avx2")
      SIMD_4x64 rotr() const
         requires(ROT > 0 && ROT < 64)
      {
         // Rotations by 16, 24, and 32 are byte-aligned and map to a byte shuffle
         if constexpr(ROT == 16) {
            const auto shuf_rot_16 =
               _mm256_set_epi64x(0x09080f0e0d0c0b0a, 0x0100070605040302, 0x09080f0e0d0c0b0a, 0x0100070605040302);

            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_16));
         } else if constexpr(ROT == 24) {
            const auto shuf_rot_24 =
               _mm256_set_epi64x(0x0a09080f0e0d0c0b, 0x0201000706050403, 0x0a09080f0e0d0c0b, 0x0201000706050403);

            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_24));
         } else if constexpr(ROT == 32) {
            const auto shuf_rot_32 =
               _mm256_set_epi64x(0x0b0a09080f0e0d0c, 0x0302010007060504, 0x0b0a09080f0e0d0c, 0x0302010007060504);

            return SIMD_4x64(_mm256_shuffle_epi8(m_simd, shuf_rot_32));
         } else {
            // Generic rotate: OR together the right- and left-shifted halves
            return SIMD_4x64(_mm256_or_si256(_mm256_srli_epi64(m_simd, static_cast<int>(ROT)),
                                             _mm256_slli_epi64(m_simd, static_cast<int>(64 - ROT))));
         }
      }
      template <size_t ROT>
      SIMD_4x64 rotl() const {
         return this->rotr<64 - ROT>();
      }
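      // Argon2's "multiply" step: _mm256_mul_epu32 multiplies the low 32 bits
      // of each 64-bit lane, and the 64-bit product is then doubled. Callers
      // combine this as a + b + 2 * lo32(a) * lo32(b).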
      static BOTAN_FUNC_ISA("avx2") SIMD_4x64 mul2_32(SIMD_4x64 x, SIMD_4x64 y) {
         const __m256i m = _mm256_mul_epu32(x.m_simd, y.m_simd);
         return SIMD_4x64(_mm256_add_epi64(m, m));
      }
      template <uint8_t CTRL>
      static BOTAN_FUNC_ISA("avx2") SIMD_4x64 permute_4x64(SIMD_4x64 x) {
         return SIMD_4x64(_mm256_permute4x64_epi64(x.m_simd, CTRL));
      }
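      // Rotate B, C, and D left by one, two, and three 64-bit lanes
      // respectively, so the following G application mixes the diagonals
      // of the 4x4 state (BLAKE2-style diagonalization)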
      static void twist(SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
         B = SIMD_4x64::permute_4x64<0b00'11'10'01>(B);
         C = SIMD_4x64::permute_4x64<0b01'00'11'10>(C);
         D = SIMD_4x64::permute_4x64<0b10'01'00'11>(D);
      }
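      // Inverse of twist(): rotate the lanes back into column order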
      static void untwist(SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
         B = SIMD_4x64::permute_4x64<0b10'01'00'11>(B);
         C = SIMD_4x64::permute_4x64<0b01'00'11'10>(C);
         D = SIMD_4x64::permute_4x64<0b00'11'10'01>(D);
      }
      explicit BOTAN_FUNC_ISA("avx2") SIMD_4x64(__m256i x) : m_simd(x) {}

   private:
      __m256i m_simd;
};
BOTAN_FORCE_INLINE void blamka_G(SIMD_4x64& A, SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
   A += B + SIMD_4x64::mul2_32(A, B);
   D ^= A;
   D = D.rotr<32>();

   C += D + SIMD_4x64::mul2_32(C, D);
   B ^= C;
   B = B.rotr<24>();

   A += B + SIMD_4x64::mul2_32(A, B);
   D ^= A;
   D = D.rotr<16>();

   C += D + SIMD_4x64::mul2_32(C, D);
   B ^= C;
   B = B.rotr<63>();
}
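// One full BLAMKA round: G over the columns, then G over the diagonals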
BOTAN_FORCE_INLINE void blamka_R(SIMD_4x64& A, SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
   blamka_G(A, B, C, D);

   SIMD_4x64::twist(B, C, D);
   blamka_G(A, B, C, D);
   SIMD_4x64::untwist(B, C, D);
}

}  // namespace
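/*
* Apply the Argon2 permutation P to a 1 KiB block held as 128 little-endian
* 64-bit words: BLAMKA rounds run once across the rows and once across the
* columns (using T as scratch), then the result is XORed back into N as the
* feed-forward of Argon2's compression function.
*/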
void Argon2::blamka_avx2(uint64_t N[128], uint64_t T[128]) {
   // Row pass: each of the eight 16-word rows forms one 4x4 BLAMKA state
   for(size_t i = 0; i != 8; ++i) {
      SIMD_4x64 A = SIMD_4x64::load_le(&N[16 * i + 4 * 0]);
      SIMD_4x64 B = SIMD_4x64::load_le(&N[16 * i + 4 * 1]);
      SIMD_4x64 C = SIMD_4x64::load_le(&N[16 * i + 4 * 2]);
      SIMD_4x64 D = SIMD_4x64::load_le(&N[16 * i + 4 * 3]);

      blamka_R(A, B, C, D);

      A.store_le(&T[16 * i + 4 * 0]);
      B.store_le(&T[16 * i + 4 * 1]);
      C.store_le(&T[16 * i + 4 * 2]);
      D.store_le(&T[16 * i + 4 * 3]);
   }
   // Column pass: gather each column's two 16-byte halves with load_le2
   for(size_t i = 0; i != 8; ++i) {
      SIMD_4x64 A = SIMD_4x64::load_le2(&T[2 * i + 32 * 0], &T[2 * i + 32 * 0 + 16]);
      SIMD_4x64 B = SIMD_4x64::load_le2(&T[2 * i + 32 * 1], &T[2 * i + 32 * 1 + 16]);
      SIMD_4x64 C = SIMD_4x64::load_le2(&T[2 * i + 32 * 2], &T[2 * i + 32 * 2 + 16]);
      SIMD_4x64 D = SIMD_4x64::load_le2(&T[2 * i + 32 * 3], &T[2 * i + 32 * 3 + 16]);

      blamka_R(A, B, C, D);

      A.store_le2(&T[2 * i + 32 * 0], &T[2 * i + 32 * 0 + 16]);
      B.store_le2(&T[2 * i + 32 * 1], &T[2 * i + 32 * 1 + 16]);
      C.store_le2(&T[2 * i + 32 * 2], &T[2 * i + 32 * 2 + 16]);
      D.store_le2(&T[2 * i + 32 * 3], &T[2 * i + 32 * 3 + 16]);
   }
   // Feed-forward: XOR the permuted block back into the input block
   for(size_t i = 0; i != 128 / 8; ++i) {
      SIMD_4x64 n0 = SIMD_4x64::load_le(&N[8 * i]);
      SIMD_4x64 n1 = SIMD_4x64::load_le(&N[8 * i + 4]);
      SIMD_4x64 t0 = SIMD_4x64::load_le(&T[8 * i]);
      SIMD_4x64 t1 = SIMD_4x64::load_le(&T[8 * i + 4]);

      n0 ^= t0;
      n1 ^= t1;

      n0.store_le(&N[8 * i]);
      n1.store_le(&N[8 * i + 4]);
   }
}

}  // namespace Botan