Botan 3.1.1
Crypto and TLS for C++
argon2_avx2.cpp
Go to the documentation of this file.
1/**
2* (C) 2023 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/argon2.h>
8#include <immintrin.h>
9
10namespace Botan {
11
12namespace {
13
14class SIMD_4x64 final {
15 public:
16 SIMD_4x64& operator=(const SIMD_4x64& other) = default;
17 SIMD_4x64(const SIMD_4x64& other) = default;
18
19 SIMD_4x64& operator=(SIMD_4x64&& other) = default;
20 SIMD_4x64(SIMD_4x64&& other) = default;
21
22 ~SIMD_4x64() = default;
23
24 BOTAN_FUNC_ISA("avx2")
25 SIMD_4x64() // zero initialized
26 {
27 m_simd = _mm256_setzero_si256();
28 }
29
30 // Load two halves at different addresses
31 static BOTAN_FUNC_ISA("avx2") SIMD_4x64 load_le2(const void* inl, const void* inh) {
32 return SIMD_4x64(
33 _mm256_loadu2_m128i(reinterpret_cast<const __m128i*>(inl), reinterpret_cast<const __m128i*>(inh)));
34 }
35
36 static BOTAN_FUNC_ISA("avx2") SIMD_4x64 load_le(const void* in) {
37 return SIMD_4x64(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in)));
38 }
39
40 void store_le(uint64_t out[4]) const { this->store_le(reinterpret_cast<uint8_t*>(out)); }
41
42 BOTAN_FUNC_ISA("avx2") void store_le(uint8_t out[]) const {
43 _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_simd);
44 }
45
46 BOTAN_FUNC_ISA("avx2") void store_le2(void* outh, void* outl) {
47 _mm256_storeu2_m128i(reinterpret_cast<__m128i*>(outh), reinterpret_cast<__m128i*>(outl), m_simd);
48 }
49
50 SIMD_4x64 operator+(const SIMD_4x64& other) const {
51 SIMD_4x64 retval(*this);
52 retval += other;
53 return retval;
54 }
55
56 SIMD_4x64 operator^(const SIMD_4x64& other) const {
57 SIMD_4x64 retval(*this);
58 retval ^= other;
59 return retval;
60 }
61
62 BOTAN_FUNC_ISA("avx2") void operator+=(const SIMD_4x64& other) {
63 m_simd = _mm256_add_epi64(m_simd, other.m_simd);
64 }
65
66 BOTAN_FUNC_ISA("avx2") void operator^=(const SIMD_4x64& other) {
67 m_simd = _mm256_xor_si256(m_simd, other.m_simd);
68 }
69
70 template <size_t ROT>
71 BOTAN_FUNC_ISA("avx2")
72 SIMD_4x64 rotr() const
73 requires(ROT > 0 && ROT < 64)
74 {
75 if constexpr(ROT == 16) {
76 auto tab = _mm256_setr_epi8(2,
77 3,
78 4,
79 5,
80 6,
81 7,
82 0,
83 1,
84 10,
85 11,
86 12,
87 13,
88 14,
89 15,
90 8,
91 9,
92 2,
93 3,
94 4,
95 5,
96 6,
97 7,
98 0,
99 1,
100 10,
101 11,
102 12,
103 13,
104 14,
105 15,
106 8,
107 9);
108 return SIMD_4x64(_mm256_shuffle_epi8(m_simd, tab));
109 } else if constexpr(ROT == 24) {
110 auto tab = _mm256_setr_epi8(3,
111 4,
112 5,
113 6,
114 7,
115 0,
116 1,
117 2,
118 11,
119 12,
120 13,
121 14,
122 15,
123 8,
124 9,
125 10,
126 3,
127 4,
128 5,
129 6,
130 7,
131 0,
132 1,
133 2,
134 11,
135 12,
136 13,
137 14,
138 15,
139 8,
140 9,
141 10);
142 return SIMD_4x64(_mm256_shuffle_epi8(m_simd, tab));
143 } else if constexpr(ROT == 32) {
144 auto tab = _mm256_setr_epi8(4,
145 5,
146 6,
147 7,
148 0,
149 1,
150 2,
151 3,
152 12,
153 13,
154 14,
155 15,
156 8,
157 9,
158 10,
159 11,
160 4,
161 5,
162 6,
163 7,
164 0,
165 1,
166 2,
167 3,
168 12,
169 13,
170 14,
171 15,
172 8,
173 9,
174 10,
175 11);
176 return SIMD_4x64(_mm256_shuffle_epi8(m_simd, tab));
177 } else {
178 return SIMD_4x64(_mm256_or_si256(_mm256_srli_epi64(m_simd, static_cast<int>(ROT)),
179 _mm256_slli_epi64(m_simd, static_cast<int>(64 - ROT))));
180 }
181 }
182
183 template <size_t ROT>
184 SIMD_4x64 rotl() const {
185 return this->rotr<64 - ROT>();
186 }
187
188 // Argon2 specific operation
189 static BOTAN_FUNC_ISA("avx2") SIMD_4x64 mul2_32(SIMD_4x64 x, SIMD_4x64 y) {
190 const __m256i m = _mm256_mul_epu32(x.m_simd, y.m_simd);
191 return SIMD_4x64(_mm256_add_epi64(m, m));
192 }
193
194 template <uint8_t CTRL>
195 static BOTAN_FUNC_ISA("avx2") SIMD_4x64 permute_4x64(SIMD_4x64 x) {
196 return SIMD_4x64(_mm256_permute4x64_epi64(x.m_simd, CTRL));
197 }
198
199 // Argon2 specific
200 static void twist(SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
201 B = SIMD_4x64::permute_4x64<0b00'11'10'01>(B);
202 C = SIMD_4x64::permute_4x64<0b01'00'11'10>(C);
203 D = SIMD_4x64::permute_4x64<0b10'01'00'11>(D);
204 }
205
206 // Argon2 specific
207 static void untwist(SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
208 B = SIMD_4x64::permute_4x64<0b10'01'00'11>(B);
209 C = SIMD_4x64::permute_4x64<0b01'00'11'10>(C);
210 D = SIMD_4x64::permute_4x64<0b00'11'10'01>(D);
211 }
212
213 explicit BOTAN_FUNC_ISA("avx2") SIMD_4x64(__m256i x) : m_simd(x) {}
214
215 private:
216 __m256i m_simd;
217};
218
219BOTAN_FORCE_INLINE void blamka_G(SIMD_4x64& A, SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
220 A += B + SIMD_4x64::mul2_32(A, B);
221 D ^= A;
222 D = D.rotr<32>();
223
224 C += D + SIMD_4x64::mul2_32(C, D);
225 B ^= C;
226 B = B.rotr<24>();
227
228 A += B + SIMD_4x64::mul2_32(A, B);
229 D ^= A;
230 D = D.rotr<16>();
231
232 C += D + SIMD_4x64::mul2_32(C, D);
233 B ^= C;
234 B = B.rotr<63>();
235}
236
237BOTAN_FORCE_INLINE void blamka_R(SIMD_4x64& A, SIMD_4x64& B, SIMD_4x64& C, SIMD_4x64& D) {
238 blamka_G(A, B, C, D);
239
240 SIMD_4x64::twist(B, C, D);
241 blamka_G(A, B, C, D);
242 SIMD_4x64::untwist(B, C, D);
243}
244
245} // namespace
246
247BOTAN_FUNC_ISA("avx2") void Argon2::blamka_avx2(uint64_t N[128], uint64_t T[128]) {
248 for(size_t i = 0; i != 8; ++i) {
249 SIMD_4x64 A = SIMD_4x64::load_le(&N[16 * i + 4 * 0]);
250 SIMD_4x64 B = SIMD_4x64::load_le(&N[16 * i + 4 * 1]);
251 SIMD_4x64 C = SIMD_4x64::load_le(&N[16 * i + 4 * 2]);
252 SIMD_4x64 D = SIMD_4x64::load_le(&N[16 * i + 4 * 3]);
253
254 blamka_R(A, B, C, D);
255
256 A.store_le(&T[16 * i + 4 * 0]);
257 B.store_le(&T[16 * i + 4 * 1]);
258 C.store_le(&T[16 * i + 4 * 2]);
259 D.store_le(&T[16 * i + 4 * 3]);
260 }
261
262 for(size_t i = 0; i != 8; ++i) {
263 SIMD_4x64 A = SIMD_4x64::load_le2(&T[2 * i + 32 * 0], &T[2 * i + 32 * 0 + 16]);
264 SIMD_4x64 B = SIMD_4x64::load_le2(&T[2 * i + 32 * 1], &T[2 * i + 32 * 1 + 16]);
265 SIMD_4x64 C = SIMD_4x64::load_le2(&T[2 * i + 32 * 2], &T[2 * i + 32 * 2 + 16]);
266 SIMD_4x64 D = SIMD_4x64::load_le2(&T[2 * i + 32 * 3], &T[2 * i + 32 * 3 + 16]);
267
268 blamka_R(A, B, C, D);
269
270 A.store_le2(&T[2 * i + 32 * 0], &T[2 * i + 32 * 0 + 16]);
271 B.store_le2(&T[2 * i + 32 * 1], &T[2 * i + 32 * 1 + 16]);
272 C.store_le2(&T[2 * i + 32 * 2], &T[2 * i + 32 * 2 + 16]);
273 D.store_le2(&T[2 * i + 32 * 3], &T[2 * i + 32 * 3 + 16]);
274 }
275
276 for(size_t i = 0; i != 128 / 8; ++i) {
277 SIMD_4x64 n0 = SIMD_4x64::load_le(&N[8 * i]);
278 SIMD_4x64 n1 = SIMD_4x64::load_le(&N[8 * i + 4]);
279 SIMD_4x64 t0 = SIMD_4x64::load_le(&T[8 * i]);
280 SIMD_4x64 t1 = SIMD_4x64::load_le(&T[8 * i + 4]);
281
282 n0 ^= t0;
283 n1 ^= t1;
284 n0.store_le(&N[8 * i]);
285 n1.store_le(&N[8 * i + 4]);
286 }
287}
288
289} // namespace Botan
static const void * inh
Definition: argon2_avx2.cpp:31
void * outl
Definition: argon2_avx2.cpp:46
int(* final)(unsigned char *, CTX *)
#define BOTAN_FUNC_ISA(isa)
Definition: compiler.h:81
#define BOTAN_FORCE_INLINE
Definition: compiler.h:157
FE_25519 T
Definition: ge.cpp:34
Definition: alg_id.cpp:13
constexpr void store_le(uint16_t in, uint8_t out[2])
Definition: loadstor.h:422
constexpr T rotl(T input)
Definition: rotate.h:21
OctetString operator^(const OctetString &k1, const OctetString &k2)
Definition: symkey.cpp:108
OctetString operator+(const OctetString &k1, const OctetString &k2)
Definition: symkey.cpp:98
constexpr T rotr(T input)
Definition: rotate.h:33
constexpr T load_le(const uint8_t in[], size_t off)
Definition: loadstor.h:121