Botan 3.8.1
Crypto and TLS for C++
simd_avx2.h
Go to the documentation of this file.
1/*
2* (C) 2018 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#ifndef BOTAN_SIMD_AVX2_H_
8#define BOTAN_SIMD_AVX2_H_
9
10#include <botan/compiler.h>
11#include <botan/types.h>
12#include <botan/internal/isa_extn.h>
13#include <immintrin.h>
14
15namespace Botan {
16
17class SIMD_8x32 final {
18 public:
19 SIMD_8x32& operator=(const SIMD_8x32& other) = default;
20 SIMD_8x32(const SIMD_8x32& other) = default;
21
22 SIMD_8x32& operator=(SIMD_8x32&& other) = default;
23 SIMD_8x32(SIMD_8x32&& other) = default;
24
25 ~SIMD_8x32() = default;
26
27 BOTAN_FN_ISA_AVX2
28 BOTAN_FORCE_INLINE SIMD_8x32() noexcept { m_avx2 = _mm256_setzero_si256(); }
29
30 BOTAN_FN_ISA_AVX2
31 explicit SIMD_8x32(const uint32_t B[8]) noexcept {
32 m_avx2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(B));
33 }
34
35 BOTAN_FN_ISA_AVX2
36 explicit SIMD_8x32(uint32_t B0,
37 uint32_t B1,
38 uint32_t B2,
39 uint32_t B3,
40 uint32_t B4,
41 uint32_t B5,
42 uint32_t B6,
43 uint32_t B7) noexcept {
44 m_avx2 = _mm256_set_epi32(B7, B6, B5, B4, B3, B2, B1, B0);
45 }
46
47 BOTAN_FN_ISA_AVX2
48 explicit SIMD_8x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3) noexcept {
49 m_avx2 = _mm256_set_epi32(B3, B2, B1, B0, B3, B2, B1, B0);
50 }
51
52 BOTAN_FN_ISA_AVX2
53 static SIMD_8x32 splat(uint32_t B) noexcept { return SIMD_8x32(_mm256_set1_epi32(B)); }
54
55 BOTAN_FN_ISA_AVX2
56 static SIMD_8x32 load_le(const uint8_t* in) noexcept {
57 return SIMD_8x32(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in)));
58 }
59
60 BOTAN_FN_ISA_AVX2
61 static SIMD_8x32 load_le(const uint32_t* in) noexcept {
62 return SIMD_8x32(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in)));
63 }
64
65 BOTAN_FN_ISA_AVX2
66 static SIMD_8x32 load_le128(const uint8_t* in) noexcept {
67 return SIMD_8x32(_mm256_broadcastsi128_si256(_mm_loadu_si128(reinterpret_cast<const __m128i*>(in))));
68 }
69
70 BOTAN_FN_ISA_AVX2
71 static SIMD_8x32 load_le128(const uint32_t* in) noexcept {
72 return SIMD_8x32(_mm256_broadcastsi128_si256(_mm_loadu_si128(reinterpret_cast<const __m128i*>(in))));
73 }
74
75 BOTAN_FN_ISA_AVX2
76 static SIMD_8x32 load_be(const uint8_t* in) noexcept { return load_le(in).bswap(); }
77
78 BOTAN_FN_ISA_AVX2
79 void store_le(uint8_t out[]) const noexcept { _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_avx2); }
80
81 BOTAN_FN_ISA_AVX2
82 void store_le128(uint8_t out[]) const noexcept {
83 _mm_storeu_si128(reinterpret_cast<__m128i*>(out), _mm256_extracti128_si256(raw(), 0));
84 }
85
86 BOTAN_FN_ISA_AVX2
87 static SIMD_8x32 load_le128(const uint32_t in1[], const uint32_t in2[]) noexcept {
88 return SIMD_8x32(
89 _mm256_loadu2_m128i(reinterpret_cast<const __m128i*>(in2), reinterpret_cast<const __m128i*>(in1)));
90 }
91
92 BOTAN_FN_ISA_AVX2
93 static SIMD_8x32 load_be128(const uint8_t in1[], const uint8_t in2[]) noexcept {
94 return SIMD_8x32(
95 _mm256_loadu2_m128i(reinterpret_cast<const __m128i*>(in2), reinterpret_cast<const __m128i*>(in1)))
96 .bswap();
97 }
98
99 BOTAN_FN_ISA_AVX2
100 void store_le128(uint32_t out1[], uint32_t out2[]) const noexcept {
101 _mm256_storeu2_m128i(reinterpret_cast<__m128i*>(out2), reinterpret_cast<__m128i*>(out1), raw());
102 }
103
104 BOTAN_FN_ISA_AVX2
105 void store_be(uint8_t out[]) const noexcept { bswap().store_le(out); }
106
107 template <size_t ROT>
108 BOTAN_FN_ISA_AVX2 SIMD_8x32 rotl() const noexcept
109 requires(ROT > 0 && ROT < 32)
110 {
111#if defined(__AVX512VL__)
112 return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT));
113#else
114 if constexpr(ROT == 8) {
115 const __m256i shuf_rotl_8 =
116 _mm256_set_epi64x(0x0e0d0c0f'0a09080b, 0x06050407'02010003, 0x0e0d0c0f'0a09080b, 0x06050407'02010003);
117
118 return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8));
119 } else if constexpr(ROT == 16) {
120 const __m256i shuf_rotl_16 =
121 _mm256_set_epi64x(0x0d0c0f0e'09080b0a, 0x05040706'01000302, 0x0d0c0f0e'09080b0a, 0x05040706'01000302);
122
123 return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16));
124 } else if constexpr(ROT == 24) {
125 const __m256i shuf_rotl_24 =
126 _mm256_set_epi64x(0x0c0f0e0d'080b0a09, 0x04070605'00030201, 0x0c0f0e0d'080b0a09, 0x04070605'00030201);
127
128 return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_24));
129 } else {
130 return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)),
131 _mm256_srli_epi32(m_avx2, static_cast<int>(32 - ROT))));
132 }
133#endif
134 }
135
136 template <size_t ROT>
137 BOTAN_FN_ISA_AVX2 SIMD_8x32 rotr() const noexcept {
138 return this->rotl<32 - ROT>();
139 }
140
141 SIMD_8x32 BOTAN_FN_ISA_AVX2 sigma0() const noexcept {
142 const SIMD_8x32 rot1 = this->rotr<2>();
143 const SIMD_8x32 rot2 = this->rotr<13>();
144 const SIMD_8x32 rot3 = this->rotr<22>();
145 return rot1 ^ rot2 ^ rot3;
146 }
147
148 SIMD_8x32 BOTAN_FN_ISA_AVX2 sigma1() const noexcept {
149 const SIMD_8x32 rot1 = this->rotr<6>();
150 const SIMD_8x32 rot2 = this->rotr<11>();
151 const SIMD_8x32 rot3 = this->rotr<25>();
152 return rot1 ^ rot2 ^ rot3;
153 }
154
155 BOTAN_FN_ISA_AVX2
156 SIMD_8x32 operator+(const SIMD_8x32& other) const noexcept {
157 SIMD_8x32 retval(*this);
158 retval += other;
159 return retval;
160 }
161
162 BOTAN_FN_ISA_AVX2
163 SIMD_8x32 operator-(const SIMD_8x32& other) const noexcept {
164 SIMD_8x32 retval(*this);
165 retval -= other;
166 return retval;
167 }
168
169 BOTAN_FN_ISA_AVX2
170 SIMD_8x32 operator^(const SIMD_8x32& other) const noexcept {
171 SIMD_8x32 retval(*this);
172 retval ^= other;
173 return retval;
174 }
175
176 BOTAN_FN_ISA_AVX2
177 SIMD_8x32 operator|(const SIMD_8x32& other) const noexcept {
178 SIMD_8x32 retval(*this);
179 retval |= other;
180 return retval;
181 }
182
183 BOTAN_FN_ISA_AVX2
184 SIMD_8x32 operator&(const SIMD_8x32& other) const noexcept {
185 SIMD_8x32 retval(*this);
186 retval &= other;
187 return retval;
188 }
189
190 BOTAN_FN_ISA_AVX2
191 void operator+=(const SIMD_8x32& other) { m_avx2 = _mm256_add_epi32(m_avx2, other.m_avx2); }
192
193 BOTAN_FN_ISA_AVX2
194 void operator-=(const SIMD_8x32& other) { m_avx2 = _mm256_sub_epi32(m_avx2, other.m_avx2); }
195
196 BOTAN_FN_ISA_AVX2
197 void operator^=(const SIMD_8x32& other) { m_avx2 = _mm256_xor_si256(m_avx2, other.m_avx2); }
198
199 BOTAN_FN_ISA_AVX2
200 void operator^=(uint32_t other) { *this ^= SIMD_8x32::splat(other); }
201
202 BOTAN_FN_ISA_AVX2
203 void operator|=(const SIMD_8x32& other) { m_avx2 = _mm256_or_si256(m_avx2, other.m_avx2); }
204
205 BOTAN_FN_ISA_AVX2
206 void operator&=(const SIMD_8x32& other) { m_avx2 = _mm256_and_si256(m_avx2, other.m_avx2); }
207
208 template <int SHIFT>
209 BOTAN_FN_ISA_AVX2 SIMD_8x32 shl() const noexcept {
210 return SIMD_8x32(_mm256_slli_epi32(m_avx2, SHIFT));
211 }
212
213 template <int SHIFT>
214 BOTAN_FN_ISA_AVX2 SIMD_8x32 shr() const noexcept {
215 return SIMD_8x32(_mm256_srli_epi32(m_avx2, SHIFT));
216 }
217
218 BOTAN_FN_ISA_AVX2
219 SIMD_8x32 operator~() const noexcept {
220 return SIMD_8x32(_mm256_xor_si256(m_avx2, _mm256_set1_epi32(0xFFFFFFFF)));
221 }
222
223 // (~reg) & other
224 BOTAN_FN_ISA_AVX2
225 SIMD_8x32 andc(const SIMD_8x32& other) const noexcept {
226 return SIMD_8x32(_mm256_andnot_si256(m_avx2, other.m_avx2));
227 }
228
229 BOTAN_FN_ISA_AVX2
230 SIMD_8x32 bswap() const noexcept {
231 alignas(32) const uint8_t BSWAP_TBL[32] = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
232 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
233
234 const __m256i bswap = _mm256_load_si256(reinterpret_cast<const __m256i*>(BSWAP_TBL));
235
236 const __m256i output = _mm256_shuffle_epi8(m_avx2, bswap);
237
238 return SIMD_8x32(output);
239 }
240
241 // Equivalent to rev_words().bswap()
242 BOTAN_FN_ISA_AVX2
243 SIMD_8x32 reverse() const noexcept {
244 alignas(32) const uint8_t REV_TBL[32] = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
245 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
246
247 const __m256i bswap = _mm256_load_si256(reinterpret_cast<const __m256i*>(REV_TBL));
248 const __m256i output = _mm256_shuffle_epi8(m_avx2, bswap);
249 return SIMD_8x32(output);
250 }
251
252 BOTAN_FN_ISA_AVX2
253 SIMD_8x32 rev_words() const noexcept { return SIMD_8x32(_mm256_shuffle_epi32(raw(), 0b00011011)); }
254
255 BOTAN_FN_ISA_AVX2
256 static void transpose(SIMD_8x32& B0, SIMD_8x32& B1, SIMD_8x32& B2, SIMD_8x32& B3) noexcept {
257 const __m256i T0 = _mm256_unpacklo_epi32(B0.m_avx2, B1.m_avx2);
258 const __m256i T1 = _mm256_unpacklo_epi32(B2.m_avx2, B3.m_avx2);
259 const __m256i T2 = _mm256_unpackhi_epi32(B0.m_avx2, B1.m_avx2);
260 const __m256i T3 = _mm256_unpackhi_epi32(B2.m_avx2, B3.m_avx2);
261
262 B0.m_avx2 = _mm256_unpacklo_epi64(T0, T1);
263 B1.m_avx2 = _mm256_unpackhi_epi64(T0, T1);
264 B2.m_avx2 = _mm256_unpacklo_epi64(T2, T3);
265 B3.m_avx2 = _mm256_unpackhi_epi64(T2, T3);
266 }
267
268 BOTAN_FN_ISA_AVX2
269 static void transpose(SIMD_8x32& B0,
270 SIMD_8x32& B1,
271 SIMD_8x32& B2,
272 SIMD_8x32& B3,
273 SIMD_8x32& B4,
274 SIMD_8x32& B5,
275 SIMD_8x32& B6,
276 SIMD_8x32& B7) noexcept {
277 transpose(B0, B1, B2, B3);
278 transpose(B4, B5, B6, B7);
279
280 swap_tops(B0, B4);
281 swap_tops(B1, B5);
282 swap_tops(B2, B6);
283 swap_tops(B3, B7);
284 }
285
286 BOTAN_FN_ISA_AVX2
287 static SIMD_8x32 choose(const SIMD_8x32& mask, const SIMD_8x32& a, const SIMD_8x32& b) noexcept {
288#if defined(__AVX512VL__)
289 return _mm256_ternarylogic_epi32(mask.raw(), a.raw(), b.raw(), 0xca);
290#else
291 return (mask & a) ^ mask.andc(b);
292#endif
293 }
294
295 BOTAN_FN_ISA_AVX2
296 static SIMD_8x32 majority(const SIMD_8x32& x, const SIMD_8x32& y, const SIMD_8x32& z) noexcept {
297#if defined(__AVX512VL__)
298 return _mm256_ternarylogic_epi32(x.raw(), y.raw(), z.raw(), 0xe8);
299#else
300 return SIMD_8x32::choose(x ^ y, z, y);
301#endif
302 }
303
304 static inline SIMD_8x32 BOTAN_FN_ISA_AVX2 byte_shuffle(const SIMD_8x32& tbl, const SIMD_8x32& idx) {
305 return SIMD_8x32(_mm256_shuffle_epi8(tbl.raw(), idx.raw()));
306 }
307
308 BOTAN_FN_ISA_AVX2
309 static void reset_registers() noexcept { _mm256_zeroupper(); }
310
311 BOTAN_FN_ISA_AVX2
312 static void zero_registers() noexcept { _mm256_zeroall(); }
313
314 __m256i BOTAN_FN_ISA_AVX2 raw() const noexcept { return m_avx2; }
315
316 BOTAN_FN_ISA_AVX2
317 SIMD_8x32(__m256i x) noexcept : m_avx2(x) {}
318
319 private:
320 BOTAN_FN_ISA_AVX2
321 static void swap_tops(SIMD_8x32& A, SIMD_8x32& B) {
322 SIMD_8x32 T0 = _mm256_permute2x128_si256(A.raw(), B.raw(), 0 + (2 << 4));
323 SIMD_8x32 T1 = _mm256_permute2x128_si256(A.raw(), B.raw(), 1 + (3 << 4));
324 A = T0;
325 B = T1;
326 }
327
328 __m256i m_avx2;
329};
330
331template <size_t R>
332inline SIMD_8x32 rotl(SIMD_8x32 input) {
333 return input.rotl<R>();
334}
335
336template <size_t R>
337inline SIMD_8x32 rotr(SIMD_8x32 input) {
338 return input.rotr<R>();
339}
340
341// For Serpent:
342template <size_t S>
343inline SIMD_8x32 shl(SIMD_8x32 input) {
344 return input.shl<S>();
345}
346
347} // namespace Botan
348
349#endif
SIMD_8x32 BOTAN_FN_ISA_AVX2 sigma0() const noexcept
Definition simd_avx2.h:141
BOTAN_FN_ISA_AVX2 SIMD_8x32 operator^(const SIMD_8x32 &other) const noexcept
Definition simd_avx2.h:170
__m256i BOTAN_FN_ISA_AVX2 raw() const noexcept
Definition simd_avx2.h:314
SIMD_8x32 & operator=(SIMD_8x32 &&other)=default
BOTAN_FN_ISA_AVX2 SIMD_8x32 shl() const noexcept
Definition simd_avx2.h:209
SIMD_8x32(const SIMD_8x32 &other)=default
BOTAN_FN_ISA_AVX2 SIMD_8x32 shr() const noexcept
Definition simd_avx2.h:214
SIMD_8x32 BOTAN_FN_ISA_AVX2 sigma1() const noexcept
Definition simd_avx2.h:148
BOTAN_FN_ISA_AVX2 SIMD_8x32(__m256i x) noexcept
Definition simd_avx2.h:317
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_le128(const uint8_t *in) noexcept
Definition simd_avx2.h:66
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_le128(const uint32_t *in) noexcept
Definition simd_avx2.h:71
static BOTAN_FN_ISA_AVX2 void reset_registers() noexcept
Definition simd_avx2.h:309
BOTAN_FN_ISA_AVX2 SIMD_8x32 rev_words() const noexcept
Definition simd_avx2.h:253
BOTAN_FN_ISA_AVX2 SIMD_8x32 rotr() const noexcept
Definition simd_avx2.h:137
BOTAN_FN_ISA_AVX2 void operator&=(const SIMD_8x32 &other)
Definition simd_avx2.h:206
BOTAN_FN_ISA_AVX2 SIMD_8x32 rotl() const noexcept
Definition simd_avx2.h:108
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_le128(const uint32_t in1[], const uint32_t in2[]) noexcept
Definition simd_avx2.h:87
~SIMD_8x32()=default
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_le(const uint32_t *in) noexcept
Definition simd_avx2.h:61
BOTAN_FN_ISA_AVX2 SIMD_8x32 reverse() const noexcept
Definition simd_avx2.h:243
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_le(const uint8_t *in) noexcept
Definition simd_avx2.h:56
BOTAN_FN_ISA_AVX2 void operator^=(uint32_t other)
Definition simd_avx2.h:200
BOTAN_FN_ISA_AVX2 void store_le128(uint32_t out1[], uint32_t out2[]) const noexcept
Definition simd_avx2.h:100
BOTAN_FN_ISA_AVX2 void operator|=(const SIMD_8x32 &other)
Definition simd_avx2.h:203
SIMD_8x32(SIMD_8x32 &&other)=default
BOTAN_FN_ISA_AVX2 SIMD_8x32 operator~() const noexcept
Definition simd_avx2.h:219
BOTAN_FN_ISA_AVX2 void operator^=(const SIMD_8x32 &other)
Definition simd_avx2.h:197
BOTAN_FN_ISA_AVX2 void operator-=(const SIMD_8x32 &other)
Definition simd_avx2.h:194
BOTAN_FN_ISA_AVX2 SIMD_8x32(const uint32_t B[8]) noexcept
Definition simd_avx2.h:31
BOTAN_FN_ISA_AVX2 void store_le128(uint8_t out[]) const noexcept
Definition simd_avx2.h:82
static BOTAN_FN_ISA_AVX2 void transpose(SIMD_8x32 &B0, SIMD_8x32 &B1, SIMD_8x32 &B2, SIMD_8x32 &B3) noexcept
Definition simd_avx2.h:256
static BOTAN_FN_ISA_AVX2 void zero_registers() noexcept
Definition simd_avx2.h:312
static BOTAN_FN_ISA_AVX2 SIMD_8x32 majority(const SIMD_8x32 &x, const SIMD_8x32 &y, const SIMD_8x32 &z) noexcept
Definition simd_avx2.h:296
BOTAN_FN_ISA_AVX2 void store_be(uint8_t out[]) const noexcept
Definition simd_avx2.h:105
static BOTAN_FN_ISA_AVX2 SIMD_8x32 splat(uint32_t B) noexcept
Definition simd_avx2.h:53
static BOTAN_FN_ISA_AVX2 void transpose(SIMD_8x32 &B0, SIMD_8x32 &B1, SIMD_8x32 &B2, SIMD_8x32 &B3, SIMD_8x32 &B4, SIMD_8x32 &B5, SIMD_8x32 &B6, SIMD_8x32 &B7) noexcept
Definition simd_avx2.h:269
BOTAN_FN_ISA_AVX2 SIMD_8x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3) noexcept
Definition simd_avx2.h:48
static BOTAN_FN_ISA_AVX2 SIMD_8x32 choose(const SIMD_8x32 &mask, const SIMD_8x32 &a, const SIMD_8x32 &b) noexcept
Definition simd_avx2.h:287
BOTAN_FN_ISA_AVX2 SIMD_8x32 andc(const SIMD_8x32 &other) const noexcept
Definition simd_avx2.h:225
BOTAN_FN_ISA_AVX2 SIMD_8x32 operator-(const SIMD_8x32 &other) const noexcept
Definition simd_avx2.h:163
SIMD_8x32 & operator=(const SIMD_8x32 &other)=default
BOTAN_FN_ISA_AVX2 void operator+=(const SIMD_8x32 &other)
Definition simd_avx2.h:191
static SIMD_8x32 BOTAN_FN_ISA_AVX2 byte_shuffle(const SIMD_8x32 &tbl, const SIMD_8x32 &idx)
Definition simd_avx2.h:304
BOTAN_FN_ISA_AVX2 void store_le(uint8_t out[]) const noexcept
Definition simd_avx2.h:79
BOTAN_FN_ISA_AVX2 SIMD_8x32 operator&(const SIMD_8x32 &other) const noexcept
Definition simd_avx2.h:184
BOTAN_FN_ISA_AVX2 SIMD_8x32 operator+(const SIMD_8x32 &other) const noexcept
Definition simd_avx2.h:156
BOTAN_FN_ISA_AVX2 SIMD_8x32 bswap() const noexcept
Definition simd_avx2.h:230
BOTAN_FN_ISA_AVX2 BOTAN_FORCE_INLINE SIMD_8x32() noexcept
Definition simd_avx2.h:28
BOTAN_FN_ISA_AVX2 SIMD_8x32 operator|(const SIMD_8x32 &other) const noexcept
Definition simd_avx2.h:177
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_be128(const uint8_t in1[], const uint8_t in2[]) noexcept
Definition simd_avx2.h:93
BOTAN_FN_ISA_AVX2 SIMD_8x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3, uint32_t B4, uint32_t B5, uint32_t B6, uint32_t B7) noexcept
Definition simd_avx2.h:36
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_be(const uint8_t *in) noexcept
Definition simd_avx2.h:76
#define BOTAN_FORCE_INLINE
Definition compiler.h:85
constexpr T rotl(T input)
Definition rotate.h:21
constexpr T rotr(T input)
Definition rotate.h:33
SIMD_4x32 shl(SIMD_4x32 input)
Definition simd_4x32.h:774