Botan 3.11.0
Crypto and TLS for C++
simd_avx512.h
Go to the documentation of this file.
1/*
2* (C) 2023 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#ifndef BOTAN_SIMD_AVX512_H_
8#define BOTAN_SIMD_AVX512_H_
9
10#include <botan/compiler.h>
11#include <botan/types.h>
12#include <botan/internal/isa_extn.h>
13#include <immintrin.h>
14
15namespace Botan {
16
17// NOLINTBEGIN(portability-simd-intrinsics)
18
19class SIMD_16x32 final {
20 public:
21 SIMD_16x32& operator=(const SIMD_16x32& other) = default;
22 SIMD_16x32(const SIMD_16x32& other) = default;
23
24 SIMD_16x32& operator=(SIMD_16x32&& other) = default;
25 SIMD_16x32(SIMD_16x32&& other) = default;
26
27 ~SIMD_16x32() = default;
28
29 BOTAN_FN_ISA_AVX512
30 BOTAN_FORCE_INLINE SIMD_16x32() : m_avx512(_mm512_setzero_si512()) {}
31
32 BOTAN_FN_ISA_AVX512
33 explicit SIMD_16x32(const uint32_t B[16]) {
34 // NOLINTNEXTLINE(*-prefer-member-initializer)
35 m_avx512 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(B));
36 }
37
38 BOTAN_FN_ISA_AVX512
39 explicit SIMD_16x32(uint32_t B0,
40 uint32_t B1,
41 uint32_t B2,
42 uint32_t B3,
43 uint32_t B4,
44 uint32_t B5,
45 uint32_t B6,
46 uint32_t B7,
47 uint32_t B8,
48 uint32_t B9,
49 uint32_t BA,
50 uint32_t BB,
51 uint32_t BC,
52 uint32_t BD,
53 uint32_t BE,
54 uint32_t BF) {
55 // NOLINTNEXTLINE(*-prefer-member-initializer)
56 m_avx512 = _mm512_set_epi32(BF, BE, BD, BC, BB, BA, B9, B8, B7, B6, B5, B4, B3, B2, B1, B0);
57 }
58
59 BOTAN_FN_ISA_AVX512
60 static SIMD_16x32 splat(uint32_t B) { return SIMD_16x32(_mm512_set1_epi32(B)); }
61
62 BOTAN_FN_ISA_AVX512
63 static SIMD_16x32 load_le(const uint8_t* in) {
64 return SIMD_16x32(_mm512_loadu_si512(reinterpret_cast<const __m512i*>(in)));
65 }
66
67 BOTAN_FN_ISA_AVX512
68 static SIMD_16x32 load_be(const uint8_t* in) { return load_le(in).bswap(); }
69
70 BOTAN_FN_ISA_AVX512
71 void store_le(uint8_t out[]) const { _mm512_storeu_si512(reinterpret_cast<__m512i*>(out), m_avx512); }
72
73 BOTAN_FN_ISA_AVX512
74 void store_be(uint8_t out[]) const { bswap().store_le(out); }
75
76 template <size_t ROT>
77 BOTAN_FN_ISA_AVX512 SIMD_16x32 rotl() const
78 requires(ROT > 0 && ROT < 32)
79 {
80 return SIMD_16x32(_mm512_rol_epi32(m_avx512, ROT));
81 }
82
83 template <size_t ROT>
84 BOTAN_FN_ISA_AVX512 SIMD_16x32 rotr() const {
85 return this->rotl<32 - ROT>();
86 }
87
88 SIMD_16x32 BOTAN_FN_ISA_AVX512 sigma0() const {
89 const SIMD_16x32 r1 = this->rotr<2>();
90 const SIMD_16x32 r2 = this->rotr<13>();
91 const SIMD_16x32 r3 = this->rotr<22>();
92 return r1 ^ r2 ^ r3;
93 }
94
95 SIMD_16x32 BOTAN_FN_ISA_AVX512 sigma1() const {
96 const SIMD_16x32 r1 = this->rotr<6>();
97 const SIMD_16x32 r2 = this->rotr<11>();
98 const SIMD_16x32 r3 = this->rotr<25>();
99 return r1 ^ r2 ^ r3;
100 }
101
102 BOTAN_FN_ISA_AVX512
103 SIMD_16x32 operator+(const SIMD_16x32& other) const {
104 SIMD_16x32 retval(*this);
105 retval += other;
106 return retval;
107 }
108
109 BOTAN_FN_ISA_AVX512
110 SIMD_16x32 operator-(const SIMD_16x32& other) const {
111 SIMD_16x32 retval(*this);
112 retval -= other;
113 return retval;
114 }
115
116 BOTAN_FN_ISA_AVX512
117 SIMD_16x32 operator^(const SIMD_16x32& other) const {
118 SIMD_16x32 retval(*this);
119 retval ^= other;
120 return retval;
121 }
122
123 BOTAN_FN_ISA_AVX512
124 SIMD_16x32 operator|(const SIMD_16x32& other) const {
125 SIMD_16x32 retval(*this);
126 retval |= other;
127 return retval;
128 }
129
130 BOTAN_FN_ISA_AVX512
131 SIMD_16x32 operator&(const SIMD_16x32& other) const {
132 SIMD_16x32 retval(*this);
133 retval &= other;
134 return retval;
135 }
136
137 BOTAN_FN_ISA_AVX512
138 void operator+=(const SIMD_16x32& other) { m_avx512 = _mm512_add_epi32(m_avx512, other.m_avx512); }
139
140 BOTAN_FN_ISA_AVX512
141 void operator-=(const SIMD_16x32& other) { m_avx512 = _mm512_sub_epi32(m_avx512, other.m_avx512); }
142
143 BOTAN_FN_ISA_AVX512
144 void operator^=(const SIMD_16x32& other) { m_avx512 = _mm512_xor_si512(m_avx512, other.m_avx512); }
145
146 BOTAN_FN_ISA_AVX512
147 void operator^=(uint32_t other) { *this ^= SIMD_16x32::splat(other); }
148
149 BOTAN_FN_ISA_AVX512
150 void operator|=(const SIMD_16x32& other) { m_avx512 = _mm512_or_si512(m_avx512, other.m_avx512); }
151
152 BOTAN_FN_ISA_AVX512
153 void operator&=(const SIMD_16x32& other) { m_avx512 = _mm512_and_si512(m_avx512, other.m_avx512); }
154
155 template <int SHIFT>
156 BOTAN_FN_ISA_AVX512 SIMD_16x32 shl() const {
157 return SIMD_16x32(_mm512_slli_epi32(m_avx512, SHIFT));
158 }
159
160 template <int SHIFT>
161 BOTAN_FN_ISA_AVX512 SIMD_16x32 shr() const {
162 return SIMD_16x32(_mm512_srli_epi32(m_avx512, SHIFT));
163 }
164
165 BOTAN_FN_ISA_AVX512
166 SIMD_16x32 operator~() const { return SIMD_16x32(_mm512_xor_si512(m_avx512, _mm512_set1_epi32(0xFFFFFFFF))); }
167
168 // (~reg) & other
169 BOTAN_FN_ISA_AVX512
170 SIMD_16x32 andc(const SIMD_16x32& other) const {
171 return SIMD_16x32(_mm512_andnot_si512(m_avx512, other.m_avx512));
172 }
173
174 template <uint8_t TBL>
175 BOTAN_FN_ISA_AVX512 static SIMD_16x32 ternary_fn(const SIMD_16x32& a, const SIMD_16x32& b, const SIMD_16x32& c) {
176 return SIMD_16x32(_mm512_ternarylogic_epi32(a.raw(), b.raw(), c.raw(), TBL));
177 }
178
179 BOTAN_FN_ISA_AVX512
181 const uint8_t BSWAP_MASK[64] = {
182 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
183 21, 20, 27, 26, 25, 24, 31, 30, 29, 28, 35, 34, 33, 32, 39, 38, 37, 36, 43, 42, 41, 40,
184 47, 46, 45, 44, 51, 50, 49, 48, 55, 54, 53, 52, 59, 58, 57, 56, 63, 62, 61, 60,
185 };
186
187 const __m512i bswap = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(BSWAP_MASK));
188
189 const __m512i output = _mm512_shuffle_epi8(m_avx512, bswap);
190
191 return SIMD_16x32(output);
192 }
193
194 BOTAN_FN_ISA_AVX512
195 SIMD_16x32 rev_words() const noexcept { return SIMD_16x32(_mm512_shuffle_epi32(raw(), _MM_PERM_ABCD)); }
196
197 BOTAN_FN_ISA_AVX512
198 static void transpose(SIMD_16x32& B0, SIMD_16x32& B1, SIMD_16x32& B2, SIMD_16x32& B3) {
199 const __m512i T0 = _mm512_unpacklo_epi32(B0.m_avx512, B1.m_avx512);
200 const __m512i T1 = _mm512_unpacklo_epi32(B2.m_avx512, B3.m_avx512);
201 const __m512i T2 = _mm512_unpackhi_epi32(B0.m_avx512, B1.m_avx512);
202 const __m512i T3 = _mm512_unpackhi_epi32(B2.m_avx512, B3.m_avx512);
203
204 B0.m_avx512 = _mm512_unpacklo_epi64(T0, T1);
205 B1.m_avx512 = _mm512_unpackhi_epi64(T0, T1);
206 B2.m_avx512 = _mm512_unpacklo_epi64(T2, T3);
207 B3.m_avx512 = _mm512_unpackhi_epi64(T2, T3);
208 }
209
210 BOTAN_FN_ISA_AVX512
211 static void transpose(SIMD_16x32& B0,
212 SIMD_16x32& B1,
213 SIMD_16x32& B2,
214 SIMD_16x32& B3,
215 SIMD_16x32& B4,
216 SIMD_16x32& B5,
217 SIMD_16x32& B6,
218 SIMD_16x32& B7,
219 SIMD_16x32& B8,
220 SIMD_16x32& B9,
221 SIMD_16x32& BA,
222 SIMD_16x32& BB,
223 SIMD_16x32& BC,
224 SIMD_16x32& BD,
225 SIMD_16x32& BE,
226 SIMD_16x32& BF) {
227 auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());
228 auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());
229 auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());
230 auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());
231 auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());
232 auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());
233 auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());
234 auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());
235 auto t8 = _mm512_unpacklo_epi32(B8.raw(), B9.raw());
236 auto t9 = _mm512_unpackhi_epi32(B8.raw(), B9.raw());
237 auto ta = _mm512_unpacklo_epi32(BA.raw(), BB.raw());
238 auto tb = _mm512_unpackhi_epi32(BA.raw(), BB.raw());
239 auto tc = _mm512_unpacklo_epi32(BC.raw(), BD.raw());
240 auto td = _mm512_unpackhi_epi32(BC.raw(), BD.raw());
241 auto te = _mm512_unpacklo_epi32(BE.raw(), BF.raw());
242 auto tf = _mm512_unpackhi_epi32(BE.raw(), BF.raw());
243
244 auto r0 = _mm512_unpacklo_epi64(t0, t2);
245 auto r1 = _mm512_unpackhi_epi64(t0, t2);
246 auto r2 = _mm512_unpacklo_epi64(t1, t3);
247 auto r3 = _mm512_unpackhi_epi64(t1, t3);
248 auto r4 = _mm512_unpacklo_epi64(t4, t6);
249 auto r5 = _mm512_unpackhi_epi64(t4, t6);
250 auto r6 = _mm512_unpacklo_epi64(t5, t7);
251 auto r7 = _mm512_unpackhi_epi64(t5, t7);
252 auto r8 = _mm512_unpacklo_epi64(t8, ta);
253 auto r9 = _mm512_unpackhi_epi64(t8, ta);
254 auto ra = _mm512_unpacklo_epi64(t9, tb);
255 auto rb = _mm512_unpackhi_epi64(t9, tb);
256 auto rc = _mm512_unpacklo_epi64(tc, te);
257 auto rd = _mm512_unpackhi_epi64(tc, te);
258 auto re = _mm512_unpacklo_epi64(td, tf);
259 auto rf = _mm512_unpackhi_epi64(td, tf);
260
261 t0 = _mm512_shuffle_i32x4(r0, r4, 0x88);
262 t1 = _mm512_shuffle_i32x4(r1, r5, 0x88);
263 t2 = _mm512_shuffle_i32x4(r2, r6, 0x88);
264 t3 = _mm512_shuffle_i32x4(r3, r7, 0x88);
265 t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd);
266 t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd);
267 t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd);
268 t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd);
269 t8 = _mm512_shuffle_i32x4(r8, rc, 0x88);
270 t9 = _mm512_shuffle_i32x4(r9, rd, 0x88);
271 ta = _mm512_shuffle_i32x4(ra, re, 0x88);
272 tb = _mm512_shuffle_i32x4(rb, rf, 0x88);
273 tc = _mm512_shuffle_i32x4(r8, rc, 0xdd);
274 td = _mm512_shuffle_i32x4(r9, rd, 0xdd);
275 te = _mm512_shuffle_i32x4(ra, re, 0xdd);
276 tf = _mm512_shuffle_i32x4(rb, rf, 0xdd);
277
278 B0.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0x88);
279 B1.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0x88);
280 B2.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0x88);
281 B3.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0x88);
282 B4.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0x88);
283 B5.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0x88);
284 B6.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0x88);
285 B7.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0x88);
286 B8.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0xdd);
287 B9.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0xdd);
288 BA.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0xdd);
289 BB.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0xdd);
290 BC.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0xdd);
291 BD.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0xdd);
292 BE.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0xdd);
293 BF.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0xdd);
294 }
295
296 BOTAN_FN_ISA_AVX512
297 static SIMD_16x32 choose(const SIMD_16x32& mask, const SIMD_16x32& a, const SIMD_16x32& b) {
298 return SIMD_16x32::ternary_fn<0xca>(mask, a, b);
299 }
300
301 BOTAN_FN_ISA_AVX512
302 static SIMD_16x32 majority(const SIMD_16x32& x, const SIMD_16x32& y, const SIMD_16x32& z) {
303 return SIMD_16x32::ternary_fn<0xe8>(x, y, z);
304 }
305
306 BOTAN_FN_ISA_AVX512 static void zero_registers() {
307 // Unfortunately this only zeros zmm0-zmm15 and not zmm16-zmm32
308 _mm256_zeroall();
309 }
310
311 __m512i BOTAN_FN_ISA_AVX512 raw() const { return m_avx512; }
312
313 BOTAN_FN_ISA_AVX512
314 explicit SIMD_16x32(__m512i x) noexcept : m_avx512(x) {}
315
316 private:
317 __m512i m_avx512;
318};
319
320// NOLINTEND(portability-simd-intrinsics)
321
322template <size_t R>
323inline SIMD_16x32 BOTAN_FN_ISA_AVX512 rotl(SIMD_16x32 input) {
324 return input.rotl<R>();
325}
326
327template <size_t R>
328inline SIMD_16x32 BOTAN_FN_ISA_AVX512 rotr(SIMD_16x32 input) {
329 return input.rotr<R>();
330}
331
332// For Serpent:
333template <size_t S>
334inline SIMD_16x32 BOTAN_FN_ISA_AVX512 shl(SIMD_16x32 input) {
335 return input.shl<S>();
336}
337
338} // namespace Botan
339
340#endif
SIMD_16x32(SIMD_16x32 &&other)=default
static BOTAN_FN_ISA_AVX512 void transpose(SIMD_16x32 &B0, SIMD_16x32 &B1, SIMD_16x32 &B2, SIMD_16x32 &B3)
BOTAN_FN_ISA_AVX512 void operator^=(const SIMD_16x32 &other)
BOTAN_FN_ISA_AVX512 void store_le(uint8_t out[]) const
Definition simd_avx512.h:71
~SIMD_16x32()=default
static BOTAN_FN_ISA_AVX512 SIMD_16x32 choose(const SIMD_16x32 &mask, const SIMD_16x32 &a, const SIMD_16x32 &b)
BOTAN_FN_ISA_AVX512 SIMD_16x32 andc(const SIMD_16x32 &other) const
SIMD_16x32 BOTAN_FN_ISA_AVX512 sigma0() const
Definition simd_avx512.h:88
BOTAN_FN_ISA_AVX512 void operator|=(const SIMD_16x32 &other)
BOTAN_FN_ISA_AVX512 BOTAN_FORCE_INLINE SIMD_16x32()
Definition simd_avx512.h:30
BOTAN_FN_ISA_AVX512 SIMD_16x32 operator|(const SIMD_16x32 &other) const
BOTAN_FN_ISA_AVX512 void operator-=(const SIMD_16x32 &other)
BOTAN_FN_ISA_AVX512 SIMD_16x32 rev_words() const noexcept
SIMD_16x32 & operator=(SIMD_16x32 &&other)=default
BOTAN_FN_ISA_AVX512 SIMD_16x32 rotl() const
Definition simd_avx512.h:77
static BOTAN_FN_ISA_AVX512 SIMD_16x32 ternary_fn(const SIMD_16x32 &a, const SIMD_16x32 &b, const SIMD_16x32 &c)
BOTAN_FN_ISA_AVX512 SIMD_16x32 bswap() const
BOTAN_FN_ISA_AVX512 void operator&=(const SIMD_16x32 &other)
BOTAN_FN_ISA_AVX512 SIMD_16x32 shr() const
static BOTAN_FN_ISA_AVX512 SIMD_16x32 majority(const SIMD_16x32 &x, const SIMD_16x32 &y, const SIMD_16x32 &z)
BOTAN_FN_ISA_AVX512 void operator+=(const SIMD_16x32 &other)
static BOTAN_FN_ISA_AVX512 SIMD_16x32 load_le(const uint8_t *in)
Definition simd_avx512.h:63
BOTAN_FN_ISA_AVX512 SIMD_16x32 operator-(const SIMD_16x32 &other) const
BOTAN_FN_ISA_AVX512 void store_be(uint8_t out[]) const
Definition simd_avx512.h:74
static BOTAN_FN_ISA_AVX512 void zero_registers()
SIMD_16x32 & operator=(const SIMD_16x32 &other)=default
BOTAN_FN_ISA_AVX512 SIMD_16x32 rotr() const
Definition simd_avx512.h:84
BOTAN_FN_ISA_AVX512 SIMD_16x32(const uint32_t B[16])
Definition simd_avx512.h:33
BOTAN_FN_ISA_AVX512 SIMD_16x32(__m512i x) noexcept
__m512i BOTAN_FN_ISA_AVX512 raw() const
SIMD_16x32(const SIMD_16x32 &other)=default
BOTAN_FN_ISA_AVX512 SIMD_16x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3, uint32_t B4, uint32_t B5, uint32_t B6, uint32_t B7, uint32_t B8, uint32_t B9, uint32_t BA, uint32_t BB, uint32_t BC, uint32_t BD, uint32_t BE, uint32_t BF)
Definition simd_avx512.h:39
BOTAN_FN_ISA_AVX512 SIMD_16x32 operator+(const SIMD_16x32 &other) const
static BOTAN_FN_ISA_AVX512 void transpose(SIMD_16x32 &B0, SIMD_16x32 &B1, SIMD_16x32 &B2, SIMD_16x32 &B3, SIMD_16x32 &B4, SIMD_16x32 &B5, SIMD_16x32 &B6, SIMD_16x32 &B7, SIMD_16x32 &B8, SIMD_16x32 &B9, SIMD_16x32 &BA, SIMD_16x32 &BB, SIMD_16x32 &BC, SIMD_16x32 &BD, SIMD_16x32 &BE, SIMD_16x32 &BF)
SIMD_16x32 BOTAN_FN_ISA_AVX512 sigma1() const
Definition simd_avx512.h:95
BOTAN_FN_ISA_AVX512 SIMD_16x32 operator^(const SIMD_16x32 &other) const
static BOTAN_FN_ISA_AVX512 SIMD_16x32 splat(uint32_t B)
Definition simd_avx512.h:60
static BOTAN_FN_ISA_AVX512 SIMD_16x32 load_be(const uint8_t *in)
Definition simd_avx512.h:68
BOTAN_FN_ISA_AVX512 SIMD_16x32 shl() const
BOTAN_FN_ISA_AVX512 SIMD_16x32 operator&(const SIMD_16x32 &other) const
BOTAN_FN_ISA_AVX512 SIMD_16x32 operator~() const
BOTAN_FN_ISA_AVX512 void operator^=(uint32_t other)
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
BOTAN_FORCE_INLINE constexpr T rotr(T input)
Definition rotate.h:35
BOTAN_FORCE_INLINE constexpr T rotl(T input)
Definition rotate.h:23
SIMD_4x32 shl(SIMD_4x32 input)
Definition simd_4x32.h:938