Botan 3.9.0
Crypto and TLS for C++
simd_avx512.h
Go to the documentation of this file.
1/*
2* (C) 2023 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#ifndef BOTAN_SIMD_AVX512_H_
8#define BOTAN_SIMD_AVX512_H_
9
10#include <botan/compiler.h>
11#include <botan/types.h>
12#include <botan/internal/isa_extn.h>
13#include <immintrin.h>
14
15namespace Botan {
16
17// NOLINTBEGIN(portability-simd-intrinsics)
18
19class SIMD_16x32 final {
20 public:
21 SIMD_16x32& operator=(const SIMD_16x32& other) = default;
22 SIMD_16x32(const SIMD_16x32& other) = default;
23
24 SIMD_16x32& operator=(SIMD_16x32&& other) = default;
25 SIMD_16x32(SIMD_16x32&& other) = default;
26
27 ~SIMD_16x32() = default;
28
29 BOTAN_FN_ISA_AVX512
30 BOTAN_FORCE_INLINE SIMD_16x32() : m_avx512(_mm512_setzero_si512()) {}
31
32 BOTAN_FN_ISA_AVX512
33 explicit SIMD_16x32(const uint32_t B[16]) {
34 // NOLINTNEXTLINE(*-prefer-member-initializer)
35 m_avx512 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(B));
36 }
37
38 BOTAN_FN_ISA_AVX512
39 explicit SIMD_16x32(uint32_t B0,
40 uint32_t B1,
41 uint32_t B2,
42 uint32_t B3,
43 uint32_t B4,
44 uint32_t B5,
45 uint32_t B6,
46 uint32_t B7,
47 uint32_t B8,
48 uint32_t B9,
49 uint32_t BA,
50 uint32_t BB,
51 uint32_t BC,
52 uint32_t BD,
53 uint32_t BE,
54 uint32_t BF) {
55 // NOLINTNEXTLINE(*-prefer-member-initializer)
56 m_avx512 = _mm512_set_epi32(BF, BE, BD, BC, BB, BA, B9, B8, B7, B6, B5, B4, B3, B2, B1, B0);
57 }
58
59 BOTAN_FN_ISA_AVX512
60 static SIMD_16x32 splat(uint32_t B) { return SIMD_16x32(_mm512_set1_epi32(B)); }
61
62 BOTAN_FN_ISA_AVX512
63 static SIMD_16x32 load_le(const uint8_t* in) {
64 return SIMD_16x32(_mm512_loadu_si512(reinterpret_cast<const __m512i*>(in)));
65 }
66
67 BOTAN_FN_ISA_AVX512
68 static SIMD_16x32 load_be(const uint8_t* in) { return load_le(in).bswap(); }
69
70 BOTAN_FN_ISA_AVX512
71 void store_le(uint8_t out[]) const { _mm512_storeu_si512(reinterpret_cast<__m512i*>(out), m_avx512); }
72
73 BOTAN_FN_ISA_AVX512
74 void store_be(uint8_t out[]) const { bswap().store_le(out); }
75
76 template <size_t ROT>
77 BOTAN_FN_ISA_AVX512 SIMD_16x32 rotl() const
78 requires(ROT > 0 && ROT < 32)
79 {
80 return SIMD_16x32(_mm512_rol_epi32(m_avx512, ROT));
81 }
82
83 template <size_t ROT>
84 BOTAN_FN_ISA_AVX512 SIMD_16x32 rotr() const {
85 return this->rotl<32 - ROT>();
86 }
87
88 SIMD_16x32 BOTAN_FN_ISA_AVX512 sigma0() const {
89 const SIMD_16x32 rot1 = this->rotr<2>();
90 const SIMD_16x32 rot2 = this->rotr<13>();
91 const SIMD_16x32 rot3 = this->rotr<22>();
92 return rot1 ^ rot2 ^ rot3;
93 }
94
95 SIMD_16x32 BOTAN_FN_ISA_AVX512 sigma1() const {
96 const SIMD_16x32 rot1 = this->rotr<6>();
97 const SIMD_16x32 rot2 = this->rotr<11>();
98 const SIMD_16x32 rot3 = this->rotr<25>();
99 return rot1 ^ rot2 ^ rot3;
100 }
101
102 BOTAN_FN_ISA_AVX512
103 SIMD_16x32 operator+(const SIMD_16x32& other) const {
104 SIMD_16x32 retval(*this);
105 retval += other;
106 return retval;
107 }
108
109 BOTAN_FN_ISA_AVX512
110 SIMD_16x32 operator-(const SIMD_16x32& other) const {
111 SIMD_16x32 retval(*this);
112 retval -= other;
113 return retval;
114 }
115
116 BOTAN_FN_ISA_AVX512
117 SIMD_16x32 operator^(const SIMD_16x32& other) const {
118 SIMD_16x32 retval(*this);
119 retval ^= other;
120 return retval;
121 }
122
123 BOTAN_FN_ISA_AVX512
124 SIMD_16x32 operator|(const SIMD_16x32& other) const {
125 SIMD_16x32 retval(*this);
126 retval |= other;
127 return retval;
128 }
129
130 BOTAN_FN_ISA_AVX512
131 SIMD_16x32 operator&(const SIMD_16x32& other) const {
132 SIMD_16x32 retval(*this);
133 retval &= other;
134 return retval;
135 }
136
137 BOTAN_FN_ISA_AVX512
138 void operator+=(const SIMD_16x32& other) { m_avx512 = _mm512_add_epi32(m_avx512, other.m_avx512); }
139
140 BOTAN_FN_ISA_AVX512
141 void operator-=(const SIMD_16x32& other) { m_avx512 = _mm512_sub_epi32(m_avx512, other.m_avx512); }
142
143 BOTAN_FN_ISA_AVX512
144 void operator^=(const SIMD_16x32& other) { m_avx512 = _mm512_xor_si512(m_avx512, other.m_avx512); }
145
146 BOTAN_FN_ISA_AVX512
147 void operator^=(uint32_t other) { *this ^= SIMD_16x32::splat(other); }
148
149 BOTAN_FN_ISA_AVX512
150 void operator|=(const SIMD_16x32& other) { m_avx512 = _mm512_or_si512(m_avx512, other.m_avx512); }
151
152 BOTAN_FN_ISA_AVX512
153 void operator&=(const SIMD_16x32& other) { m_avx512 = _mm512_and_si512(m_avx512, other.m_avx512); }
154
155 template <int SHIFT>
156 BOTAN_FN_ISA_AVX512 SIMD_16x32 shl() const {
157 return SIMD_16x32(_mm512_slli_epi32(m_avx512, SHIFT));
158 }
159
160 template <int SHIFT>
161 BOTAN_FN_ISA_AVX512 SIMD_16x32 shr() const {
162 return SIMD_16x32(_mm512_srli_epi32(m_avx512, SHIFT));
163 }
164
165 BOTAN_FN_ISA_AVX512
166 SIMD_16x32 operator~() const { return SIMD_16x32(_mm512_xor_si512(m_avx512, _mm512_set1_epi32(0xFFFFFFFF))); }
167
168 // (~reg) & other
169 BOTAN_FN_ISA_AVX512
170 SIMD_16x32 andc(const SIMD_16x32& other) const {
171 return SIMD_16x32(_mm512_andnot_si512(m_avx512, other.m_avx512));
172 }
173
174 template <uint8_t TBL>
175 BOTAN_FN_ISA_AVX512 static SIMD_16x32 ternary_fn(const SIMD_16x32& a, const SIMD_16x32& b, const SIMD_16x32& c) {
176 return SIMD_16x32(_mm512_ternarylogic_epi32(a.raw(), b.raw(), c.raw(), TBL));
177 }
178
179 BOTAN_FN_ISA_AVX512
181 const uint8_t BSWAP_MASK[64] = {
182 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
183 21, 20, 27, 26, 25, 24, 31, 30, 29, 28, 35, 34, 33, 32, 39, 38, 37, 36, 43, 42, 41, 40,
184 47, 46, 45, 44, 51, 50, 49, 48, 55, 54, 53, 52, 59, 58, 57, 56, 63, 62, 61, 60,
185 };
186
187 const __m512i bswap = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(BSWAP_MASK));
188
189 const __m512i output = _mm512_shuffle_epi8(m_avx512, bswap);
190
191 return SIMD_16x32(output);
192 }
193
194 BOTAN_FN_ISA_AVX512
195 static void transpose(SIMD_16x32& B0, SIMD_16x32& B1, SIMD_16x32& B2, SIMD_16x32& B3) {
196 const __m512i T0 = _mm512_unpacklo_epi32(B0.m_avx512, B1.m_avx512);
197 const __m512i T1 = _mm512_unpacklo_epi32(B2.m_avx512, B3.m_avx512);
198 const __m512i T2 = _mm512_unpackhi_epi32(B0.m_avx512, B1.m_avx512);
199 const __m512i T3 = _mm512_unpackhi_epi32(B2.m_avx512, B3.m_avx512);
200
201 B0.m_avx512 = _mm512_unpacklo_epi64(T0, T1);
202 B1.m_avx512 = _mm512_unpackhi_epi64(T0, T1);
203 B2.m_avx512 = _mm512_unpacklo_epi64(T2, T3);
204 B3.m_avx512 = _mm512_unpackhi_epi64(T2, T3);
205 }
206
207 BOTAN_FN_ISA_AVX512
208 static void transpose(SIMD_16x32& B0,
209 SIMD_16x32& B1,
210 SIMD_16x32& B2,
211 SIMD_16x32& B3,
212 SIMD_16x32& B4,
213 SIMD_16x32& B5,
214 SIMD_16x32& B6,
215 SIMD_16x32& B7,
216 SIMD_16x32& B8,
217 SIMD_16x32& B9,
218 SIMD_16x32& BA,
219 SIMD_16x32& BB,
220 SIMD_16x32& BC,
221 SIMD_16x32& BD,
222 SIMD_16x32& BE,
223 SIMD_16x32& BF) {
224 auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());
225 auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());
226 auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());
227 auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());
228 auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());
229 auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());
230 auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());
231 auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());
232 auto t8 = _mm512_unpacklo_epi32(B8.raw(), B9.raw());
233 auto t9 = _mm512_unpackhi_epi32(B8.raw(), B9.raw());
234 auto ta = _mm512_unpacklo_epi32(BA.raw(), BB.raw());
235 auto tb = _mm512_unpackhi_epi32(BA.raw(), BB.raw());
236 auto tc = _mm512_unpacklo_epi32(BC.raw(), BD.raw());
237 auto td = _mm512_unpackhi_epi32(BC.raw(), BD.raw());
238 auto te = _mm512_unpacklo_epi32(BE.raw(), BF.raw());
239 auto tf = _mm512_unpackhi_epi32(BE.raw(), BF.raw());
240
241 auto r0 = _mm512_unpacklo_epi64(t0, t2);
242 auto r1 = _mm512_unpackhi_epi64(t0, t2);
243 auto r2 = _mm512_unpacklo_epi64(t1, t3);
244 auto r3 = _mm512_unpackhi_epi64(t1, t3);
245 auto r4 = _mm512_unpacklo_epi64(t4, t6);
246 auto r5 = _mm512_unpackhi_epi64(t4, t6);
247 auto r6 = _mm512_unpacklo_epi64(t5, t7);
248 auto r7 = _mm512_unpackhi_epi64(t5, t7);
249 auto r8 = _mm512_unpacklo_epi64(t8, ta);
250 auto r9 = _mm512_unpackhi_epi64(t8, ta);
251 auto ra = _mm512_unpacklo_epi64(t9, tb);
252 auto rb = _mm512_unpackhi_epi64(t9, tb);
253 auto rc = _mm512_unpacklo_epi64(tc, te);
254 auto rd = _mm512_unpackhi_epi64(tc, te);
255 auto re = _mm512_unpacklo_epi64(td, tf);
256 auto rf = _mm512_unpackhi_epi64(td, tf);
257
258 t0 = _mm512_shuffle_i32x4(r0, r4, 0x88);
259 t1 = _mm512_shuffle_i32x4(r1, r5, 0x88);
260 t2 = _mm512_shuffle_i32x4(r2, r6, 0x88);
261 t3 = _mm512_shuffle_i32x4(r3, r7, 0x88);
262 t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd);
263 t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd);
264 t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd);
265 t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd);
266 t8 = _mm512_shuffle_i32x4(r8, rc, 0x88);
267 t9 = _mm512_shuffle_i32x4(r9, rd, 0x88);
268 ta = _mm512_shuffle_i32x4(ra, re, 0x88);
269 tb = _mm512_shuffle_i32x4(rb, rf, 0x88);
270 tc = _mm512_shuffle_i32x4(r8, rc, 0xdd);
271 td = _mm512_shuffle_i32x4(r9, rd, 0xdd);
272 te = _mm512_shuffle_i32x4(ra, re, 0xdd);
273 tf = _mm512_shuffle_i32x4(rb, rf, 0xdd);
274
275 B0.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0x88);
276 B1.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0x88);
277 B2.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0x88);
278 B3.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0x88);
279 B4.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0x88);
280 B5.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0x88);
281 B6.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0x88);
282 B7.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0x88);
283 B8.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0xdd);
284 B9.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0xdd);
285 BA.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0xdd);
286 BB.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0xdd);
287 BC.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0xdd);
288 BD.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0xdd);
289 BE.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0xdd);
290 BF.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0xdd);
291 }
292
293 BOTAN_FN_ISA_AVX512
294 static SIMD_16x32 choose(const SIMD_16x32& mask, const SIMD_16x32& a, const SIMD_16x32& b) {
295 return SIMD_16x32::ternary_fn<0xca>(mask, a, b);
296 }
297
298 BOTAN_FN_ISA_AVX512
299 static SIMD_16x32 majority(const SIMD_16x32& x, const SIMD_16x32& y, const SIMD_16x32& z) {
300 return SIMD_16x32::ternary_fn<0xe8>(x, y, z);
301 }
302
303 BOTAN_FN_ISA_AVX512 static void zero_registers() {
304 // Unfortunately this only zeros zmm0-zmm15 and not zmm16-zmm32
305 _mm256_zeroall();
306 }
307
308 __m512i BOTAN_FN_ISA_AVX512 raw() const { return m_avx512; }
309
310 BOTAN_FN_ISA_AVX512
311 explicit SIMD_16x32(__m512i x) noexcept : m_avx512(x) {}
312
313 private:
314 __m512i m_avx512;
315};
316
317// NOLINTEND(portability-simd-intrinsics)
318
319template <size_t R>
321 return input.rotl<R>();
322}
323
324template <size_t R>
326 return input.rotr<R>();
327}
328
329// For Serpent:
330template <size_t S>
331inline SIMD_16x32 shl(SIMD_16x32 input) {
332 return input.shl<S>();
333}
334
335} // namespace Botan
336
337#endif
SIMD_16x32(SIMD_16x32 &&other)=default
static BOTAN_FN_ISA_AVX512 void transpose(SIMD_16x32 &B0, SIMD_16x32 &B1, SIMD_16x32 &B2, SIMD_16x32 &B3)
BOTAN_FN_ISA_AVX512 void operator^=(const SIMD_16x32 &other)
BOTAN_FN_ISA_AVX512 void store_le(uint8_t out[]) const
Definition simd_avx512.h:71
~SIMD_16x32()=default
static BOTAN_FN_ISA_AVX512 SIMD_16x32 choose(const SIMD_16x32 &mask, const SIMD_16x32 &a, const SIMD_16x32 &b)
BOTAN_FN_ISA_AVX512 SIMD_16x32 andc(const SIMD_16x32 &other) const
SIMD_16x32 BOTAN_FN_ISA_AVX512 sigma0() const
Definition simd_avx512.h:88
BOTAN_FN_ISA_AVX512 void operator|=(const SIMD_16x32 &other)
BOTAN_FN_ISA_AVX512 BOTAN_FORCE_INLINE SIMD_16x32()
Definition simd_avx512.h:30
BOTAN_FN_ISA_AVX512 SIMD_16x32 operator|(const SIMD_16x32 &other) const
BOTAN_FN_ISA_AVX512 void operator-=(const SIMD_16x32 &other)
SIMD_16x32 & operator=(SIMD_16x32 &&other)=default
BOTAN_FN_ISA_AVX512 SIMD_16x32 rotl() const
Definition simd_avx512.h:77
static BOTAN_FN_ISA_AVX512 SIMD_16x32 ternary_fn(const SIMD_16x32 &a, const SIMD_16x32 &b, const SIMD_16x32 &c)
BOTAN_FN_ISA_AVX512 SIMD_16x32 bswap() const
BOTAN_FN_ISA_AVX512 void operator&=(const SIMD_16x32 &other)
BOTAN_FN_ISA_AVX512 SIMD_16x32 shr() const
static BOTAN_FN_ISA_AVX512 SIMD_16x32 majority(const SIMD_16x32 &x, const SIMD_16x32 &y, const SIMD_16x32 &z)
BOTAN_FN_ISA_AVX512 void operator+=(const SIMD_16x32 &other)
static BOTAN_FN_ISA_AVX512 SIMD_16x32 load_le(const uint8_t *in)
Definition simd_avx512.h:63
BOTAN_FN_ISA_AVX512 SIMD_16x32 operator-(const SIMD_16x32 &other) const
BOTAN_FN_ISA_AVX512 void store_be(uint8_t out[]) const
Definition simd_avx512.h:74
static BOTAN_FN_ISA_AVX512 void zero_registers()
SIMD_16x32 & operator=(const SIMD_16x32 &other)=default
BOTAN_FN_ISA_AVX512 SIMD_16x32 rotr() const
Definition simd_avx512.h:84
BOTAN_FN_ISA_AVX512 SIMD_16x32(const uint32_t B[16])
Definition simd_avx512.h:33
BOTAN_FN_ISA_AVX512 SIMD_16x32(__m512i x) noexcept
__m512i BOTAN_FN_ISA_AVX512 raw() const
SIMD_16x32(const SIMD_16x32 &other)=default
BOTAN_FN_ISA_AVX512 SIMD_16x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3, uint32_t B4, uint32_t B5, uint32_t B6, uint32_t B7, uint32_t B8, uint32_t B9, uint32_t BA, uint32_t BB, uint32_t BC, uint32_t BD, uint32_t BE, uint32_t BF)
Definition simd_avx512.h:39
BOTAN_FN_ISA_AVX512 SIMD_16x32 operator+(const SIMD_16x32 &other) const
static BOTAN_FN_ISA_AVX512 void transpose(SIMD_16x32 &B0, SIMD_16x32 &B1, SIMD_16x32 &B2, SIMD_16x32 &B3, SIMD_16x32 &B4, SIMD_16x32 &B5, SIMD_16x32 &B6, SIMD_16x32 &B7, SIMD_16x32 &B8, SIMD_16x32 &B9, SIMD_16x32 &BA, SIMD_16x32 &BB, SIMD_16x32 &BC, SIMD_16x32 &BD, SIMD_16x32 &BE, SIMD_16x32 &BF)
SIMD_16x32 BOTAN_FN_ISA_AVX512 sigma1() const
Definition simd_avx512.h:95
BOTAN_FN_ISA_AVX512 SIMD_16x32 operator^(const SIMD_16x32 &other) const
static BOTAN_FN_ISA_AVX512 SIMD_16x32 splat(uint32_t B)
Definition simd_avx512.h:60
static BOTAN_FN_ISA_AVX512 SIMD_16x32 load_be(const uint8_t *in)
Definition simd_avx512.h:68
BOTAN_FN_ISA_AVX512 SIMD_16x32 shl() const
BOTAN_FN_ISA_AVX512 SIMD_16x32 operator&(const SIMD_16x32 &other) const
BOTAN_FN_ISA_AVX512 SIMD_16x32 operator~() const
BOTAN_FN_ISA_AVX512 void operator^=(uint32_t other)
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
BOTAN_FORCE_INLINE constexpr T rotr(T input)
Definition rotate.h:35
BOTAN_FORCE_INLINE constexpr T rotl(T input)
Definition rotate.h:23
SIMD_4x32 shl(SIMD_4x32 input)
Definition simd_4x32.h:790