Botan 3.7.1
Crypto and TLS for C++
simd_avx512.h
Go to the documentation of this file.
1/*
2* (C) 2023 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#ifndef BOTAN_SIMD_AVX512_H_
8#define BOTAN_SIMD_AVX512_H_
9
10#include <botan/compiler.h>
11#include <botan/types.h>
12#include <immintrin.h>
13
14namespace Botan {
15
16#define BOTAN_AVX512_FN BOTAN_FUNC_ISA("avx512f,avx512dq,avx512bw")
17
19 public:
20 SIMD_16x32& operator=(const SIMD_16x32& other) = default;
21 SIMD_16x32(const SIMD_16x32& other) = default;
22
23 SIMD_16x32& operator=(SIMD_16x32&& other) = default;
24 SIMD_16x32(SIMD_16x32&& other) = default;
25
27 BOTAN_FORCE_INLINE SIMD_16x32() { m_avx512 = _mm512_setzero_si512(); }
28
30 explicit SIMD_16x32(const uint32_t B[16]) { m_avx512 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(B)); }
31
33 explicit SIMD_16x32(uint32_t B0,
34 uint32_t B1,
35 uint32_t B2,
36 uint32_t B3,
37 uint32_t B4,
38 uint32_t B5,
39 uint32_t B6,
40 uint32_t B7,
41 uint32_t B8,
42 uint32_t B9,
43 uint32_t BA,
44 uint32_t BB,
45 uint32_t BC,
46 uint32_t BD,
47 uint32_t BE,
48 uint32_t BF) {
49 m_avx512 = _mm512_set_epi32(BF, BE, BD, BC, BB, BA, B9, B8, B7, B6, B5, B4, B3, B2, B1, B0);
50 }
51
53 static SIMD_16x32 splat(uint32_t B) { return SIMD_16x32(_mm512_set1_epi32(B)); }
54
56 static SIMD_16x32 load_le(const uint8_t* in) {
57 return SIMD_16x32(_mm512_loadu_si512(reinterpret_cast<const __m512i*>(in)));
58 }
59
61 static SIMD_16x32 load_be(const uint8_t* in) { return load_le(in).bswap(); }
62
64 void store_le(uint8_t out[]) const { _mm512_storeu_si512(reinterpret_cast<__m512i*>(out), m_avx512); }
65
67 void store_be(uint8_t out[]) const { bswap().store_le(out); }
68
69 template <size_t ROT>
71 requires(ROT > 0 && ROT < 32)
72 {
73 return SIMD_16x32(_mm512_rol_epi32(m_avx512, ROT));
74 }
75
76 template <size_t ROT>
78 return this->rotl<32 - ROT>();
79 }
80
81 SIMD_16x32 BOTAN_AVX512_FN sigma0() const {
82 const SIMD_16x32 rot1 = this->rotr<2>();
83 const SIMD_16x32 rot2 = this->rotr<13>();
84 const SIMD_16x32 rot3 = this->rotr<22>();
85 return rot1 ^ rot2 ^ rot3;
86 }
87
88 SIMD_16x32 BOTAN_AVX512_FN sigma1() const {
89 const SIMD_16x32 rot1 = this->rotr<6>();
90 const SIMD_16x32 rot2 = this->rotr<11>();
91 const SIMD_16x32 rot3 = this->rotr<25>();
92 return rot1 ^ rot2 ^ rot3;
93 }
94
96 SIMD_16x32 operator+(const SIMD_16x32& other) const {
97 SIMD_16x32 retval(*this);
98 retval += other;
99 return retval;
100 }
101
103 SIMD_16x32 operator-(const SIMD_16x32& other) const {
104 SIMD_16x32 retval(*this);
105 retval -= other;
106 return retval;
107 }
108
110 SIMD_16x32 operator^(const SIMD_16x32& other) const {
111 SIMD_16x32 retval(*this);
112 retval ^= other;
113 return retval;
114 }
115
117 SIMD_16x32 operator|(const SIMD_16x32& other) const {
118 SIMD_16x32 retval(*this);
119 retval |= other;
120 return retval;
121 }
122
124 SIMD_16x32 operator&(const SIMD_16x32& other) const {
125 SIMD_16x32 retval(*this);
126 retval &= other;
127 return retval;
128 }
129
131 void operator+=(const SIMD_16x32& other) { m_avx512 = _mm512_add_epi32(m_avx512, other.m_avx512); }
132
134 void operator-=(const SIMD_16x32& other) { m_avx512 = _mm512_sub_epi32(m_avx512, other.m_avx512); }
135
137 void operator^=(const SIMD_16x32& other) { m_avx512 = _mm512_xor_si512(m_avx512, other.m_avx512); }
138
140 void operator^=(uint32_t other) { *this ^= SIMD_16x32::splat(other); }
141
143 void operator|=(const SIMD_16x32& other) { m_avx512 = _mm512_or_si512(m_avx512, other.m_avx512); }
144
146 void operator&=(const SIMD_16x32& other) { m_avx512 = _mm512_and_si512(m_avx512, other.m_avx512); }
147
148 template <int SHIFT>
150 return SIMD_16x32(_mm512_slli_epi32(m_avx512, SHIFT));
151 }
152
153 template <int SHIFT>
154 BOTAN_AVX512_FN SIMD_16x32 shr() const {
155 return SIMD_16x32(_mm512_srli_epi32(m_avx512, SHIFT));
156 }
157
159 SIMD_16x32 operator~() const { return SIMD_16x32(_mm512_xor_si512(m_avx512, _mm512_set1_epi32(0xFFFFFFFF))); }
160
161 // (~reg) & other
163 SIMD_16x32 andc(const SIMD_16x32& other) const {
164 return SIMD_16x32(_mm512_andnot_si512(m_avx512, other.m_avx512));
165 }
166
167 template <uint8_t TBL>
168 BOTAN_AVX512_FN static SIMD_16x32 ternary_fn(const SIMD_16x32& a, const SIMD_16x32& b, const SIMD_16x32& c) {
169 return _mm512_ternarylogic_epi32(a.raw(), b.raw(), c.raw(), TBL);
170 }
171
173 SIMD_16x32 bswap() const {
174 const uint8_t BSWAP_MASK[64] = {
175 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
176 21, 20, 27, 26, 25, 24, 31, 30, 29, 28, 35, 34, 33, 32, 39, 38, 37, 36, 43, 42, 41, 40,
177 47, 46, 45, 44, 51, 50, 49, 48, 55, 54, 53, 52, 59, 58, 57, 56, 63, 62, 61, 60,
178 };
179
180 const __m512i bswap = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(BSWAP_MASK));
181
182 const __m512i output = _mm512_shuffle_epi8(m_avx512, bswap);
183
184 return SIMD_16x32(output);
185 }
186
188 static void transpose(SIMD_16x32& B0, SIMD_16x32& B1, SIMD_16x32& B2, SIMD_16x32& B3) {
189 const __m512i T0 = _mm512_unpacklo_epi32(B0.m_avx512, B1.m_avx512);
190 const __m512i T1 = _mm512_unpacklo_epi32(B2.m_avx512, B3.m_avx512);
191 const __m512i T2 = _mm512_unpackhi_epi32(B0.m_avx512, B1.m_avx512);
192 const __m512i T3 = _mm512_unpackhi_epi32(B2.m_avx512, B3.m_avx512);
193
194 B0.m_avx512 = _mm512_unpacklo_epi64(T0, T1);
195 B1.m_avx512 = _mm512_unpackhi_epi64(T0, T1);
196 B2.m_avx512 = _mm512_unpacklo_epi64(T2, T3);
197 B3.m_avx512 = _mm512_unpackhi_epi64(T2, T3);
198 }
199
201 static void transpose(SIMD_16x32& B0,
202 SIMD_16x32& B1,
203 SIMD_16x32& B2,
204 SIMD_16x32& B3,
205 SIMD_16x32& B4,
206 SIMD_16x32& B5,
207 SIMD_16x32& B6,
208 SIMD_16x32& B7,
209 SIMD_16x32& B8,
210 SIMD_16x32& B9,
211 SIMD_16x32& BA,
212 SIMD_16x32& BB,
213 SIMD_16x32& BC,
214 SIMD_16x32& BD,
215 SIMD_16x32& BE,
216 SIMD_16x32& BF) {
217 auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());
218 auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());
219 auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());
220 auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());
221 auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());
222 auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());
223 auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());
224 auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());
225 auto t8 = _mm512_unpacklo_epi32(B8.raw(), B9.raw());
226 auto t9 = _mm512_unpackhi_epi32(B8.raw(), B9.raw());
227 auto ta = _mm512_unpacklo_epi32(BA.raw(), BB.raw());
228 auto tb = _mm512_unpackhi_epi32(BA.raw(), BB.raw());
229 auto tc = _mm512_unpacklo_epi32(BC.raw(), BD.raw());
230 auto td = _mm512_unpackhi_epi32(BC.raw(), BD.raw());
231 auto te = _mm512_unpacklo_epi32(BE.raw(), BF.raw());
232 auto tf = _mm512_unpackhi_epi32(BE.raw(), BF.raw());
233
234 auto r0 = _mm512_unpacklo_epi64(t0, t2);
235 auto r1 = _mm512_unpackhi_epi64(t0, t2);
236 auto r2 = _mm512_unpacklo_epi64(t1, t3);
237 auto r3 = _mm512_unpackhi_epi64(t1, t3);
238 auto r4 = _mm512_unpacklo_epi64(t4, t6);
239 auto r5 = _mm512_unpackhi_epi64(t4, t6);
240 auto r6 = _mm512_unpacklo_epi64(t5, t7);
241 auto r7 = _mm512_unpackhi_epi64(t5, t7);
242 auto r8 = _mm512_unpacklo_epi64(t8, ta);
243 auto r9 = _mm512_unpackhi_epi64(t8, ta);
244 auto ra = _mm512_unpacklo_epi64(t9, tb);
245 auto rb = _mm512_unpackhi_epi64(t9, tb);
246 auto rc = _mm512_unpacklo_epi64(tc, te);
247 auto rd = _mm512_unpackhi_epi64(tc, te);
248 auto re = _mm512_unpacklo_epi64(td, tf);
249 auto rf = _mm512_unpackhi_epi64(td, tf);
250
251 t0 = _mm512_shuffle_i32x4(r0, r4, 0x88);
252 t1 = _mm512_shuffle_i32x4(r1, r5, 0x88);
253 t2 = _mm512_shuffle_i32x4(r2, r6, 0x88);
254 t3 = _mm512_shuffle_i32x4(r3, r7, 0x88);
255 t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd);
256 t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd);
257 t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd);
258 t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd);
259 t8 = _mm512_shuffle_i32x4(r8, rc, 0x88);
260 t9 = _mm512_shuffle_i32x4(r9, rd, 0x88);
261 ta = _mm512_shuffle_i32x4(ra, re, 0x88);
262 tb = _mm512_shuffle_i32x4(rb, rf, 0x88);
263 tc = _mm512_shuffle_i32x4(r8, rc, 0xdd);
264 td = _mm512_shuffle_i32x4(r9, rd, 0xdd);
265 te = _mm512_shuffle_i32x4(ra, re, 0xdd);
266 tf = _mm512_shuffle_i32x4(rb, rf, 0xdd);
267
268 B0.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0x88);
269 B1.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0x88);
270 B2.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0x88);
271 B3.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0x88);
272 B4.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0x88);
273 B5.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0x88);
274 B6.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0x88);
275 B7.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0x88);
276 B8.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0xdd);
277 B9.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0xdd);
278 BA.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0xdd);
279 BB.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0xdd);
280 BC.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0xdd);
281 BD.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0xdd);
282 BE.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0xdd);
283 BF.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0xdd);
284 }
285
287 static SIMD_16x32 choose(const SIMD_16x32& mask, const SIMD_16x32& a, const SIMD_16x32& b) {
288 return SIMD_16x32::ternary_fn<0xca>(mask, a, b);
289 }
290
292 static SIMD_16x32 majority(const SIMD_16x32& x, const SIMD_16x32& y, const SIMD_16x32& z) {
293 return SIMD_16x32::ternary_fn<0xe8>(x, y, z);
294 }
295
// Clear vector registers by issuing _mm256_zeroall(). Only the "avx2" ISA
// is requested for this function, not the AVX-512 set used elsewhere.
296 BOTAN_FUNC_ISA("avx2") static void zero_registers() {
297 // Unfortunately this only zeros zmm0-zmm15 and not zmm16-zmm31
298 _mm256_zeroall();
299 }
300
301 __m512i BOTAN_AVX512_FN raw() const { return m_avx512; }
302
304 SIMD_16x32(__m512i x) : m_avx512(x) {}
305
306 private:
307 __m512i m_avx512;
308};
309
310template <size_t R>
312 return input.rotl<R>();
313}
314
315template <size_t R>
317 return input.rotr<R>();
318}
319
320// For Serpent:
321template <size_t S>
322inline SIMD_16x32 shl(SIMD_16x32 input) {
323 return input.shl<S>();
324}
325
326} // namespace Botan
327
328#endif
SIMD_16x32(SIMD_16x32 &&other)=default
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t BA
Definition simd_avx512.h:43
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t B6
Definition simd_avx512.h:39
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t BF
Definition simd_avx512.h:48
SIMD_16x32 & operator=(SIMD_16x32 &&other)=default
uint32_t uint32_t uint32_t B3
Definition simd_avx512.h:36
uint32_t uint32_t B2
Definition simd_avx512.h:35
uint32_t uint32_t uint32_t uint32_t B4
Definition simd_avx512.h:37
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t BD
Definition simd_avx512.h:46
SIMD_16x32 & operator=(const SIMD_16x32 &other)=default
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t BB
Definition simd_avx512.h:44
SIMD_16x32(const SIMD_16x32 &other)=default
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t B9
Definition simd_avx512.h:42
uint32_t uint32_t uint32_t uint32_t uint32_t B5
Definition simd_avx512.h:38
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t B8
Definition simd_avx512.h:41
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t BE
Definition simd_avx512.h:47
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t B7
Definition simd_avx512.h:40
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t BC
Definition simd_avx512.h:45
int(* final)(unsigned char *, CTX *)
#define BOTAN_FUNC_ISA(isa)
Definition compiler.h:42
#define BOTAN_FORCE_INLINE
Definition compiler.h:71
ASN1_Type operator|(ASN1_Type x, ASN1_Type y)
Definition asn1_obj.h:75
constexpr T rotl(T input)
Definition rotate.h:21
OctetString operator^(const OctetString &k1, const OctetString &k2)
Definition symkey.cpp:109
OctetString operator+(const OctetString &k1, const OctetString &k2)
Definition symkey.cpp:99
constexpr T choose(T mask, T a, T b)
Definition bit_ops.h:204
constexpr T rotr(T input)
Definition rotate.h:33
BigInt operator-(const BigInt &x, const BigInt &y)
Definition bigint.h:1094
constexpr auto operator|=(Strong< T1, Tags... > &a, T2 b)
constexpr auto store_le(ParamTs &&... params)
Definition loadstor.h:764
constexpr auto operator&=(Strong< T1, Tags... > &a, T2 b)
constexpr T majority(T a, T b, T c)
Definition bit_ops.h:210
std::vector< uint8_t, Alloc > & operator^=(std::vector< uint8_t, Alloc > &out, const std::vector< uint8_t, Alloc2 > &in)
Definition mem_ops.h:446
std::vector< T, Alloc > & operator+=(std::vector< T, Alloc > &out, const std::vector< T, Alloc2 > &in)
Definition secmem.h:80
SIMD_4x32 shl(SIMD_4x32 input)
Definition simd_32.h:634
constexpr auto load_le(ParamTs &&... params)
Definition loadstor.h:521
constexpr auto operator-=(Strong< T1, Tags... > &a, T2 b)
const SIMD_8x32 & b
constexpr auto store_be(ParamTs &&... params)
Definition loadstor.h:773
constexpr auto load_be(ParamTs &&... params)
Definition loadstor.h:530
ECIES_Flags operator&(ECIES_Flags a, ECIES_Flags b)
Definition ecies.h:55
#define BOTAN_AVX512_FN
Definition simd_avx512.h:16