Botan 3.5.0
Crypto and TLS for C&&
simd_avx512.h
Go to the documentation of this file.
1/*
2* (C) 2023 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#ifndef BOTAN_SIMD_AVX512_H_
8#define BOTAN_SIMD_AVX512_H_
9
10#include <botan/types.h>
11#include <immintrin.h>
12
13namespace Botan {
14
15#define BOTAN_AVX512_FN BOTAN_FUNC_ISA("avx512f,avx512dq,avx512bw")
16
18 public:
19 SIMD_16x32& operator=(const SIMD_16x32& other) = default;
20 SIMD_16x32(const SIMD_16x32& other) = default;
21
22 SIMD_16x32& operator=(SIMD_16x32&& other) = default;
23 SIMD_16x32(SIMD_16x32&& other) = default;
24
26 BOTAN_FORCE_INLINE SIMD_16x32() { m_avx512 = _mm512_setzero_si512(); }
27
29 explicit SIMD_16x32(const uint32_t B[16]) { m_avx512 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(B)); }
30
32 explicit SIMD_16x32(uint32_t B0,
33 uint32_t B1,
34 uint32_t B2,
35 uint32_t B3,
36 uint32_t B4,
37 uint32_t B5,
38 uint32_t B6,
39 uint32_t B7,
40 uint32_t B8,
41 uint32_t B9,
42 uint32_t BA,
43 uint32_t BB,
44 uint32_t BC,
45 uint32_t BD,
46 uint32_t BE,
47 uint32_t BF) {
48 m_avx512 = _mm512_set_epi32(BF, BE, BD, BC, BB, BA, B9, B8, B7, B6, B5, B4, B3, B2, B1, B0);
49 }
50
52 static SIMD_16x32 splat(uint32_t B) { return SIMD_16x32(_mm512_set1_epi32(B)); }
53
55 static SIMD_16x32 load_le(const uint8_t* in) {
56 return SIMD_16x32(_mm512_loadu_si512(reinterpret_cast<const __m512i*>(in)));
57 }
58
60 static SIMD_16x32 load_be(const uint8_t* in) { return load_le(in).bswap(); }
61
63 void store_le(uint8_t out[]) const { _mm512_storeu_si512(reinterpret_cast<__m512i*>(out), m_avx512); }
64
66 void store_be(uint8_t out[]) const { bswap().store_le(out); }
67
68 template <size_t ROT>
70 requires(ROT > 0 && ROT < 32)
71 {
72 return SIMD_16x32(_mm512_rol_epi32(m_avx512, ROT));
73 }
74
75 template <size_t ROT>
77 return this->rotl<32 - ROT>();
78 }
79
80 SIMD_16x32 BOTAN_AVX512_FN sigma0() const {
81 const SIMD_16x32 rot1 = this->rotr<2>();
82 const SIMD_16x32 rot2 = this->rotr<13>();
83 const SIMD_16x32 rot3 = this->rotr<22>();
84 return rot1 ^ rot2 ^ rot3;
85 }
86
87 SIMD_16x32 BOTAN_AVX512_FN sigma1() const {
88 const SIMD_16x32 rot1 = this->rotr<6>();
89 const SIMD_16x32 rot2 = this->rotr<11>();
90 const SIMD_16x32 rot3 = this->rotr<25>();
91 return rot1 ^ rot2 ^ rot3;
92 }
93
95 SIMD_16x32 operator+(const SIMD_16x32& other) const {
96 SIMD_16x32 retval(*this);
97 retval += other;
98 return retval;
99 }
100
102 SIMD_16x32 operator-(const SIMD_16x32& other) const {
103 SIMD_16x32 retval(*this);
104 retval -= other;
105 return retval;
106 }
107
109 SIMD_16x32 operator^(const SIMD_16x32& other) const {
110 SIMD_16x32 retval(*this);
111 retval ^= other;
112 return retval;
113 }
114
116 SIMD_16x32 operator|(const SIMD_16x32& other) const {
117 SIMD_16x32 retval(*this);
118 retval |= other;
119 return retval;
120 }
121
123 SIMD_16x32 operator&(const SIMD_16x32& other) const {
124 SIMD_16x32 retval(*this);
125 retval &= other;
126 return retval;
127 }
128
130 void operator+=(const SIMD_16x32& other) { m_avx512 = _mm512_add_epi32(m_avx512, other.m_avx512); }
131
133 void operator-=(const SIMD_16x32& other) { m_avx512 = _mm512_sub_epi32(m_avx512, other.m_avx512); }
134
136 void operator^=(const SIMD_16x32& other) { m_avx512 = _mm512_xor_si512(m_avx512, other.m_avx512); }
137
139 void operator^=(uint32_t other) { *this ^= SIMD_16x32::splat(other); }
140
142 void operator|=(const SIMD_16x32& other) { m_avx512 = _mm512_or_si512(m_avx512, other.m_avx512); }
143
145 void operator&=(const SIMD_16x32& other) { m_avx512 = _mm512_and_si512(m_avx512, other.m_avx512); }
146
147 template <int SHIFT>
149 return SIMD_16x32(_mm512_slli_epi32(m_avx512, SHIFT));
150 }
151
152 template <int SHIFT>
153 BOTAN_AVX512_FN SIMD_16x32 shr() const {
154 return SIMD_16x32(_mm512_srli_epi32(m_avx512, SHIFT));
155 }
156
158 SIMD_16x32 operator~() const { return SIMD_16x32(_mm512_xor_si512(m_avx512, _mm512_set1_epi32(0xFFFFFFFF))); }
159
160 // (~reg) & other
162 SIMD_16x32 andc(const SIMD_16x32& other) const {
163 return SIMD_16x32(_mm512_andnot_si512(m_avx512, other.m_avx512));
164 }
165
166 template <uint8_t TBL>
167 BOTAN_AVX512_FN static SIMD_16x32 ternary_fn(const SIMD_16x32& a, const SIMD_16x32& b, const SIMD_16x32& c) {
168 return _mm512_ternarylogic_epi32(a.raw(), b.raw(), c.raw(), TBL);
169 }
170
172 SIMD_16x32 bswap() const {
173 const uint8_t BSWAP_MASK[64] = {
174 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
175 21, 20, 27, 26, 25, 24, 31, 30, 29, 28, 35, 34, 33, 32, 39, 38, 37, 36, 43, 42, 41, 40,
176 47, 46, 45, 44, 51, 50, 49, 48, 55, 54, 53, 52, 59, 58, 57, 56, 63, 62, 61, 60,
177 };
178
179 const __m512i bswap = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(BSWAP_MASK));
180
181 const __m512i output = _mm512_shuffle_epi8(m_avx512, bswap);
182
183 return SIMD_16x32(output);
184 }
185
187 static void transpose(SIMD_16x32& B0, SIMD_16x32& B1, SIMD_16x32& B2, SIMD_16x32& B3) {
188 const __m512i T0 = _mm512_unpacklo_epi32(B0.m_avx512, B1.m_avx512);
189 const __m512i T1 = _mm512_unpacklo_epi32(B2.m_avx512, B3.m_avx512);
190 const __m512i T2 = _mm512_unpackhi_epi32(B0.m_avx512, B1.m_avx512);
191 const __m512i T3 = _mm512_unpackhi_epi32(B2.m_avx512, B3.m_avx512);
192
193 B0.m_avx512 = _mm512_unpacklo_epi64(T0, T1);
194 B1.m_avx512 = _mm512_unpackhi_epi64(T0, T1);
195 B2.m_avx512 = _mm512_unpacklo_epi64(T2, T3);
196 B3.m_avx512 = _mm512_unpackhi_epi64(T2, T3);
197 }
198
200 static void transpose(SIMD_16x32& B0,
201 SIMD_16x32& B1,
202 SIMD_16x32& B2,
203 SIMD_16x32& B3,
204 SIMD_16x32& B4,
205 SIMD_16x32& B5,
206 SIMD_16x32& B6,
207 SIMD_16x32& B7,
208 SIMD_16x32& B8,
209 SIMD_16x32& B9,
210 SIMD_16x32& BA,
211 SIMD_16x32& BB,
212 SIMD_16x32& BC,
213 SIMD_16x32& BD,
214 SIMD_16x32& BE,
215 SIMD_16x32& BF) {
216 auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());
217 auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());
218 auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());
219 auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());
220 auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());
221 auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());
222 auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());
223 auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());
224 auto t8 = _mm512_unpacklo_epi32(B8.raw(), B9.raw());
225 auto t9 = _mm512_unpackhi_epi32(B8.raw(), B9.raw());
226 auto ta = _mm512_unpacklo_epi32(BA.raw(), BB.raw());
227 auto tb = _mm512_unpackhi_epi32(BA.raw(), BB.raw());
228 auto tc = _mm512_unpacklo_epi32(BC.raw(), BD.raw());
229 auto td = _mm512_unpackhi_epi32(BC.raw(), BD.raw());
230 auto te = _mm512_unpacklo_epi32(BE.raw(), BF.raw());
231 auto tf = _mm512_unpackhi_epi32(BE.raw(), BF.raw());
232
233 auto r0 = _mm512_unpacklo_epi64(t0, t2);
234 auto r1 = _mm512_unpackhi_epi64(t0, t2);
235 auto r2 = _mm512_unpacklo_epi64(t1, t3);
236 auto r3 = _mm512_unpackhi_epi64(t1, t3);
237 auto r4 = _mm512_unpacklo_epi64(t4, t6);
238 auto r5 = _mm512_unpackhi_epi64(t4, t6);
239 auto r6 = _mm512_unpacklo_epi64(t5, t7);
240 auto r7 = _mm512_unpackhi_epi64(t5, t7);
241 auto r8 = _mm512_unpacklo_epi64(t8, ta);
242 auto r9 = _mm512_unpackhi_epi64(t8, ta);
243 auto ra = _mm512_unpacklo_epi64(t9, tb);
244 auto rb = _mm512_unpackhi_epi64(t9, tb);
245 auto rc = _mm512_unpacklo_epi64(tc, te);
246 auto rd = _mm512_unpackhi_epi64(tc, te);
247 auto re = _mm512_unpacklo_epi64(td, tf);
248 auto rf = _mm512_unpackhi_epi64(td, tf);
249
250 t0 = _mm512_shuffle_i32x4(r0, r4, 0x88);
251 t1 = _mm512_shuffle_i32x4(r1, r5, 0x88);
252 t2 = _mm512_shuffle_i32x4(r2, r6, 0x88);
253 t3 = _mm512_shuffle_i32x4(r3, r7, 0x88);
254 t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd);
255 t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd);
256 t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd);
257 t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd);
258 t8 = _mm512_shuffle_i32x4(r8, rc, 0x88);
259 t9 = _mm512_shuffle_i32x4(r9, rd, 0x88);
260 ta = _mm512_shuffle_i32x4(ra, re, 0x88);
261 tb = _mm512_shuffle_i32x4(rb, rf, 0x88);
262 tc = _mm512_shuffle_i32x4(r8, rc, 0xdd);
263 td = _mm512_shuffle_i32x4(r9, rd, 0xdd);
264 te = _mm512_shuffle_i32x4(ra, re, 0xdd);
265 tf = _mm512_shuffle_i32x4(rb, rf, 0xdd);
266
267 B0.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0x88);
268 B1.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0x88);
269 B2.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0x88);
270 B3.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0x88);
271 B4.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0x88);
272 B5.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0x88);
273 B6.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0x88);
274 B7.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0x88);
275 B8.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0xdd);
276 B9.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0xdd);
277 BA.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0xdd);
278 BB.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0xdd);
279 BC.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0xdd);
280 BD.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0xdd);
281 BE.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0xdd);
282 BF.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0xdd);
283 }
284
286 static SIMD_16x32 choose(const SIMD_16x32& mask, const SIMD_16x32& a, const SIMD_16x32& b) {
287 return SIMD_16x32::ternary_fn<0xca>(mask, a, b);
288 }
289
291 static SIMD_16x32 majority(const SIMD_16x32& x, const SIMD_16x32& y, const SIMD_16x32& z) {
292 return SIMD_16x32::ternary_fn<0xe8>(x, y, z);
293 }
294
295 BOTAN_FUNC_ISA("avx2") static void zero_registers() {
296 // Unfortunately this only zeros zmm0-zmm15 and not zmm16-zmm32
297 _mm256_zeroall();
298 }
299
300 __m512i BOTAN_AVX512_FN raw() const { return m_avx512; }
301
303 SIMD_16x32(__m512i x) : m_avx512(x) {}
304
305 private:
306 __m512i m_avx512;
307};
308
309template <size_t R>
311 return input.rotl<R>();
312}
313
314template <size_t R>
316 return input.rotr<R>();
317}
318
319// For Serpent:
320template <size_t S>
321inline SIMD_16x32 shl(SIMD_16x32 input) {
322 return input.shl<S>();
323}
324
325} // namespace Botan
326
327#endif
SIMD_16x32(SIMD_16x32 &&other)=default
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t BA
Definition simd_avx512.h:42
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t B6
Definition simd_avx512.h:38
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t BF
Definition simd_avx512.h:47
SIMD_16x32 & operator=(SIMD_16x32 &&other)=default
uint32_t uint32_t uint32_t B3
Definition simd_avx512.h:35
uint32_t uint32_t B2
Definition simd_avx512.h:34
uint32_t uint32_t uint32_t uint32_t B4
Definition simd_avx512.h:36
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t BD
Definition simd_avx512.h:45
SIMD_16x32 & operator=(const SIMD_16x32 &other)=default
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t BB
Definition simd_avx512.h:43
SIMD_16x32(const SIMD_16x32 &other)=default
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t B9
Definition simd_avx512.h:41
uint32_t uint32_t uint32_t uint32_t uint32_t B5
Definition simd_avx512.h:37
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t B8
Definition simd_avx512.h:40
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t BE
Definition simd_avx512.h:46
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t B7
Definition simd_avx512.h:39
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t BC
Definition simd_avx512.h:44
int(* final)(unsigned char *, CTX *)
#define BOTAN_FUNC_ISA(isa)
Definition compiler.h:92
#define BOTAN_FORCE_INLINE
Definition compiler.h:165
ASN1_Type operator|(ASN1_Type x, ASN1_Type y)
Definition asn1_obj.h:75
constexpr T rotl(T input)
Definition rotate.h:21
OctetString operator^(const OctetString &k1, const OctetString &k2)
Definition symkey.cpp:109
OctetString operator+(const OctetString &k1, const OctetString &k2)
Definition symkey.cpp:99
constexpr T choose(T mask, T a, T b)
Definition bit_ops.h:193
constexpr T rotr(T input)
Definition rotate.h:33
BigInt operator-(const BigInt &x, const BigInt &y)
Definition bigint.h:1089
constexpr auto operator|=(Strong< T1, Tags... > &a, T2 b)
constexpr auto store_le(ParamTs &&... params)
Definition loadstor.h:698
constexpr auto operator&=(Strong< T1, Tags... > &a, T2 b)
constexpr T majority(T a, T b, T c)
Definition bit_ops.h:199
std::vector< uint8_t, Alloc > & operator^=(std::vector< uint8_t, Alloc > &out, const std::vector< uint8_t, Alloc2 > &in)
Definition mem_ops.h:445
std::vector< T, Alloc > & operator+=(std::vector< T, Alloc > &out, const std::vector< T, Alloc2 > &in)
Definition secmem.h:80
SIMD_4x32 shl(SIMD_4x32 input)
Definition simd_32.h:625
constexpr auto load_le(ParamTs &&... params)
Definition loadstor.h:458
constexpr auto operator-=(Strong< T1, Tags... > &a, T2 b)
constexpr auto store_be(ParamTs &&... params)
Definition loadstor.h:707
constexpr auto load_be(ParamTs &&... params)
Definition loadstor.h:467
ECIES_Flags operator&(ECIES_Flags a, ECIES_Flags b)
Definition ecies.h:50
#define BOTAN_AVX512_FN
Definition simd_avx512.h:15