Botan 3.4.0
Crypto and TLS for C++
simd_avx2.h
Go to the documentation of this file.
1/*
2* (C) 2018 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#ifndef BOTAN_SIMD_AVX2_H_
8#define BOTAN_SIMD_AVX2_H_
9
10#include <botan/types.h>
11#include <immintrin.h>
12
13namespace Botan {
14
15#define BOTAN_AVX2_FN BOTAN_FUNC_ISA("avx2")
16
18 public:
19 SIMD_8x32& operator=(const SIMD_8x32& other) = default;
20 SIMD_8x32(const SIMD_8x32& other) = default;
21
22 SIMD_8x32& operator=(SIMD_8x32&& other) = default;
23 SIMD_8x32(SIMD_8x32&& other) = default;
24
25 ~SIMD_8x32() = default;
26
28 BOTAN_FORCE_INLINE SIMD_8x32() noexcept { m_avx2 = _mm256_setzero_si256(); }
29
31 explicit SIMD_8x32(const uint32_t B[8]) noexcept {
32 m_avx2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(B));
33 }
34
36 explicit SIMD_8x32(uint32_t B0,
37 uint32_t B1,
38 uint32_t B2,
39 uint32_t B3,
40 uint32_t B4,
41 uint32_t B5,
42 uint32_t B6,
43 uint32_t B7) noexcept {
44 m_avx2 = _mm256_set_epi32(B7, B6, B5, B4, B3, B2, B1, B0);
45 }
46
48 static SIMD_8x32 splat(uint32_t B) noexcept { return SIMD_8x32(_mm256_set1_epi32(B)); }
49
51 static SIMD_8x32 load_le(const uint8_t* in) noexcept {
52 return SIMD_8x32(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in)));
53 }
54
56 static SIMD_8x32 load_be(const uint8_t* in) noexcept { return load_le(in).bswap(); }
57
59 void store_le(uint8_t out[]) const noexcept { _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_avx2); }
60
62 void store_be(uint8_t out[]) const noexcept { bswap().store_le(out); }
63
// Rotate every 32-bit lane left by ROT bits; the requires-clause restricts
// ROT to the range 1..31.
// NOTE(review): the declarator line between the template header and the
// requires-clause is missing from this listing. Presumably this is the member
// rotl<ROT>() that the free function rotl<R>() in this file forwards to --
// confirm against the original header.
64 template <size_t ROT>
66 requires(ROT > 0 && ROT < 32)
67 {
68#if defined(__AVX512VL__)
// With AVX-512VL the rotate is a single intrinsic.
69 return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT));
70#else
// Rotations by a whole number of bytes are done as a byte shuffle. The same
// 128-bit control pattern is given for both halves of the register.
71 if constexpr(ROT == 8) {
72 const __m256i shuf_rotl_8 =
73 _mm256_set_epi64x(0x0e0d0c0f'0a09080b, 0x06050407'02010003, 0x0e0d0c0f'0a09080b, 0x06050407'02010003);
74
75 return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8));
76 } else if constexpr(ROT == 16) {
77 const __m256i shuf_rotl_16 =
78 _mm256_set_epi64x(0x0d0c0f0e'09080b0a, 0x05040706'01000302, 0x0d0c0f0e'09080b0a, 0x05040706'01000302);
79
80 return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16));
81 } else {
// Generic case: (x << ROT) | (x >> (32 - ROT)) on each lane.
82 return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)),
83 _mm256_srli_epi32(m_avx2, static_cast<int>(32 - ROT))));
84 }
85#endif
86 }
87
// Implemented as a left rotation by (32 - ROT) bits.
// NOTE(review): the declarator line is missing from this listing. Presumably
// this is the member rotr<ROT>() that the free function rotr<R>() in this
// file forwards to -- confirm against the original header.
88 template <size_t ROT>
90 return this->rotl<32 - ROT>();
91 }
92
// Returns rotr<2>() ^ rotr<13>() ^ rotr<22>() of this vector.
// NOTE(review): the declarator line is missing from this listing, so the
// function name is not visible here. The rotation amounts 2/13/22 match the
// SHA-256 "big sigma 0" function -- presumably that helper; confirm.
94 const SIMD_8x32 rot1 = this->rotr<2>();
95 const SIMD_8x32 rot2 = this->rotr<13>();
96 const SIMD_8x32 rot3 = this->rotr<22>();
97 return rot1 ^ rot2 ^ rot3;
98 }
99
// Returns rotr<6>() ^ rotr<11>() ^ rotr<25>() of this vector.
// NOTE(review): the declarator line is missing from this listing, so the
// function name is not visible here. The rotation amounts 6/11/25 match the
// SHA-256 "big sigma 1" function -- presumably that helper; confirm.
101 const SIMD_8x32 rot1 = this->rotr<6>();
102 const SIMD_8x32 rot2 = this->rotr<11>();
103 const SIMD_8x32 rot3 = this->rotr<25>();
104 return rot1 ^ rot2 ^ rot3;
105 }
106
108 SIMD_8x32 operator+(const SIMD_8x32& other) const noexcept {
109 SIMD_8x32 retval(*this);
110 retval += other;
111 return retval;
112 }
113
115 SIMD_8x32 operator-(const SIMD_8x32& other) const noexcept {
116 SIMD_8x32 retval(*this);
117 retval -= other;
118 return retval;
119 }
120
122 SIMD_8x32 operator^(const SIMD_8x32& other) const noexcept {
123 SIMD_8x32 retval(*this);
124 retval ^= other;
125 return retval;
126 }
127
129 SIMD_8x32 operator|(const SIMD_8x32& other) const noexcept {
130 SIMD_8x32 retval(*this);
131 retval |= other;
132 return retval;
133 }
134
136 SIMD_8x32 operator&(const SIMD_8x32& other) const noexcept {
137 SIMD_8x32 retval(*this);
138 retval &= other;
139 return retval;
140 }
141
143 void operator+=(const SIMD_8x32& other) { m_avx2 = _mm256_add_epi32(m_avx2, other.m_avx2); }
144
146 void operator-=(const SIMD_8x32& other) { m_avx2 = _mm256_sub_epi32(m_avx2, other.m_avx2); }
147
149 void operator^=(const SIMD_8x32& other) { m_avx2 = _mm256_xor_si256(m_avx2, other.m_avx2); }
150
152 void operator^=(uint32_t other) { *this ^= SIMD_8x32::splat(other); }
153
155 void operator|=(const SIMD_8x32& other) { m_avx2 = _mm256_or_si256(m_avx2, other.m_avx2); }
156
158 void operator&=(const SIMD_8x32& other) { m_avx2 = _mm256_and_si256(m_avx2, other.m_avx2); }
159
// Shift every 32-bit lane left by SHIFT bits.
// NOTE(review): the declarator line is missing from this listing. Presumably
// this is the member shl<SHIFT>() that the free function shl<S>() in this
// file forwards to -- confirm against the original header.
160 template <int SHIFT>
162 return SIMD_8x32(_mm256_slli_epi32(m_avx2, SHIFT));
163 }
164
165 template <int SHIFT>
166 BOTAN_AVX2_FN SIMD_8x32 shr() const noexcept {
167 return SIMD_8x32(_mm256_srli_epi32(m_avx2, SHIFT));
168 }
169
171 SIMD_8x32 operator~() const noexcept {
172 return SIMD_8x32(_mm256_xor_si256(m_avx2, _mm256_set1_epi32(0xFFFFFFFF)));
173 }
174
175 // (~reg) & other
177 SIMD_8x32 andc(const SIMD_8x32& other) const noexcept {
178 return SIMD_8x32(_mm256_andnot_si256(m_avx2, other.m_avx2));
179 }
180
182 SIMD_8x32 bswap() const noexcept {
183 const uint8_t BSWAP_MASK[32] = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
184 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
185
186 const __m256i bswap = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(BSWAP_MASK));
187
188 const __m256i output = _mm256_shuffle_epi8(m_avx2, bswap);
189
190 return SIMD_8x32(output);
191 }
192
194 static void transpose(SIMD_8x32& B0, SIMD_8x32& B1, SIMD_8x32& B2, SIMD_8x32& B3) noexcept {
195 const __m256i T0 = _mm256_unpacklo_epi32(B0.m_avx2, B1.m_avx2);
196 const __m256i T1 = _mm256_unpacklo_epi32(B2.m_avx2, B3.m_avx2);
197 const __m256i T2 = _mm256_unpackhi_epi32(B0.m_avx2, B1.m_avx2);
198 const __m256i T3 = _mm256_unpackhi_epi32(B2.m_avx2, B3.m_avx2);
199
200 B0.m_avx2 = _mm256_unpacklo_epi64(T0, T1);
201 B1.m_avx2 = _mm256_unpackhi_epi64(T0, T1);
202 B2.m_avx2 = _mm256_unpacklo_epi64(T2, T3);
203 B3.m_avx2 = _mm256_unpackhi_epi64(T2, T3);
204 }
205
207 static void transpose(SIMD_8x32& B0,
208 SIMD_8x32& B1,
209 SIMD_8x32& B2,
210 SIMD_8x32& B3,
211 SIMD_8x32& B4,
212 SIMD_8x32& B5,
213 SIMD_8x32& B6,
214 SIMD_8x32& B7) noexcept {
215 transpose(B0, B1, B2, B3);
216 transpose(B4, B5, B6, B7);
217
218 swap_tops(B0, B4);
219 swap_tops(B1, B5);
220 swap_tops(B2, B6);
221 swap_tops(B3, B7);
222 }
223
225 static SIMD_8x32 choose(const SIMD_8x32& mask, const SIMD_8x32& a, const SIMD_8x32& b) noexcept {
226#if defined(__AVX512VL__)
227 return _mm256_ternarylogic_epi32(mask.handle(), a.handle(), b.handle(), 0xca);
228#else
229 return (mask & a) ^ mask.andc(b);
230#endif
231 }
232
234 static SIMD_8x32 majority(const SIMD_8x32& x, const SIMD_8x32& y, const SIMD_8x32& z) noexcept {
235#if defined(__AVX512VL__)
236 return _mm256_ternarylogic_epi32(x.handle(), y.handle(), z.handle(), 0xe8);
237#else
238 return SIMD_8x32::choose(x ^ y, z, y);
239#endif
240 }
241
/**
* Issue _mm256_zeroupper(), which clears the upper 128 bits of the YMM
* registers.
*/
static void reset_registers() noexcept {
   _mm256_zeroupper();
}
244
/**
* Issue _mm256_zeroall(), which clears the full contents of the YMM
* registers.
*/
static void zero_registers() noexcept {
   _mm256_zeroall();
}
247
248 __m256i BOTAN_AVX2_FN handle() const noexcept { return m_avx2; }
249
251 SIMD_8x32(__m256i x) noexcept : m_avx2(x) {}
252
253 private:
255 static void swap_tops(SIMD_8x32& A, SIMD_8x32& B) {
256 SIMD_8x32 T0 = _mm256_permute2x128_si256(A.handle(), B.handle(), 0 + (2 << 4));
257 SIMD_8x32 T1 = _mm256_permute2x128_si256(A.handle(), B.handle(), 1 + (3 << 4));
258 A = T0;
259 B = T1;
260 }
261
262 __m256i m_avx2;
263};
264
265template <size_t R>
266inline SIMD_8x32 rotl(SIMD_8x32 input) {
267 return input.rotl<R>();
268}
269
270template <size_t R>
271inline SIMD_8x32 rotr(SIMD_8x32 input) {
272 return input.rotr<R>();
273}
274
275// For Serpent:
276template <size_t S>
277inline SIMD_8x32 shl(SIMD_8x32 input) {
278 return input.shl<S>();
279}
280
281} // namespace Botan
282
283#endif
SIMD_8x32 & operator=(SIMD_8x32 &&other)=default
SIMD_8x32(const SIMD_8x32 &other)=default
uint32_t uint32_t B2
Definition simd_avx2.h:38
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t B6
Definition simd_avx2.h:42
~SIMD_8x32()=default
uint32_t uint32_t uint32_t uint32_t uint32_t B5
Definition simd_avx2.h:41
SIMD_8x32(SIMD_8x32 &&other)=default
uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t uint32_t B7 noexcept
Definition simd_avx2.h:43
SIMD_8x32 & operator=(const SIMD_8x32 &other)=default
uint32_t uint32_t uint32_t B3
Definition simd_avx2.h:39
uint32_t uint32_t uint32_t uint32_t B4
Definition simd_avx2.h:40
int(* final)(unsigned char *, CTX *)
#define BOTAN_FORCE_INLINE
Definition compiler.h:165
ASN1_Type operator|(ASN1_Type x, ASN1_Type y)
Definition asn1_obj.h:74
constexpr T rotl(T input)
Definition rotate.h:21
OctetString operator^(const OctetString &k1, const OctetString &k2)
Definition symkey.cpp:109
OctetString operator+(const OctetString &k1, const OctetString &k2)
Definition symkey.cpp:99
constexpr T choose(T mask, T a, T b)
Definition bit_ops.h:180
constexpr T rotr(T input)
Definition rotate.h:33
BigInt operator-(const BigInt &x, const BigInt &y)
Definition bigint.h:963
constexpr auto operator|=(Strong< T1, Tags... > &a, T2 b)
constexpr auto store_le(ParamTs &&... params)
Definition loadstor.h:702
constexpr auto operator&=(Strong< T1, Tags... > &a, T2 b)
constexpr T majority(T a, T b, T c)
Definition bit_ops.h:186
std::vector< uint8_t, Alloc > & operator^=(std::vector< uint8_t, Alloc > &out, const std::vector< uint8_t, Alloc2 > &in)
Definition mem_ops.h:447
std::vector< T, Alloc > & operator+=(std::vector< T, Alloc > &out, const std::vector< T, Alloc2 > &in)
Definition secmem.h:80
SIMD_4x32 shl(SIMD_4x32 input)
Definition simd_32.h:626
constexpr auto load_le(ParamTs &&... params)
Definition loadstor.h:462
constexpr auto operator-=(Strong< T1, Tags... > &a, T2 b)
constexpr auto store_be(ParamTs &&... params)
Definition loadstor.h:711
constexpr auto load_be(ParamTs &&... params)
Definition loadstor.h:471
ECIES_Flags operator&(ECIES_Flags a, ECIES_Flags b)
Definition ecies.h:49
#define BOTAN_AVX2_FN
Definition simd_avx2.h:15