Botan 3.9.0
Crypto and TLS for C++
simd_4x32.h
1/*
2* Lightweight wrappers for SIMD (4x32 bit) operations
3* (C) 2009,2011,2016,2017,2019,2025 Jack Lloyd
4*
5* Botan is released under the Simplified BSD License (see license.txt)
6*/
7
8#ifndef BOTAN_SIMD_4X32_H_
9#define BOTAN_SIMD_4X32_H_
10
11#include <botan/compiler.h>
12#include <botan/types.h>
13#include <botan/internal/isa_extn.h>
14#include <botan/internal/target_info.h>
15#include <span>
16
17#if defined(BOTAN_TARGET_ARCH_SUPPORTS_SSSE3)
18 #include <emmintrin.h>
19 #include <tmmintrin.h>
20 #define BOTAN_SIMD_USE_SSSE3
21
22#elif defined(BOTAN_TARGET_ARCH_SUPPORTS_ALTIVEC)
23 #include <botan/internal/loadstor.h>
24 #include <altivec.h>
25 #undef vector
26 #undef bool
27 #define BOTAN_SIMD_USE_ALTIVEC
28 #ifdef __VSX__
29 #define BOTAN_SIMD_USE_VSX
30 #endif
31
32#elif defined(BOTAN_TARGET_ARCH_SUPPORTS_NEON)
33 #include <arm_neon.h>
34 #include <bit>
35 #define BOTAN_SIMD_USE_NEON
36
37#elif defined(BOTAN_TARGET_ARCH_SUPPORTS_LSX)
38 #include <lsxintrin.h>
39 #define BOTAN_SIMD_USE_LSX
40
41#else
42 #error "No SIMD instruction set enabled"
43#endif
44
45namespace Botan {
46
47#if defined(BOTAN_SIMD_USE_SSSE3) || defined(BOTAN_SIMD_USE_LSX)
48using native_simd_type = __m128i;
49#elif defined(BOTAN_SIMD_USE_ALTIVEC)
50using native_simd_type = __vector unsigned int;
51#elif defined(BOTAN_SIMD_USE_NEON)
52using native_simd_type = uint32x4_t;
53#endif
54
55// NOLINTBEGIN(portability-simd-intrinsics)
56
57/**
58* 4x32 bit SIMD register
59*
60* This class is not a general purpose SIMD type; it offers only the operations
61* needed to evaluate specific crypto primitives. For example, it currently has
62* no equality operators of any kind.
63*
64* Implemented for SSE2, VMX (Altivec), ARMv7/Aarch64 NEON, and LoongArch LSX
65*/
66class SIMD_4x32 final {
67 public:
68 SIMD_4x32& operator=(const SIMD_4x32& other) = default;
69 SIMD_4x32(const SIMD_4x32& other) = default;
70
71 SIMD_4x32& operator=(SIMD_4x32&& other) = default;
72 SIMD_4x32(SIMD_4x32&& other) = default;
73
74 ~SIMD_4x32() = default;
75
76 /* NOLINTBEGIN(*-prefer-member-initializer) */
77
78 /**
79 * Zero initialize SIMD register with 4 32-bit elements
80 */
81 SIMD_4x32() noexcept {
82#if defined(BOTAN_SIMD_USE_SSSE3)
83 m_simd = _mm_setzero_si128();
84#elif defined(BOTAN_SIMD_USE_ALTIVEC)
85 m_simd = vec_splat_u32(0);
86#elif defined(BOTAN_SIMD_USE_NEON)
87 m_simd = vdupq_n_u32(0);
88#elif defined(BOTAN_SIMD_USE_LSX)
89 m_simd = __lsx_vldi(0);
90#endif
91 }
92
93 /**
94 * Load SIMD register with 4 32-bit elements
95 */
96 SIMD_4x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3) noexcept {
97#if defined(BOTAN_SIMD_USE_SSSE3)
98 m_simd = _mm_set_epi32(B3, B2, B1, B0);
99#elif defined(BOTAN_SIMD_USE_ALTIVEC)
100 __vector unsigned int val = {B0, B1, B2, B3};
101 m_simd = val;
102#elif defined(BOTAN_SIMD_USE_NEON)
103 // Better way to do this?
104 const uint32_t B[4] = {B0, B1, B2, B3};
105 m_simd = vld1q_u32(B);
106#elif defined(BOTAN_SIMD_USE_LSX)
107 // Better way to do this?
108 const uint32_t B[4] = {B0, B1, B2, B3};
109 m_simd = __lsx_vld(B, 0);
110#endif
111 }
112
113 /* NOLINTEND(*-prefer-member-initializer) */
114
115 /**
116 * Load SIMD register with one 32-bit element repeated
117 */
118 static SIMD_4x32 splat(uint32_t B) noexcept {
119#if defined(BOTAN_SIMD_USE_SSSE3)
120 return SIMD_4x32(_mm_set1_epi32(B));
121#elif defined(BOTAN_SIMD_USE_NEON)
122 return SIMD_4x32(vdupq_n_u32(B));
123#elif defined(BOTAN_SIMD_USE_LSX)
124 return SIMD_4x32(__lsx_vreplgr2vr_w(B));
125#else
126 return SIMD_4x32(B, B, B, B);
127#endif
128 }
129
130 /**
131 * Load SIMD register with one 8-bit element repeated
132 */
133 static SIMD_4x32 splat_u8(uint8_t B) noexcept {
134#if defined(BOTAN_SIMD_USE_SSSE3)
135 return SIMD_4x32(_mm_set1_epi8(B));
136#elif defined(BOTAN_SIMD_USE_NEON)
137 return SIMD_4x32(vreinterpretq_u32_u8(vdupq_n_u8(B)));
138#elif defined(BOTAN_SIMD_USE_LSX)
139 return SIMD_4x32(__lsx_vreplgr2vr_b(B));
140#else
141 const uint32_t B4 = make_uint32(B, B, B, B);
142 return SIMD_4x32(B4, B4, B4, B4);
143#endif
144 }
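   /*
    * Illustrative note (not part of the original header): splat() repeats one
    * 32-bit word and splat_u8() repeats one byte across the register, e.g.
    *
    *    SIMD_4x32::splat(0x01020304);  // == SIMD_4x32(0x01020304, 0x01020304, 0x01020304, 0x01020304)
    *    SIMD_4x32::splat_u8(0xAB);     // every byte equals 0xAB, i.e. splat(0xABABABAB)
    */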
145
146 /**
147 * Load a SIMD register with little-endian convention
148 */
149 static SIMD_4x32 load_le(const void* in) noexcept {
150#if defined(BOTAN_SIMD_USE_SSSE3)
151 return SIMD_4x32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(in)));
152#elif defined(BOTAN_SIMD_USE_ALTIVEC)
153 uint32_t R0 = Botan::load_le<uint32_t>(reinterpret_cast<const uint8_t*>(in), 0);
154 uint32_t R1 = Botan::load_le<uint32_t>(reinterpret_cast<const uint8_t*>(in), 1);
155 uint32_t R2 = Botan::load_le<uint32_t>(reinterpret_cast<const uint8_t*>(in), 2);
156 uint32_t R3 = Botan::load_le<uint32_t>(reinterpret_cast<const uint8_t*>(in), 3);
157 __vector unsigned int val = {R0, R1, R2, R3};
158 return SIMD_4x32(val);
159#elif defined(BOTAN_SIMD_USE_NEON)
160 SIMD_4x32 l(vld1q_u32(static_cast<const uint32_t*>(in)));
161 if constexpr(std::endian::native == std::endian::big) {
162 return l.bswap();
163 } else {
164 return l;
165 }
166#elif defined(BOTAN_SIMD_USE_LSX)
167 return SIMD_4x32(__lsx_vld(in, 0));
168#endif
169 }
170
171 /**
172 * Load a SIMD register with big-endian convention
173 */
174 static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 load_be(const void* in) noexcept {
175#if defined(BOTAN_SIMD_USE_SSSE3) || defined(BOTAN_SIMD_USE_LSX)
176 return load_le(in).bswap();
177
178#elif defined(BOTAN_SIMD_USE_ALTIVEC)
179 uint32_t R0 = Botan::load_be<uint32_t>(reinterpret_cast<const uint8_t*>(in), 0);
180 uint32_t R1 = Botan::load_be<uint32_t>(reinterpret_cast<const uint8_t*>(in), 1);
181 uint32_t R2 = Botan::load_be<uint32_t>(reinterpret_cast<const uint8_t*>(in), 2);
182 uint32_t R3 = Botan::load_be<uint32_t>(reinterpret_cast<const uint8_t*>(in), 3);
183 __vector unsigned int val = {R0, R1, R2, R3};
184 return SIMD_4x32(val);
185
186#elif defined(BOTAN_SIMD_USE_NEON)
187 SIMD_4x32 l(vld1q_u32(static_cast<const uint32_t*>(in)));
188 if constexpr(std::endian::native == std::endian::little) {
189 return l.bswap();
190 } else {
191 return l;
192 }
193#endif
194 }
195
196 static SIMD_4x32 load_le(std::span<const uint8_t, 16> in) { return SIMD_4x32::load_le(in.data()); }
197
198 static SIMD_4x32 load_be(std::span<const uint8_t, 16> in) { return SIMD_4x32::load_be(in.data()); }
199
200 void store_le(uint32_t out[4]) const noexcept { this->store_le(reinterpret_cast<uint8_t*>(out)); }
201
202 void store_be(uint32_t out[4]) const noexcept { this->store_be(reinterpret_cast<uint8_t*>(out)); }
203
204 void store_le(uint64_t out[2]) const noexcept { this->store_le(reinterpret_cast<uint8_t*>(out)); }
205
206 /**
207 * Store a SIMD register with little-endian convention
208 */
209 void store_le(uint8_t out[]) const noexcept {
210#if defined(BOTAN_SIMD_USE_SSSE3)
211
212 _mm_storeu_si128(reinterpret_cast<__m128i*>(out), raw());
213
214#elif defined(BOTAN_SIMD_USE_ALTIVEC)
215
216 union {
217 __vector unsigned int V;
218 uint32_t R[4];
219 } vec{};
220
221 // NOLINTNEXTLINE(*-union-access)
222 vec.V = raw();
223 // NOLINTNEXTLINE(*-union-access)
224 Botan::store_le(out, vec.R[0], vec.R[1], vec.R[2], vec.R[3]);
225
226#elif defined(BOTAN_SIMD_USE_NEON)
227 if constexpr(std::endian::native == std::endian::little) {
228 vst1q_u8(out, vreinterpretq_u8_u32(m_simd));
229 } else {
230 vst1q_u8(out, vreinterpretq_u8_u32(bswap().m_simd));
231 }
232#elif defined(BOTAN_SIMD_USE_LSX)
233 __lsx_vst(raw(), out, 0);
234#endif
235 }
236
237 /**
238 * Store a SIMD register with big-endian convention
239 */
240 BOTAN_FN_ISA_SIMD_4X32 void store_be(uint8_t out[]) const noexcept {
241#if defined(BOTAN_SIMD_USE_SSSE3) || defined(BOTAN_SIMD_USE_LSX)
242
243 bswap().store_le(out);
244
245#elif defined(BOTAN_SIMD_USE_ALTIVEC)
246
247 union {
248 __vector unsigned int V;
249 uint32_t R[4];
250 } vec{};
251
252 // NOLINTNEXTLINE(*-union-access)
253 vec.V = m_simd;
254 // NOLINTNEXTLINE(*-union-access)
255 Botan::store_be(out, vec.R[0], vec.R[1], vec.R[2], vec.R[3]);
256
257#elif defined(BOTAN_SIMD_USE_NEON)
258 if constexpr(std::endian::native == std::endian::little) {
259 vst1q_u8(out, vreinterpretq_u8_u32(bswap().m_simd));
260 } else {
261 vst1q_u8(out, vreinterpretq_u8_u32(m_simd));
262 }
263#endif
264 }
265
266 void store_be(std::span<uint8_t, 16> out) const { this->store_be(out.data()); }
267
268 void store_le(std::span<uint8_t, 16> out) const { this->store_le(out.data()); }
269
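   /*
    * Usage sketch (illustrative, not part of the original header): load_le
    * followed by store_le is the identity on the 16 input bytes, while mixing
    * the conventions byte swaps each 32-bit word.
    *
    *    const uint8_t in[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
    *    uint8_t out[16];
    *    SIMD_4x32::load_le(in).store_le(out);  // out == in
    *    SIMD_4x32::load_le(in).store_be(out);  // out == {4,3,2,1, 8,7,6,5, 12,11,10,9, 16,15,14,13}
    */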
270 /*
271 * This is used for SHA-2/SHACAL2
272 */
273 SIMD_4x32 sigma0() const noexcept {
274#if BOTAN_COMPILER_HAS_BUILTIN(__builtin_crypto_vshasigmaw) && defined(_ARCH_PWR8)
275 return SIMD_4x32(__builtin_crypto_vshasigmaw(raw(), 1, 0));
276#else
277 const SIMD_4x32 rot1 = this->rotr<2>();
278 const SIMD_4x32 rot2 = this->rotr<13>();
279 const SIMD_4x32 rot3 = this->rotr<22>();
280 return (rot1 ^ rot2 ^ rot3);
281#endif
282 }
283
284 /*
285 * This is used for SHA-2/SHACAL2
286 */
287 SIMD_4x32 sigma1() const noexcept {
288#if BOTAN_COMPILER_HAS_BUILTIN(__builtin_crypto_vshasigmaw) && defined(_ARCH_PWR8)
289 return SIMD_4x32(__builtin_crypto_vshasigmaw(raw(), 1, 0xF));
290#else
291 const SIMD_4x32 rot1 = this->rotr<6>();
292 const SIMD_4x32 rot2 = this->rotr<11>();
293 const SIMD_4x32 rot3 = this->rotr<25>();
294 return (rot1 ^ rot2 ^ rot3);
295#endif
296 }
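   /*
    * Note (added for clarity): sigma0() and sigma1() compute the SHA-256
    * compression functions Sigma0 and Sigma1 from FIPS 180-4, as the
    * fallback implementations above show:
    *
    *    Sigma0(x) = rotr<2>(x) ^ rotr<13>(x) ^ rotr<22>(x)
    *    Sigma1(x) = rotr<6>(x) ^ rotr<11>(x) ^ rotr<25>(x)
    */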
297
298 /**
299 * Left rotation by a compile time constant
300 */
301 template <size_t ROT>
302 BOTAN_FN_ISA_SIMD_4X32 SIMD_4x32 rotl() const noexcept
303 requires(ROT > 0 && ROT < 32)
304 {
305#if defined(BOTAN_SIMD_USE_SSSE3)
306 if constexpr(ROT == 8) {
307 const auto shuf_rotl_8 = _mm_set_epi64x(0x0e0d0c0f0a09080b, 0x0605040702010003);
308 return SIMD_4x32(_mm_shuffle_epi8(raw(), shuf_rotl_8));
309 } else if constexpr(ROT == 16) {
310 const auto shuf_rotl_16 = _mm_set_epi64x(0x0d0c0f0e09080b0a, 0x0504070601000302);
311 return SIMD_4x32(_mm_shuffle_epi8(raw(), shuf_rotl_16));
312 } else if constexpr(ROT == 24) {
313 const auto shuf_rotl_24 = _mm_set_epi64x(0x0c0f0e0d080b0a09, 0x0407060500030201);
314 return SIMD_4x32(_mm_shuffle_epi8(raw(), shuf_rotl_24));
315 } else {
316 return SIMD_4x32(_mm_or_si128(_mm_slli_epi32(raw(), static_cast<int>(ROT)),
317 _mm_srli_epi32(raw(), static_cast<int>(32 - ROT))));
318 }
319
320#elif defined(BOTAN_SIMD_USE_ALTIVEC)
321
322 const unsigned int r = static_cast<unsigned int>(ROT);
323 __vector unsigned int rot = {r, r, r, r};
324 return SIMD_4x32(vec_rl(m_simd, rot));
325
326#elif defined(BOTAN_SIMD_USE_NEON)
327
328 #if defined(BOTAN_TARGET_ARCH_IS_ARM64)
329
330 if constexpr(ROT == 8) {
331 const uint8_t maskb[16] = {3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14};
332 const uint8x16_t mask = vld1q_u8(maskb);
333 return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(m_simd), mask)));
334 } else if constexpr(ROT == 16) {
335 return SIMD_4x32(vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(m_simd))));
336 }
337 #endif
338 return SIMD_4x32(
339 vorrq_u32(vshlq_n_u32(m_simd, static_cast<int>(ROT)), vshrq_n_u32(m_simd, static_cast<int>(32 - ROT))));
340#elif defined(BOTAN_SIMD_USE_LSX)
341 return SIMD_4x32(__lsx_vrotri_w(raw(), 32 - ROT));
342#endif
343 }
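   /*
    * Example (illustrative): each 32-bit element is rotated independently, e.g.
    *
    *    SIMD_4x32::splat(0x01234567).rotl<8>();  // == splat(0x23456701)
    */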
344
345 /**
346 * Right rotation by a compile time constant
347 */
348 template <size_t ROT>
349 SIMD_4x32 rotr() const noexcept {
350 return this->rotl<32 - ROT>();
351 }
352
353 /**
354 * Add elements of a SIMD vector
355 */
356 SIMD_4x32 operator+(const SIMD_4x32& other) const noexcept {
357 SIMD_4x32 retval(*this);
358 retval += other;
359 return retval;
360 }
361
362 /**
363 * Subtract elements of a SIMD vector
364 */
365 SIMD_4x32 operator-(const SIMD_4x32& other) const noexcept {
366 SIMD_4x32 retval(*this);
367 retval -= other;
368 return retval;
369 }
370
371 /**
372 * XOR elements of a SIMD vector
373 */
374 SIMD_4x32 operator^(const SIMD_4x32& other) const noexcept {
375 SIMD_4x32 retval(*this);
376 retval ^= other;
377 return retval;
378 }
379
380 /**
381 * Binary OR elements of a SIMD vector
382 */
383 SIMD_4x32 operator|(const SIMD_4x32& other) const noexcept {
384 SIMD_4x32 retval(*this);
385 retval |= other;
386 return retval;
387 }
388
389 /**
390 * Binary AND elements of a SIMD vector
391 */
392 SIMD_4x32 operator&(const SIMD_4x32& other) const noexcept {
393 SIMD_4x32 retval(*this);
394 retval &= other;
395 return retval;
396 }
397
398 void operator+=(const SIMD_4x32& other) noexcept {
399#if defined(BOTAN_SIMD_USE_SSSE3)
400 m_simd = _mm_add_epi32(m_simd, other.m_simd);
401#elif defined(BOTAN_SIMD_USE_ALTIVEC)
402 m_simd = vec_add(m_simd, other.m_simd);
403#elif defined(BOTAN_SIMD_USE_NEON)
404 m_simd = vaddq_u32(m_simd, other.m_simd);
405#elif defined(BOTAN_SIMD_USE_LSX)
406 m_simd = __lsx_vadd_w(m_simd, other.m_simd);
407#endif
408 }
409
410 void operator-=(const SIMD_4x32& other) noexcept {
411#if defined(BOTAN_SIMD_USE_SSSE3)
412 m_simd = _mm_sub_epi32(m_simd, other.m_simd);
413#elif defined(BOTAN_SIMD_USE_ALTIVEC)
414 m_simd = vec_sub(m_simd, other.m_simd);
415#elif defined(BOTAN_SIMD_USE_NEON)
416 m_simd = vsubq_u32(m_simd, other.m_simd);
417#elif defined(BOTAN_SIMD_USE_LSX)
418 m_simd = __lsx_vsub_w(m_simd, other.m_simd);
419#endif
420 }
421
422 void operator^=(const SIMD_4x32& other) noexcept {
423#if defined(BOTAN_SIMD_USE_SSSE3)
424 m_simd = _mm_xor_si128(m_simd, other.m_simd);
425#elif defined(BOTAN_SIMD_USE_ALTIVEC)
426 m_simd = vec_xor(m_simd, other.m_simd);
427#elif defined(BOTAN_SIMD_USE_NEON)
428 m_simd = veorq_u32(m_simd, other.m_simd);
429#elif defined(BOTAN_SIMD_USE_LSX)
430 m_simd = __lsx_vxor_v(m_simd, other.m_simd);
431#endif
432 }
433
434 void operator^=(uint32_t other) noexcept { *this ^= SIMD_4x32::splat(other); }
435
436 void operator|=(const SIMD_4x32& other) noexcept {
437#if defined(BOTAN_SIMD_USE_SSSE3)
438 m_simd = _mm_or_si128(m_simd, other.m_simd);
439#elif defined(BOTAN_SIMD_USE_ALTIVEC)
440 m_simd = vec_or(m_simd, other.m_simd);
441#elif defined(BOTAN_SIMD_USE_NEON)
442 m_simd = vorrq_u32(m_simd, other.m_simd);
443#elif defined(BOTAN_SIMD_USE_LSX)
444 m_simd = __lsx_vor_v(m_simd, other.m_simd);
445#endif
446 }
447
448 void operator&=(const SIMD_4x32& other) noexcept {
449#if defined(BOTAN_SIMD_USE_SSSE3)
450 m_simd = _mm_and_si128(m_simd, other.m_simd);
451#elif defined(BOTAN_SIMD_USE_ALTIVEC)
452 m_simd = vec_and(m_simd, other.m_simd);
453#elif defined(BOTAN_SIMD_USE_NEON)
454 m_simd = vandq_u32(m_simd, other.m_simd);
455#elif defined(BOTAN_SIMD_USE_LSX)
456 m_simd = __lsx_vand_v(m_simd, other.m_simd);
457#endif
458 }
459
460 template <int SHIFT>
461 SIMD_4x32 shl() const noexcept
462 requires(SHIFT > 0 && SHIFT < 32)
463 {
464#if defined(BOTAN_SIMD_USE_SSSE3)
465 return SIMD_4x32(_mm_slli_epi32(m_simd, SHIFT));
466
467#elif defined(BOTAN_SIMD_USE_ALTIVEC)
468 const unsigned int s = static_cast<unsigned int>(SHIFT);
469 const __vector unsigned int shifts = {s, s, s, s};
470 return SIMD_4x32(vec_sl(m_simd, shifts));
471#elif defined(BOTAN_SIMD_USE_NEON)
472 return SIMD_4x32(vshlq_n_u32(m_simd, SHIFT));
473#elif defined(BOTAN_SIMD_USE_LSX)
474 return SIMD_4x32(__lsx_vslli_w(m_simd, SHIFT));
475#endif
476 }
477
478 template <int SHIFT>
479 SIMD_4x32 shr() const noexcept {
480#if defined(BOTAN_SIMD_USE_SSSE3)
481 return SIMD_4x32(_mm_srli_epi32(m_simd, SHIFT));
482
483#elif defined(BOTAN_SIMD_USE_ALTIVEC)
484 const unsigned int s = static_cast<unsigned int>(SHIFT);
485 const __vector unsigned int shifts = {s, s, s, s};
486 return SIMD_4x32(vec_sr(m_simd, shifts));
487#elif defined(BOTAN_SIMD_USE_NEON)
488 return SIMD_4x32(vshrq_n_u32(m_simd, SHIFT));
489#elif defined(BOTAN_SIMD_USE_LSX)
490 return SIMD_4x32(__lsx_vsrli_w(m_simd, SHIFT));
491#endif
492 }
493
494 SIMD_4x32 operator~() const noexcept {
495#if defined(BOTAN_SIMD_USE_SSSE3)
496 return SIMD_4x32(_mm_xor_si128(m_simd, _mm_set1_epi32(0xFFFFFFFF)));
497#elif defined(BOTAN_SIMD_USE_ALTIVEC)
498 return SIMD_4x32(vec_nor(m_simd, m_simd));
499#elif defined(BOTAN_SIMD_USE_NEON)
500 return SIMD_4x32(vmvnq_u32(m_simd));
501#elif defined(BOTAN_SIMD_USE_LSX)
502 return SIMD_4x32(__lsx_vnor_v(m_simd, m_simd));
503#endif
504 }
505
506 // (~reg) & other
507 SIMD_4x32 andc(const SIMD_4x32& other) const noexcept {
508#if defined(BOTAN_SIMD_USE_SSSE3)
509 return SIMD_4x32(_mm_andnot_si128(m_simd, other.m_simd));
510#elif defined(BOTAN_SIMD_USE_ALTIVEC)
511 /*
512 AltiVec does arg1 & ~arg2 rather than SSE's ~arg1 & arg2
513 so swap the arguments
514 */
515 return SIMD_4x32(vec_andc(other.m_simd, m_simd));
516#elif defined(BOTAN_SIMD_USE_NEON)
517 // NEON is also a & ~b
518 return SIMD_4x32(vbicq_u32(other.m_simd, m_simd));
519#elif defined(BOTAN_SIMD_USE_LSX)
520 // LSX is ~a & b
521 return SIMD_4x32(__lsx_vandn_v(m_simd, other.m_simd));
522#endif
523 }
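   /*
    * Example (illustrative): x.andc(y) computes (~x) & y regardless of the
    * underlying instruction's argument order, e.g.
    *
    *    SIMD_4x32::splat(0x0F0F0F0F).andc(SIMD_4x32::splat(0x33333333));  // == splat(0x30303030)
    */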
524
525 /**
526 * Return copy *this with each word byte swapped
527 */
528 BOTAN_FN_ISA_SIMD_4X32 SIMD_4x32 bswap() const noexcept {
529#if defined(BOTAN_SIMD_USE_SSSE3)
530 const auto idx = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
531
532 return SIMD_4x32(_mm_shuffle_epi8(raw(), idx));
533#elif defined(BOTAN_SIMD_USE_ALTIVEC)
534 #ifdef BOTAN_SIMD_USE_VSX
535 return SIMD_4x32(vec_revb(m_simd));
536 #else
537 const __vector unsigned char rev[1] = {
538 {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12},
539 };
540
541 return SIMD_4x32(vec_perm(m_simd, m_simd, rev[0]));
542 #endif
543
544#elif defined(BOTAN_SIMD_USE_NEON)
545 return SIMD_4x32(vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(m_simd))));
546#elif defined(BOTAN_SIMD_USE_LSX)
547 return SIMD_4x32(__lsx_vshuf4i_b(m_simd, 0b00011011));
548#endif
549 }
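   /*
    * Example (illustrative): bswap() reverses the bytes within each 32-bit
    * word, so splat(0x01020304).bswap() == splat(0x04030201).
    */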
550
551 template <size_t I>
552 SIMD_4x32 shift_elems_left() const noexcept
553 requires(I <= 3)
554 {
555#if defined(BOTAN_SIMD_USE_SSSE3)
556 return SIMD_4x32(_mm_slli_si128(raw(), 4 * I));
557#elif defined(BOTAN_SIMD_USE_NEON)
558 return SIMD_4x32(vextq_u32(vdupq_n_u32(0), raw(), 4 - I));
559#elif defined(BOTAN_SIMD_USE_ALTIVEC)
560 const __vector unsigned int zero = vec_splat_u32(0);
561
562 const __vector unsigned char shuf[3] = {
563 {16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
564 {16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7},
565 {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3},
566 };
567
568 return SIMD_4x32(vec_perm(raw(), zero, shuf[I - 1]));
569#elif defined(BOTAN_SIMD_USE_LSX)
570 return SIMD_4x32(__lsx_vbsll_v(raw(), 4 * I));
571#endif
572 }
573
574 template <size_t I>
575 SIMD_4x32 shift_elems_right() const noexcept
576 requires(I <= 3)
577 {
578#if defined(BOTAN_SIMD_USE_SSSE3)
579 return SIMD_4x32(_mm_srli_si128(raw(), 4 * I));
580#elif defined(BOTAN_SIMD_USE_NEON)
581 return SIMD_4x32(vextq_u32(raw(), vdupq_n_u32(0), I));
582#elif defined(BOTAN_SIMD_USE_ALTIVEC)
583 const __vector unsigned int zero = vec_splat_u32(0);
584
585 const __vector unsigned char shuf[3] = {
586 {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
587 {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
588 {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
589 };
590
591 return SIMD_4x32(vec_perm(raw(), zero, shuf[I - 1]));
592#elif defined(BOTAN_SIMD_USE_LSX)
593 return SIMD_4x32(__lsx_vbsrl_v(raw(), 4 * I));
594#endif
595 }
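   /*
    * Illustrative note: shift_elems_left/right move whole 32-bit elements
    * toward higher/lower positions, filling the vacated elements with zero.
    * With x = SIMD_4x32(A, B, C, D):
    *
    *    x.shift_elems_left<1>();   // == SIMD_4x32(0, A, B, C)
    *    x.shift_elems_right<1>();  // == SIMD_4x32(B, C, D, 0)
    */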
596
597 /**
598 * 4x4 Transposition on SIMD registers
599 */
600 static void transpose(SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) noexcept {
601#if defined(BOTAN_SIMD_USE_SSSE3)
602 const __m128i T0 = _mm_unpacklo_epi32(B0.m_simd, B1.m_simd);
603 const __m128i T1 = _mm_unpacklo_epi32(B2.m_simd, B3.m_simd);
604 const __m128i T2 = _mm_unpackhi_epi32(B0.m_simd, B1.m_simd);
605 const __m128i T3 = _mm_unpackhi_epi32(B2.m_simd, B3.m_simd);
606
607 B0.m_simd = _mm_unpacklo_epi64(T0, T1);
608 B1.m_simd = _mm_unpackhi_epi64(T0, T1);
609 B2.m_simd = _mm_unpacklo_epi64(T2, T3);
610 B3.m_simd = _mm_unpackhi_epi64(T2, T3);
611#elif defined(BOTAN_SIMD_USE_ALTIVEC)
612 const __vector unsigned int T0 = vec_mergeh(B0.m_simd, B2.m_simd);
613 const __vector unsigned int T1 = vec_mergeh(B1.m_simd, B3.m_simd);
614 const __vector unsigned int T2 = vec_mergel(B0.m_simd, B2.m_simd);
615 const __vector unsigned int T3 = vec_mergel(B1.m_simd, B3.m_simd);
616
617 B0.m_simd = vec_mergeh(T0, T1);
618 B1.m_simd = vec_mergel(T0, T1);
619 B2.m_simd = vec_mergeh(T2, T3);
620 B3.m_simd = vec_mergel(T2, T3);
621
622#elif defined(BOTAN_SIMD_USE_NEON) && defined(BOTAN_TARGET_ARCH_IS_ARM32)
623 const uint32x4x2_t T0 = vzipq_u32(B0.m_simd, B2.m_simd);
624 const uint32x4x2_t T1 = vzipq_u32(B1.m_simd, B3.m_simd);
625 const uint32x4x2_t O0 = vzipq_u32(T0.val[0], T1.val[0]);
626 const uint32x4x2_t O1 = vzipq_u32(T0.val[1], T1.val[1]);
627
628 B0.m_simd = O0.val[0];
629 B1.m_simd = O0.val[1];
630 B2.m_simd = O1.val[0];
631 B3.m_simd = O1.val[1];
632
633#elif defined(BOTAN_SIMD_USE_NEON) && defined(BOTAN_TARGET_ARCH_IS_ARM64)
634 const uint32x4_t T0 = vzip1q_u32(B0.m_simd, B2.m_simd);
635 const uint32x4_t T2 = vzip2q_u32(B0.m_simd, B2.m_simd);
636 const uint32x4_t T1 = vzip1q_u32(B1.m_simd, B3.m_simd);
637 const uint32x4_t T3 = vzip2q_u32(B1.m_simd, B3.m_simd);
638
639 B0.m_simd = vzip1q_u32(T0, T1);
640 B1.m_simd = vzip2q_u32(T0, T1);
641 B2.m_simd = vzip1q_u32(T2, T3);
642 B3.m_simd = vzip2q_u32(T2, T3);
643#elif defined(BOTAN_SIMD_USE_LSX)
644 const __m128i T0 = __lsx_vilvl_w(B2.raw(), B0.raw());
645 const __m128i T1 = __lsx_vilvh_w(B2.raw(), B0.raw());
646 const __m128i T2 = __lsx_vilvl_w(B3.raw(), B1.raw());
647 const __m128i T3 = __lsx_vilvh_w(B3.raw(), B1.raw());
648 B0.m_simd = __lsx_vilvl_w(T2, T0);
649 B1.m_simd = __lsx_vilvh_w(T2, T0);
650 B2.m_simd = __lsx_vilvl_w(T3, T1);
651 B3.m_simd = __lsx_vilvh_w(T3, T1);
652#endif
653 }
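   /*
    * Example (illustrative): treating each register as a matrix row,
    *
    *    SIMD_4x32 r0(0, 1, 2, 3), r1(4, 5, 6, 7), r2(8, 9, 10, 11), r3(12, 13, 14, 15);
    *    SIMD_4x32::transpose(r0, r1, r2, r3);
    *    // r0 == (0, 4, 8, 12), r1 == (1, 5, 9, 13), r2 == (2, 6, 10, 14), r3 == (3, 7, 11, 15)
    */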
654
655 static inline SIMD_4x32 choose(const SIMD_4x32& mask, const SIMD_4x32& a, const SIMD_4x32& b) noexcept {
656#if defined(BOTAN_SIMD_USE_ALTIVEC)
657 return SIMD_4x32(vec_sel(b.raw(), a.raw(), mask.raw()));
658#elif defined(BOTAN_SIMD_USE_NEON)
659 return SIMD_4x32(vbslq_u32(mask.raw(), a.raw(), b.raw()));
660#elif defined(BOTAN_SIMD_USE_LSX)
661 return SIMD_4x32(__lsx_vbitsel_v(b.raw(), a.raw(), mask.raw()));
662#else
663 return (mask & a) ^ mask.andc(b);
664#endif
665 }
666
667 static inline SIMD_4x32 majority(const SIMD_4x32& x, const SIMD_4x32& y, const SIMD_4x32& z) noexcept {
668 return SIMD_4x32::choose(x ^ y, z, y);
669 }
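   /*
    * Note (added for clarity): choose(mask, a, b) is a bitwise select,
    * (mask & a) | (~mask & b), and majority() is expressed through it: where
    * x and y agree, choose(x ^ y, z, y) returns y (== x), and where they
    * differ it returns z, which is exactly the bitwise majority of x, y, z.
    * These correspond to the SHA-2 Ch and Maj functions.
    */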
670
671 /**
672 * Byte shuffle
673 *
674 * This function assumes that each byte of idx is < 16; it may produce incorrect
675 * results if this does not hold.
676 */
677 static inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 byte_shuffle(const SIMD_4x32& tbl, const SIMD_4x32& idx) {
678#if defined(BOTAN_SIMD_USE_SSSE3)
679 return SIMD_4x32(_mm_shuffle_epi8(tbl.raw(), idx.raw()));
680#elif defined(BOTAN_SIMD_USE_NEON)
681 const uint8x16_t tbl8 = vreinterpretq_u8_u32(tbl.raw());
682 const uint8x16_t idx8 = vreinterpretq_u8_u32(idx.raw());
683
684 #if defined(BOTAN_TARGET_ARCH_IS_ARM32)
685 const uint8x8x2_t tbl2 = {vget_low_u8(tbl8), vget_high_u8(tbl8)};
686
687 return SIMD_4x32(
688 vreinterpretq_u32_u8(vcombine_u8(vtbl2_u8(tbl2, vget_low_u8(idx8)), vtbl2_u8(tbl2, vget_high_u8(idx8)))));
689 #else
690 return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(tbl8, idx8)));
691 #endif
692
693#elif defined(BOTAN_SIMD_USE_ALTIVEC)
694 const auto r = vec_perm(reinterpret_cast<__vector signed char>(tbl.raw()),
695 reinterpret_cast<__vector signed char>(tbl.raw()),
696 reinterpret_cast<__vector unsigned char>(idx.raw()));
697 return SIMD_4x32(reinterpret_cast<__vector unsigned int>(r));
698#elif defined(BOTAN_SIMD_USE_LSX)
699 return SIMD_4x32(__lsx_vshuf_b(tbl.raw(), tbl.raw(), idx.raw()));
700#endif
701 }
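   /*
    * Example (illustrative): output byte i is tbl[idx[i]], so an all-zero
    * index broadcasts the lowest byte of tbl:
    *
    *    SIMD_4x32::byte_shuffle(tbl, SIMD_4x32::splat_u8(0));  // every byte == byte 0 of tbl
    */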
702
703 /**
704 * Byte shuffle with masking
705 *
706 * If the index is >= 128 then the output byte is set to zero.
707 *
708 * Warning: for indices between 16 and 128 this function may have different
709 * behaviors depending on the CPU; possibly the output is zero, tbl[idx % 16],
710 * or even undefined.
711 */
712 inline static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 masked_byte_shuffle(const SIMD_4x32& tbl, const SIMD_4x32& idx) {
713#if defined(BOTAN_SIMD_USE_ALTIVEC)
714 const auto zero = vec_splat_s8(0x00);
715 const auto mask = vec_cmplt(reinterpret_cast<__vector signed char>(idx.raw()), zero);
716 const auto r = vec_perm(reinterpret_cast<__vector signed char>(tbl.raw()),
717 reinterpret_cast<__vector signed char>(tbl.raw()),
718 reinterpret_cast<__vector unsigned char>(idx.raw()));
719 return SIMD_4x32(reinterpret_cast<__vector unsigned int>(vec_sel(r, zero, mask)));
720#elif defined(BOTAN_SIMD_USE_LSX)
721 /*
722 * The behavior of vshuf.b unfortunately differs among microarchitectures
723 * when the index is larger than the available elements. In LA664 CPUs,
724 * larger indices result in a zero byte, which is exactly what we want.
725 * Unfortunately on LA464 machines, the output is instead undefined.
726 *
727 * So we must use a slower sequence that handles the larger indices.
728 * If we had a way of knowing at compile time that we are on an LA664
729 * or later, we could use __lsx_vshuf_b without the comparison or select.
730 */
731 const auto zero = __lsx_vldi(0);
732 const auto r = __lsx_vshuf_b(zero, tbl.raw(), idx.raw());
733 const auto mask = __lsx_vslti_bu(idx.raw(), 16);
734 return SIMD_4x32(__lsx_vbitsel_v(zero, r, mask));
735#else
736 // ARM and x86 byte shuffles have the behavior we want for out of range idx
737 return SIMD_4x32::byte_shuffle(tbl, idx);
738#endif
739 }
740
741 static inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 alignr4(const SIMD_4x32& a, const SIMD_4x32& b) {
742#if defined(BOTAN_SIMD_USE_SSSE3)
743 return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), 4));
744#elif defined(BOTAN_SIMD_USE_NEON)
745 return SIMD_4x32(vextq_u32(b.raw(), a.raw(), 1));
746#elif defined(BOTAN_SIMD_USE_ALTIVEC)
747 const __vector unsigned char mask = {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19};
748 return SIMD_4x32(vec_perm(b.raw(), a.raw(), mask));
749#elif defined(BOTAN_SIMD_USE_LSX)
750 const auto mask = SIMD_4x32(0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110);
751 return SIMD_4x32(__lsx_vshuf_b(a.raw(), b.raw(), mask.raw()));
752#endif
753 }
754
755 static inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 alignr8(const SIMD_4x32& a, const SIMD_4x32& b) {
756#if defined(BOTAN_SIMD_USE_SSSE3)
757 return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), 8));
758#elif defined(BOTAN_SIMD_USE_NEON)
759 return SIMD_4x32(vextq_u32(b.raw(), a.raw(), 2));
760#elif defined(BOTAN_SIMD_USE_ALTIVEC)
761 const __vector unsigned char mask = {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
762 return SIMD_4x32(vec_perm(b.raw(), a.raw(), mask));
763#elif defined(BOTAN_SIMD_USE_LSX)
764 return SIMD_4x32(__lsx_vshuf4i_d(a.raw(), b.raw(), 0b0011));
765#endif
766 }
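   /*
    * Illustrative note: alignr4/alignr8 return the middle of the concatenation
    * b:a, advanced by one or two 32-bit elements. With a = SIMD_4x32(A0, A1, A2, A3)
    * and b = SIMD_4x32(B0, B1, B2, B3):
    *
    *    alignr4(a, b);  // == SIMD_4x32(B1, B2, B3, A0)
    *    alignr8(a, b);  // == SIMD_4x32(B2, B3, A0, A1)
    */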
767
768 native_simd_type raw() const noexcept { return m_simd; }
769
770 explicit SIMD_4x32(native_simd_type x) noexcept : m_simd(x) {}
771
772 private:
773 native_simd_type m_simd;
774};
775
776// NOLINTEND(portability-simd-intrinsics)
777
778template <size_t R>
779inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 rotl(SIMD_4x32 input) {
780 return input.rotl<R>();
781}
782
783template <size_t R>
784inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 rotr(SIMD_4x32 input) {
785 return input.rotr<R>();
786}
787
788// For Serpent:
789template <size_t S>
790inline SIMD_4x32 shl(SIMD_4x32 input) {
791 return input.shl<S>();
792}
793
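/*
 * Usage sketch (illustrative only; quarter_round is hypothetical and not part
 * of this header): a ChaCha-style quarter round expressed with SIMD_4x32 and
 * the rotl<> helper above.
 *
 *    void quarter_round(SIMD_4x32& a, SIMD_4x32& b, SIMD_4x32& c, SIMD_4x32& d) {
 *       a += b;  d ^= a;  d = rotl<16>(d);
 *       c += d;  b ^= c;  b = rotl<12>(b);
 *       a += b;  d ^= a;  d = rotl<8>(d);
 *       c += d;  b ^= c;  b = rotl<7>(b);
 *    }
 */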
794} // namespace Botan
795
796#endif