#ifndef BOTAN_SIMD_32_H_
#define BOTAN_SIMD_32_H_

#include <botan/types.h>

#if defined(BOTAN_TARGET_SUPPORTS_SSE2)
   #include <emmintrin.h>
   #define BOTAN_SIMD_USE_SSE2

#elif defined(BOTAN_TARGET_SUPPORTS_ALTIVEC)
   #include <botan/internal/loadstor.h>

   #include <altivec.h>
   #undef vector
   #undef bool
   #define BOTAN_SIMD_USE_ALTIVEC

   // Guard reconstructed: enable VSX-only paths when the compiler targets VSX
   #if defined(__VSX__)
      #define BOTAN_SIMD_USE_VSX
   #endif

#elif defined(BOTAN_TARGET_SUPPORTS_NEON)
   #include <botan/internal/cpuid.h>

   #include <arm_neon.h>
   #define BOTAN_SIMD_USE_NEON

#else
   #error "No SIMD instruction set enabled"
#endif
#if defined(BOTAN_SIMD_USE_SSE2)
   #define BOTAN_SIMD_ISA "sse2"
   #define BOTAN_VPERM_ISA "ssse3"
   #define BOTAN_CLMUL_ISA "pclmul"
#elif defined(BOTAN_SIMD_USE_NEON)
   #if defined(BOTAN_TARGET_ARCH_IS_ARM64)
      #define BOTAN_SIMD_ISA "+simd"
      #define BOTAN_CLMUL_ISA "+crypto+aes"
   #else
      #define BOTAN_SIMD_ISA "fpu=neon"
   #endif
   #define BOTAN_VPERM_ISA BOTAN_SIMD_ISA
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
   #define BOTAN_SIMD_ISA "altivec"
   #define BOTAN_VPERM_ISA "altivec"
   #define BOTAN_CLMUL_ISA "crypto"
#endif
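/*
* Note (descriptive, not used within this header itself): the ISA strings
* above are consumed elsewhere in the library as function-level target
* attributes (e.g. __attribute__((target("ssse3"))) on GCC/Clang), so that
* individual functions can use an ISA extension the rest of the translation
* unit does not assume. A hypothetical annotated declaration might look like:
*
*    BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
*    SIMD_4x32 byte_shuffle(SIMD_4x32 x, SIMD_4x32 tbl);
*/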
namespace Botan {

#if defined(BOTAN_SIMD_USE_SSE2)
using native_simd_type = __m128i;
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
using native_simd_type = __vector unsigned int;
#elif defined(BOTAN_SIMD_USE_NEON)
using native_simd_type = uint32x4_t;
#endif

/**
* 4x32 bit SIMD register
*
* This class is not a general purpose SIMD type; it only offers the
* operations needed to evaluate specific crypto primitives.
*/
class SIMD_4x32 final {
   public:
      SIMD_4x32& operator=(const SIMD_4x32& other) = default;
      SIMD_4x32(const SIMD_4x32& other) = default;

      SIMD_4x32& operator=(SIMD_4x32&& other) = default;
      SIMD_4x32(SIMD_4x32&& other) = default;
      /**
      * Zero initialize SIMD register with 4 32-bit elements
      */
      SIMD_4x32() noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         m_simd = _mm_setzero_si128();
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         m_simd = vec_splat_u32(0);
#elif defined(BOTAN_SIMD_USE_NEON)
         m_simd = vdupq_n_u32(0);
#endif
      }
      /**
      * Load SIMD register with 4 32-bit elements
      */
      explicit SIMD_4x32(const uint32_t B[4]) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         m_simd = _mm_loadu_si128(reinterpret_cast<const __m128i*>(B));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         __vector unsigned int val = {B[0], B[1], B[2], B[3]};
         m_simd = val;
#elif defined(BOTAN_SIMD_USE_NEON)
         m_simd = vld1q_u32(B);
#endif
      }
      /**
      * Load SIMD register with 4 32-bit elements
      */
      SIMD_4x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         m_simd = _mm_set_epi32(B3, B2, B1, B0);
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         __vector unsigned int val = {B0, B1, B2, B3};
         m_simd = val;
#elif defined(BOTAN_SIMD_USE_NEON)
         const uint32_t B[4] = {B0, B1, B2, B3};
         m_simd = vld1q_u32(B);
#endif
      }
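      // Note the lane order: SIMD_4x32(1, 2, 3, 4) places 1 in lane 0, so a
      // following store_le() emits 01 00 00 00 02 00 00 00 ... on every
      // backend. This is why _mm_set_epi32 above takes the arguments in
      // reverse: it assigns its first argument to the highest lane.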
      /**
      * Load SIMD register with one 32-bit element repeated
      */
      static SIMD_4x32 splat(uint32_t B) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_set1_epi32(B));
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vdupq_n_u32(B));
#else
         return SIMD_4x32(B, B, B, B);
#endif
      }
      /**
      * Load SIMD register with one 8-bit element repeated
      */
      static SIMD_4x32 splat_u8(uint8_t B) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_set1_epi8(B));
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vreinterpretq_u32_u8(vdupq_n_u8(B)));
#else
         const uint32_t B4 = make_uint32(B, B, B, B);
         return SIMD_4x32(B4, B4, B4, B4);
#endif
      }
      /**
      * Load a SIMD register with little-endian convention
      */
      static SIMD_4x32 load_le(const void* in) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(in)));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         uint32_t R[4];
         Botan::load_le(R, static_cast<const uint8_t*>(in), 4);
         return SIMD_4x32(R);
#elif defined(BOTAN_SIMD_USE_NEON)
         SIMD_4x32 l(vld1q_u32(static_cast<const uint32_t*>(in)));
         return CPUID::is_big_endian() ? l.bswap() : l;
#endif
      }
      /**
      * Load a SIMD register with big-endian convention
      */
      static SIMD_4x32 load_be(const void* in) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return load_le(in).bswap();
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         uint32_t R[4];
         Botan::load_be(R, static_cast<const uint8_t*>(in), 4);
         return SIMD_4x32(R);
#elif defined(BOTAN_SIMD_USE_NEON)
         SIMD_4x32 l(vld1q_u32(static_cast<const uint32_t*>(in)));
         return CPUID::is_little_endian() ? l.bswap() : l;
#endif
      }
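      // Example: for input bytes 00 00 00 01 ..., load_be() sets lane 0 to
      // 0x00000001, while load_le() would set it to 0x01000000.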
      void store_le(uint32_t out[4]) const noexcept { this->store_le(reinterpret_cast<uint8_t*>(out)); }

      void store_be(uint32_t out[4]) const noexcept { this->store_be(reinterpret_cast<uint8_t*>(out)); }

      void store_le(uint64_t out[2]) const noexcept { this->store_le(reinterpret_cast<uint8_t*>(out)); }
      /**
      * Store a SIMD register with little-endian convention
      */
      void store_le(uint8_t out[]) const noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         _mm_storeu_si128(reinterpret_cast<__m128i*>(out), raw());
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         union {
            __vector unsigned int V;
            uint32_t R[4];
         } vec;

         vec.V = raw();
         Botan::store_le(out, vec.R[0], vec.R[1], vec.R[2], vec.R[3]);
#elif defined(BOTAN_SIMD_USE_NEON)
         if(CPUID::is_little_endian()) {
            vst1q_u8(out, vreinterpretq_u8_u32(m_simd));
         } else {
            vst1q_u8(out, vreinterpretq_u8_u32(bswap().m_simd));
         }
#endif
      }
      /**
      * Store a SIMD register with big-endian convention
      */
      void store_be(uint8_t out[]) const noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         bswap().store_le(out);
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         union {
            __vector unsigned int V;
            uint32_t R[4];
         } vec;

         vec.V = raw();
         Botan::store_be(out, vec.R[0], vec.R[1], vec.R[2], vec.R[3]);
#elif defined(BOTAN_SIMD_USE_NEON)
         if(CPUID::is_little_endian()) {
            vst1q_u8(out, vreinterpretq_u8_u32(bswap().m_simd));
         } else {
            vst1q_u8(out, vreinterpretq_u8_u32(m_simd));
         }
#endif
      }
      /**
      * This is used for SHA-2/SHACAL2
      */
      SIMD_4x32 sigma0() const noexcept {
#if BOTAN_COMPILER_HAS_BUILTIN(__builtin_crypto_vshasigmaw) && defined(_ARCH_PWR8)
         return SIMD_4x32(__builtin_crypto_vshasigmaw(raw(), 1, 0));
#else
         const SIMD_4x32 rot1 = this->rotr<2>();
         const SIMD_4x32 rot2 = this->rotr<13>();
         const SIMD_4x32 rot3 = this->rotr<22>();
         return (rot1 ^ rot2 ^ rot3);
#endif
      }
      /**
      * This is used for SHA-2/SHACAL2
      */
      SIMD_4x32 sigma1() const noexcept {
#if BOTAN_COMPILER_HAS_BUILTIN(__builtin_crypto_vshasigmaw) && defined(_ARCH_PWR8)
         return SIMD_4x32(__builtin_crypto_vshasigmaw(raw(), 1, 0xF));
#else
         const SIMD_4x32 rot1 = this->rotr<6>();
         const SIMD_4x32 rot2 = this->rotr<11>();
         const SIMD_4x32 rot3 = this->rotr<25>();
         return (rot1 ^ rot2 ^ rot3);
#endif
      }
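      // sigma0/sigma1 above are the SHA-256 Σ0/Σ1 functions of FIPS 180-4:
      //    Σ0(x) = (x >>> 2) ^ (x >>> 13) ^ (x >>> 22)
      //    Σ1(x) = (x >>> 6) ^ (x >>> 11) ^ (x >>> 25)
      // POWER8's vshasigmaw builtin computes each in a single instruction;
      // all other targets fall back to three rotations and two XORs.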
      /**
      * Left rotation by a compile time constant
      */
      template <size_t ROT>
      SIMD_4x32 rotl() const noexcept
         requires(ROT > 0 && ROT < 32)
      {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_or_si128(_mm_slli_epi32(m_simd, static_cast<int>(ROT)),
                                       _mm_srli_epi32(m_simd, static_cast<int>(32 - ROT))));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         const unsigned int r = static_cast<unsigned int>(ROT);
         __vector unsigned int rot = {r, r, r, r};
         return SIMD_4x32(vec_rl(m_simd, rot));
#elif defined(BOTAN_SIMD_USE_NEON)
   #if defined(BOTAN_TARGET_ARCH_IS_ARM64)
         if constexpr(ROT == 8) {
            const uint8_t maskb[16] = {3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14};
            const uint8x16_t mask = vld1q_u8(maskb);
            return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(m_simd), mask)));
         } else if constexpr(ROT == 16) {
            return SIMD_4x32(vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(m_simd))));
         }
   #endif
         return SIMD_4x32(
            vorrq_u32(vshlq_n_u32(m_simd, static_cast<int>(ROT)), vshrq_n_u32(m_simd, static_cast<int>(32 - ROT))));
#endif
      }
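      // The ARMv8 special cases work because rotations by 8 and 16 are
      // byte-aligned: rotl<8>() is a fixed byte permutation (one vqtbl1q_u8)
      // and rotl<16>() is a halfword swap (one vrev32q_u16), each cheaper
      // than the generic two-shifts-plus-OR sequence.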
      /**
      * Right rotation by a compile time constant
      */
      template <size_t ROT>
      SIMD_4x32 rotr() const noexcept {
         return this->rotl<32 - ROT>();
      }
      /**
      * Add elements of a SIMD vector
      */
      SIMD_4x32 operator+(const SIMD_4x32& other) const noexcept {
         SIMD_4x32 retval(*this);
         retval += other;
         return retval;
      }

      /**
      * Subtract elements of a SIMD vector
      */
      SIMD_4x32 operator-(const SIMD_4x32& other) const noexcept {
         SIMD_4x32 retval(*this);
         retval -= other;
         return retval;
      }

      /**
      * XOR elements of a SIMD vector
      */
      SIMD_4x32 operator^(const SIMD_4x32& other) const noexcept {
         SIMD_4x32 retval(*this);
         retval ^= other;
         return retval;
      }

      /**
      * Binary OR elements of a SIMD vector
      */
      SIMD_4x32 operator|(const SIMD_4x32& other) const noexcept {
         SIMD_4x32 retval(*this);
         retval |= other;
         return retval;
      }

      /**
      * Binary AND elements of a SIMD vector
      */
      SIMD_4x32 operator&(const SIMD_4x32& other) const noexcept {
         SIMD_4x32 retval(*this);
         retval &= other;
         return retval;
      }

      void operator+=(const SIMD_4x32& other) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         m_simd = _mm_add_epi32(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         m_simd = vec_add(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_NEON)
         m_simd = vaddq_u32(m_simd, other.m_simd);
#endif
      }

      void operator-=(const SIMD_4x32& other) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         m_simd = _mm_sub_epi32(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         m_simd = vec_sub(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_NEON)
         m_simd = vsubq_u32(m_simd, other.m_simd);
#endif
      }

      void operator^=(const SIMD_4x32& other) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         m_simd = _mm_xor_si128(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         m_simd = vec_xor(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_NEON)
         m_simd = veorq_u32(m_simd, other.m_simd);
#endif
      }

      void operator^=(uint32_t other) noexcept { *this ^= SIMD_4x32::splat(other); }

      void operator|=(const SIMD_4x32& other) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         m_simd = _mm_or_si128(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         m_simd = vec_or(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_NEON)
         m_simd = vorrq_u32(m_simd, other.m_simd);
#endif
      }

      void operator&=(const SIMD_4x32& other) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         m_simd = _mm_and_si128(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         m_simd = vec_and(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_NEON)
         m_simd = vandq_u32(m_simd, other.m_simd);
#endif
      }
      template <size_t SHIFT>
      SIMD_4x32 shl() const noexcept
         requires(SHIFT > 0 && SHIFT < 32)
      {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_slli_epi32(m_simd, SHIFT));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         const unsigned int s = static_cast<unsigned int>(SHIFT);
         const __vector unsigned int shifts = {s, s, s, s};
         return SIMD_4x32(vec_sl(m_simd, shifts));
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vshlq_n_u32(m_simd, SHIFT));
#endif
      }
      template <size_t SHIFT>
      SIMD_4x32 shr() const noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_srli_epi32(m_simd, SHIFT));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         const unsigned int s = static_cast<unsigned int>(SHIFT);
         const __vector unsigned int shifts = {s, s, s, s};
         return SIMD_4x32(vec_sr(m_simd, shifts));
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vshrq_n_u32(m_simd, SHIFT));
#endif
      }
      SIMD_4x32 operator~() const noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_xor_si128(m_simd, _mm_set1_epi32(0xFFFFFFFF)));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         return SIMD_4x32(vec_nor(m_simd, m_simd));
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vmvnq_u32(m_simd));
#endif
      }
      // (~reg) & other
      SIMD_4x32 andc(const SIMD_4x32& other) const noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_andnot_si128(m_simd, other.m_simd));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         // AltiVec's andc computes arg1 & ~arg2 rather than SSE's ~arg1 & arg2,
         // so swap the arguments
         return SIMD_4x32(vec_andc(other.m_simd, m_simd));
#elif defined(BOTAN_SIMD_USE_NEON)
         // NEON's bic is also a & ~b
         return SIMD_4x32(vbicq_u32(other.m_simd, m_simd));
#endif
      }
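      // Scalar equivalent: x.andc(y) == (~x) & y. For example, mask.andc(b)
      // clears in b every bit that is set in mask, which is what choose()
      // below relies on for its generic fallback.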
      /**
      * Return a copy of *this with each word byte swapped
      */
      SIMD_4x32 bswap() const noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         __m128i T = m_simd;
         T = _mm_shufflehi_epi16(T, _MM_SHUFFLE(2, 3, 0, 1));
         T = _mm_shufflelo_epi16(T, _MM_SHUFFLE(2, 3, 0, 1));
         return SIMD_4x32(_mm_or_si128(_mm_srli_epi16(T, 8), _mm_slli_epi16(T, 8)));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
   #ifdef BOTAN_SIMD_USE_VSX
         return SIMD_4x32(vec_revb(m_simd));
   #else
         const __vector unsigned char rev[1] = {
            {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12},
         };
         return SIMD_4x32(vec_perm(m_simd, m_simd, rev[0]));
   #endif
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(m_simd))));
#endif
      }
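      // Example: a lane holding 0x01234567 becomes 0x67452301. SSE2 has no
      // single byte-reverse instruction, so it swaps the 16-bit words within
      // each lane via two shuffles, then swaps the bytes within each word
      // via paired 8-bit shifts.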
      template <size_t I>
      SIMD_4x32 shift_elems_left() const noexcept
         requires(I <= 3)
      {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_slli_si128(raw(), 4 * I));
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vextq_u32(vdupq_n_u32(0), raw(), 4 - I));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         const __vector unsigned int zero = vec_splat_u32(0);

         const __vector unsigned char shuf[3] = {
            {16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
            {16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7},
            {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3},
         };

         return SIMD_4x32(vec_perm(raw(), zero, shuf[I - 1]));
#endif
      }
      template <size_t I>
      SIMD_4x32 shift_elems_right() const noexcept
         requires(I <= 3)
      {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_srli_si128(raw(), 4 * I));
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vextq_u32(raw(), vdupq_n_u32(0), I));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         const __vector unsigned int zero = vec_splat_u32(0);

         const __vector unsigned char shuf[3] = {
            {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
            {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
            {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
         };

         return SIMD_4x32(vec_perm(raw(), zero, shuf[I - 1]));
#endif
      }
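      // Element shifts move whole 32-bit lanes and fill with zeros, e.g.
      //    SIMD_4x32(1, 2, 3, 4).shift_elems_left<1>()  == SIMD_4x32(0, 1, 2, 3)
      //    SIMD_4x32(1, 2, 3, 4).shift_elems_right<1>() == SIMD_4x32(2, 3, 4, 0)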
      /**
      * 4x4 transpose of the given SIMD registers
      */
      static void transpose(SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         const __m128i T0 = _mm_unpacklo_epi32(B0.m_simd, B1.m_simd);
         const __m128i T1 = _mm_unpacklo_epi32(B2.m_simd, B3.m_simd);
         const __m128i T2 = _mm_unpackhi_epi32(B0.m_simd, B1.m_simd);
         const __m128i T3 = _mm_unpackhi_epi32(B2.m_simd, B3.m_simd);

         B0.m_simd = _mm_unpacklo_epi64(T0, T1);
         B1.m_simd = _mm_unpackhi_epi64(T0, T1);
         B2.m_simd = _mm_unpacklo_epi64(T2, T3);
         B3.m_simd = _mm_unpackhi_epi64(T2, T3);
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         const __vector unsigned int T0 = vec_mergeh(B0.m_simd, B2.m_simd);
         const __vector unsigned int T1 = vec_mergeh(B1.m_simd, B3.m_simd);
         const __vector unsigned int T2 = vec_mergel(B0.m_simd, B2.m_simd);
         const __vector unsigned int T3 = vec_mergel(B1.m_simd, B3.m_simd);

         B0.m_simd = vec_mergeh(T0, T1);
         B1.m_simd = vec_mergel(T0, T1);
         B2.m_simd = vec_mergeh(T2, T3);
         B3.m_simd = vec_mergel(T2, T3);
#elif defined(BOTAN_SIMD_USE_NEON) && defined(BOTAN_TARGET_ARCH_IS_ARM32)
         const uint32x4x2_t T0 = vzipq_u32(B0.m_simd, B2.m_simd);
         const uint32x4x2_t T1 = vzipq_u32(B1.m_simd, B3.m_simd);
         const uint32x4x2_t O0 = vzipq_u32(T0.val[0], T1.val[0]);
         const uint32x4x2_t O1 = vzipq_u32(T0.val[1], T1.val[1]);

         B0.m_simd = O0.val[0];
         B1.m_simd = O0.val[1];
         B2.m_simd = O1.val[0];
         B3.m_simd = O1.val[1];
#elif defined(BOTAN_SIMD_USE_NEON) && defined(BOTAN_TARGET_ARCH_IS_ARM64)
         const uint32x4_t T0 = vzip1q_u32(B0.m_simd, B2.m_simd);
         const uint32x4_t T2 = vzip2q_u32(B0.m_simd, B2.m_simd);
         const uint32x4_t T1 = vzip1q_u32(B1.m_simd, B3.m_simd);
         const uint32x4_t T3 = vzip2q_u32(B1.m_simd, B3.m_simd);

         B0.m_simd = vzip1q_u32(T0, T1);
         B1.m_simd = vzip2q_u32(T0, T1);
         B2.m_simd = vzip1q_u32(T2, T3);
         B3.m_simd = vzip2q_u32(T2, T3);
#endif
      }
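      // Treating B0..B3 as the rows of a 4x4 matrix of 32-bit words,
      // transpose() leaves B0 holding the first column (B0[0], B1[0],
      // B2[0], B3[0]) and so on; applying it twice restores the originals.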
      static SIMD_4x32 choose(const SIMD_4x32& mask, const SIMD_4x32& a, const SIMD_4x32& b) noexcept {
#if defined(BOTAN_SIMD_USE_ALTIVEC)
         return SIMD_4x32(vec_sel(b.raw(), a.raw(), mask.raw()));
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vbslq_u32(mask.raw(), a.raw(), b.raw()));
#else
         return (mask & a) ^ mask.andc(b);
#endif
      }

      static SIMD_4x32 majority(const SIMD_4x32& x, const SIMD_4x32& y, const SIMD_4x32& z) noexcept {
         return SIMD_4x32::choose(x ^ y, z, y);
      }
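      // choose() is a branch-free bitwise select: each result bit is taken
      // from a where mask is set and from b where it is clear. majority()
      // builds SHA-256's Maj(x, y, z) from it: where x and y agree the
      // result is their shared value, elsewhere z decides.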
      native_simd_type raw() const noexcept { return m_simd; }

      explicit SIMD_4x32(native_simd_type x) noexcept : m_simd(x) {}

   private:
      native_simd_type m_simd;
};
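/*
* Usage sketch (illustrative only; the buffer and constants are made up):
*
*    uint8_t buf[16] = {0};
*    SIMD_4x32 v = SIMD_4x32::load_le(buf);
*    v += SIMD_4x32::splat(0x9E3779B9);
*    v = v.rotl<7>() ^ v.shift_elems_left<1>();
*    v.store_le(buf);
*/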
template <size_t R>
inline SIMD_4x32 rotl(SIMD_4x32 input) {
   return input.rotl<R>();
}

template <size_t R>
inline SIMD_4x32 rotr(SIMD_4x32 input) {
   return input.rotr<R>();
}

// For Serpent:
template <size_t S>
inline SIMD_4x32 shl(SIMD_4x32 input) {
   return input.shl<S>();
}

}  // namespace Botan

#endif