#ifndef BOTAN_SIMD_32_H_
#define BOTAN_SIMD_32_H_

#include <botan/types.h>

#if defined(BOTAN_TARGET_SUPPORTS_SSE2)
   #include <emmintrin.h>
   #define BOTAN_SIMD_USE_SSE2

#elif defined(BOTAN_TARGET_SUPPORTS_ALTIVEC)
   #include <botan/internal/loadstor.h>
   #include <altivec.h>
   #undef vector
   #undef bool
   #define BOTAN_SIMD_USE_ALTIVEC
   #if defined(__VSX__)
      #define BOTAN_SIMD_USE_VSX
   #endif

#elif defined(BOTAN_TARGET_SUPPORTS_NEON)
   #include <botan/internal/cpuid.h>
   #include <arm_neon.h>
   #define BOTAN_SIMD_USE_NEON

#else
   #error "No SIMD instruction set enabled"
#endif

#if defined(BOTAN_SIMD_USE_SSE2)
   #define BOTAN_SIMD_ISA "sse2"
   #define BOTAN_VPERM_ISA "ssse3"
   #define BOTAN_CLMUL_ISA "pclmul"
#elif defined(BOTAN_SIMD_USE_NEON)
   #if defined(BOTAN_TARGET_ARCH_IS_ARM64)
      #define BOTAN_SIMD_ISA "+simd"
      #define BOTAN_CLMUL_ISA "+crypto+aes"
   #else
      #define BOTAN_SIMD_ISA "fpu=neon"
   #endif
   #define BOTAN_VPERM_ISA BOTAN_SIMD_ISA
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
   #define BOTAN_SIMD_ISA "altivec"
   #define BOTAN_VPERM_ISA "altivec"
   #define BOTAN_CLMUL_ISA "crypto"
#endif
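
// The ISA strings above are consumed by per-function target attributes, so
// that individual routines can use instructions beyond the baseline the rest
// of the translation unit is compiled for. A minimal sketch of the intended
// usage, assuming a GCC/Clang-style target attribute (the wrapper macro name
// here is illustrative, not necessarily the one the library defines):
//
//    #define BOTAN_FUNC_ISA(isa) __attribute__((target(isa)))
//
//    BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
//    void vperm_round(/* ... */);   // may use SSSE3 byte shuffles on x86
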
namespace Botan {

#if defined(BOTAN_SIMD_USE_SSE2)
using native_simd_type = __m128i;
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
using native_simd_type = __vector unsigned int;
#elif defined(BOTAN_SIMD_USE_NEON)
using native_simd_type = uint32x4_t;
#endif

/**
* 4x32 bit SIMD register
*/
class SIMD_4x32 final {
   public:
      SIMD_4x32& operator=(const SIMD_4x32& other) = default;
      SIMD_4x32(const SIMD_4x32& other) = default;
      SIMD_4x32& operator=(SIMD_4x32&& other) = default;
      SIMD_4x32(SIMD_4x32&& other) = default;

      // Zero-initialize all four 32-bit elements
      SIMD_4x32() noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         m_simd = _mm_setzero_si128();
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         m_simd = vec_splat_u32(0);
#elif defined(BOTAN_SIMD_USE_NEON)
         m_simd = vdupq_n_u32(0);
#endif
      }

      // Load SIMD register with 4 32-bit elements
      SIMD_4x32(const uint32_t B[4]) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         m_simd = _mm_loadu_si128(reinterpret_cast<const __m128i*>(B));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         __vector unsigned int val = {B[0], B[1], B[2], B[3]};
         m_simd = val;
#elif defined(BOTAN_SIMD_USE_NEON)
         m_simd = vld1q_u32(B);
#endif
      }

      // Load SIMD register with 4 32-bit elements, B0 being the lowest lane
      SIMD_4x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         // _mm_set_epi32 takes its arguments from the highest lane down
         m_simd = _mm_set_epi32(B3, B2, B1, B0);
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         __vector unsigned int val = {B0, B1, B2, B3};
         m_simd = val;
#elif defined(BOTAN_SIMD_USE_NEON)
         const uint32_t B[4] = {B0, B1, B2, B3};
         m_simd = vld1q_u32(B);
#endif
      }
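
      // Illustrative note (not part of the original header): the two
      // element-wise constructors agree on lane order, so for example
      //
      //    const uint32_t arr[4] = {1, 2, 3, 4};
      //    SIMD_4x32 a(arr);
      //    SIMD_4x32 b(1, 2, 3, 4);   // same value as a
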
      // Broadcast a single 32-bit value to all four lanes
      static SIMD_4x32 splat(uint32_t B) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_set1_epi32(B));
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vdupq_n_u32(B));
#else
         return SIMD_4x32(B, B, B, B);
#endif
      }

      // Broadcast a single byte to all sixteen bytes of the register
      static SIMD_4x32 splat_u8(uint8_t B) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_set1_epi8(B));
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vreinterpretq_u32_u8(vdupq_n_u8(B)));
#else
         const uint32_t B4 = make_uint32(B, B, B, B);
         return SIMD_4x32(B4, B4, B4, B4);
#endif
      }

      // Load a SIMD register from little-endian bytes
      static SIMD_4x32 load_le(const void* in) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(in)));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         uint32_t R[4];
         Botan::load_le(R, static_cast<const uint8_t*>(in), 4);
         return SIMD_4x32(R);
#elif defined(BOTAN_SIMD_USE_NEON)
         SIMD_4x32 l(vld1q_u32(static_cast<const uint32_t*>(in)));
         return CPUID::is_big_endian() ? l.bswap() : l;
#endif
      }

      // Load a SIMD register from big-endian bytes
      static SIMD_4x32 load_be(const void* in) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return load_le(in).bswap();
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         uint32_t R[4];
         Botan::load_be(R, static_cast<const uint8_t*>(in), 4);
         return SIMD_4x32(R);
#elif defined(BOTAN_SIMD_USE_NEON)
         SIMD_4x32 l(vld1q_u32(static_cast<const uint32_t*>(in)));
         return CPUID::is_little_endian() ? l.bswap() : l;
#endif
      }

      void store_le(uint32_t out[4]) const noexcept { this->store_le(reinterpret_cast<uint8_t*>(out)); }

      void store_be(uint32_t out[4]) const noexcept { this->store_be(reinterpret_cast<uint8_t*>(out)); }

      void store_le(uint64_t out[2]) const noexcept { this->store_le(reinterpret_cast<uint8_t*>(out)); }

      // Store to little-endian byte order
      void store_le(uint8_t out[]) const noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         _mm_storeu_si128(reinterpret_cast<__m128i*>(out), raw());
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         union {
            __vector unsigned int V;
            uint32_t R[4];
         } vec;
         vec.V = raw();
         Botan::store_le(out, vec.R[0], vec.R[1], vec.R[2], vec.R[3]);
#elif defined(BOTAN_SIMD_USE_NEON)
         if(CPUID::is_little_endian()) {
            vst1q_u8(out, vreinterpretq_u8_u32(m_simd));
         } else {
            vst1q_u8(out, vreinterpretq_u8_u32(bswap().m_simd));
         }
#endif
      }

      // Store to big-endian byte order
      void store_be(uint8_t out[]) const noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         bswap().store_le(out);
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         union {
            __vector unsigned int V;
            uint32_t R[4];
         } vec;
         vec.V = raw();
         Botan::store_be(out, vec.R[0], vec.R[1], vec.R[2], vec.R[3]);
#elif defined(BOTAN_SIMD_USE_NEON)
         if(CPUID::is_little_endian()) {
            vst1q_u8(out, vreinterpretq_u8_u32(bswap().m_simd));
         } else {
            vst1q_u8(out, vreinterpretq_u8_u32(m_simd));
         }
#endif
      }

      // SHA-256's Σ0 function (FIPS 180-4): rotr<2> ^ rotr<13> ^ rotr<22>
      SIMD_4x32 sigma0() const noexcept {
#if BOTAN_COMPILER_HAS_BUILTIN(__builtin_crypto_vshasigmaw) && defined(_ARCH_PWR8)
         return SIMD_4x32(__builtin_crypto_vshasigmaw(raw(), 1, 0));
#else
         const SIMD_4x32 rot1 = this->rotr<2>();
         const SIMD_4x32 rot2 = this->rotr<13>();
         const SIMD_4x32 rot3 = this->rotr<22>();
         return (rot1 ^ rot2 ^ rot3);
#endif
      }

      // SHA-256's Σ1 function (FIPS 180-4): rotr<6> ^ rotr<11> ^ rotr<25>
      SIMD_4x32 sigma1() const noexcept {
#if BOTAN_COMPILER_HAS_BUILTIN(__builtin_crypto_vshasigmaw) && defined(_ARCH_PWR8)
         return SIMD_4x32(__builtin_crypto_vshasigmaw(raw(), 1, 0xF));
#else
         const SIMD_4x32 rot1 = this->rotr<6>();
         const SIMD_4x32 rot2 = this->rotr<11>();
         const SIMD_4x32 rot3 = this->rotr<25>();
         return (rot1 ^ rot2 ^ rot3);
#endif
      }
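
      // For cross-checking the Σ functions above, the scalar definitions from
      // FIPS 180-4 §4.1.2 are (illustrative, not part of the original header):
      //
      //    uint32_t Sigma0(uint32_t x) { return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22); }
      //    uint32_t Sigma1(uint32_t x) { return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25); }
      //
      // The POWER8 builtin __builtin_crypto_vshasigmaw(x, 1, mask) selects the
      // upper-case Σ family; the 4-bit mask picks Σ0 (0) or Σ1 (0xF) per lane.
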
      // Rotate each 32-bit element left by ROT bits
      template <size_t ROT>
         requires(ROT > 0 && ROT < 32)
      SIMD_4x32 rotl() const noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_or_si128(_mm_slli_epi32(m_simd, static_cast<int>(ROT)),
                                       _mm_srli_epi32(m_simd, static_cast<int>(32 - ROT))));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         const unsigned int r = static_cast<unsigned int>(ROT);
         __vector unsigned int rot = {r, r, r, r};
         return SIMD_4x32(vec_rl(m_simd, rot));
#elif defined(BOTAN_SIMD_USE_NEON)
   #if defined(BOTAN_TARGET_ARCH_IS_ARM64)
         // On AArch64, rotations by 8 or 16 are cheaper as byte permutes
         if constexpr(ROT == 8) {
            const uint8_t maskb[16] = {3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14};
            const uint8x16_t mask = vld1q_u8(maskb);
            return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(m_simd), mask)));
         } else if constexpr(ROT == 16) {
            return SIMD_4x32(vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(m_simd))));
         }
   #endif
         return SIMD_4x32(vorrq_u32(vshlq_n_u32(m_simd, static_cast<int>(ROT)),
                                    vshrq_n_u32(m_simd, static_cast<int>(32 - ROT))));
#endif
      }

      // Rotate each 32-bit element right by ROT bits
      template <size_t ROT>
      SIMD_4x32 rotr() const noexcept {
         return this->rotl<32 - ROT>();
      }
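
      // Illustrative usage (not part of the original header): the two
      // rotations compose as expected, e.g.
      //
      //    SIMD_4x32 v(0x01234567, 0, 0, 0);
      //    SIMD_4x32 w = v.rotl<8>();   // lowest lane becomes 0x23456701
      //    SIMD_4x32 u = w.rotr<8>();   // round-trips back to v
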
      void operator+=(const SIMD_4x32& other) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         m_simd = _mm_add_epi32(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         m_simd = vec_add(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_NEON)
         m_simd = vaddq_u32(m_simd, other.m_simd);
#endif
      }

      void operator-=(const SIMD_4x32& other) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         m_simd = _mm_sub_epi32(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         m_simd = vec_sub(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_NEON)
         m_simd = vsubq_u32(m_simd, other.m_simd);
#endif
      }

      void operator^=(const SIMD_4x32& other) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         m_simd = _mm_xor_si128(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         m_simd = vec_xor(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_NEON)
         m_simd = veorq_u32(m_simd, other.m_simd);
#endif
      }

      void operator^=(uint32_t other) noexcept { *this ^= SIMD_4x32::splat(other); }

      void operator|=(const SIMD_4x32& other) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         m_simd = _mm_or_si128(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         m_simd = vec_or(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_NEON)
         m_simd = vorrq_u32(m_simd, other.m_simd);
#endif
      }

      void operator&=(const SIMD_4x32& other) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         m_simd = _mm_and_si128(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         m_simd = vec_and(m_simd, other.m_simd);
#elif defined(BOTAN_SIMD_USE_NEON)
         m_simd = vandq_u32(m_simd, other.m_simd);
#endif
      }

      // Shift each 32-bit element left by SHIFT bits
      template <size_t SHIFT>
         requires(SHIFT > 0 && SHIFT < 32)
      SIMD_4x32 shl() const noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_slli_epi32(m_simd, SHIFT));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         const unsigned int s = static_cast<unsigned int>(SHIFT);
         const __vector unsigned int shifts = {s, s, s, s};
         return SIMD_4x32(vec_sl(m_simd, shifts));
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vshlq_n_u32(m_simd, SHIFT));
#endif
      }

      // Shift each 32-bit element right by SHIFT bits (logical shift)
      template <size_t SHIFT>
         requires(SHIFT > 0 && SHIFT < 32)
      SIMD_4x32 shr() const noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_srli_epi32(m_simd, SHIFT));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         const unsigned int s = static_cast<unsigned int>(SHIFT);
         const __vector unsigned int shifts = {s, s, s, s};
         return SIMD_4x32(vec_sr(m_simd, shifts));
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vshrq_n_u32(m_simd, SHIFT));
#endif
      }

      SIMD_4x32 operator~() const noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_xor_si128(m_simd, _mm_set1_epi32(0xFFFFFFFF)));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         return SIMD_4x32(vec_nor(m_simd, m_simd));
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vmvnq_u32(m_simd));
#endif
      }

      // Computes (~*this) & other
      SIMD_4x32 andc(const SIMD_4x32& other) const noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_andnot_si128(m_simd, other.m_simd));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         // AltiVec's vec_andc(a, b) computes a & ~b, the reverse argument
         // order of SSE2's _mm_andnot_si128
         return SIMD_4x32(vec_andc(other.m_simd, m_simd));
#elif defined(BOTAN_SIMD_USE_NEON)
         // NEON's vbicq_u32(a, b) likewise computes a & ~b
         return SIMD_4x32(vbicq_u32(other.m_simd, m_simd));
#endif
      }

      // Reverse the bytes within each 32-bit element
      SIMD_4x32 bswap() const noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         __m128i T = m_simd;
         T = _mm_shufflehi_epi16(T, _MM_SHUFFLE(2, 3, 0, 1));
         T = _mm_shufflelo_epi16(T, _MM_SHUFFLE(2, 3, 0, 1));
         return SIMD_4x32(_mm_or_si128(_mm_srli_epi16(T, 8), _mm_slli_epi16(T, 8)));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
   #ifdef BOTAN_SIMD_USE_VSX
         return SIMD_4x32(vec_revb(m_simd));
   #else
         const __vector unsigned char rev[1] = {
            {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12},
         };
         return SIMD_4x32(vec_perm(m_simd, m_simd, rev[0]));
   #endif
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(m_simd))));
#endif
      }
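
      // Note (illustrative): bswap() is the primitive the endian-aware
      // loads/stores are built from; on a little-endian target, load_be(in)
      // is equivalent to load_le(in).bswap(), and store_be(out) to
      // bswap().store_le(out), as the SSE2 branches above show directly.
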
      // Shift the four 32-bit elements left by I positions, filling with zeros
      template <size_t I>
         requires(I > 0 && I <= 3)
      SIMD_4x32 shift_elems_left() const noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_slli_si128(raw(), 4 * I));
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vextq_u32(vdupq_n_u32(0), raw(), 4 - I));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         const __vector unsigned int zero = vec_splat_u32(0);

         const __vector unsigned char shuf[3] = {
            {16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
            {16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7},
            {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3},
         };

         return SIMD_4x32(vec_perm(raw(), zero, shuf[I - 1]));
#endif
      }

      // Shift the four 32-bit elements right by I positions, filling with zeros
      template <size_t I>
         requires(I > 0 && I <= 3)
      SIMD_4x32 shift_elems_right() const noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         return SIMD_4x32(_mm_srli_si128(raw(), 4 * I));
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vextq_u32(raw(), vdupq_n_u32(0), I));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         const __vector unsigned int zero = vec_splat_u32(0);

         const __vector unsigned char shuf[3] = {
            {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
            {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
            {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
         };

         return SIMD_4x32(vec_perm(raw(), zero, shuf[I - 1]));
#endif
      }

      // 4x4 transpose of the elements of B0..B3, in place
      static void transpose(SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) noexcept {
#if defined(BOTAN_SIMD_USE_SSE2)
         const __m128i T0 = _mm_unpacklo_epi32(B0.m_simd, B1.m_simd);
         const __m128i T1 = _mm_unpacklo_epi32(B2.m_simd, B3.m_simd);
         const __m128i T2 = _mm_unpackhi_epi32(B0.m_simd, B1.m_simd);
         const __m128i T3 = _mm_unpackhi_epi32(B2.m_simd, B3.m_simd);

         B0.m_simd = _mm_unpacklo_epi64(T0, T1);
         B1.m_simd = _mm_unpackhi_epi64(T0, T1);
         B2.m_simd = _mm_unpacklo_epi64(T2, T3);
         B3.m_simd = _mm_unpackhi_epi64(T2, T3);
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
         const __vector unsigned int T0 = vec_mergeh(B0.m_simd, B2.m_simd);
         const __vector unsigned int T1 = vec_mergeh(B1.m_simd, B3.m_simd);
         const __vector unsigned int T2 = vec_mergel(B0.m_simd, B2.m_simd);
         const __vector unsigned int T3 = vec_mergel(B1.m_simd, B3.m_simd);

         B0.m_simd = vec_mergeh(T0, T1);
         B1.m_simd = vec_mergel(T0, T1);
         B2.m_simd = vec_mergeh(T2, T3);
         B3.m_simd = vec_mergel(T2, T3);
#elif defined(BOTAN_SIMD_USE_NEON) && defined(BOTAN_TARGET_ARCH_IS_ARM32)
         const uint32x4x2_t T0 = vzipq_u32(B0.m_simd, B2.m_simd);
         const uint32x4x2_t T1 = vzipq_u32(B1.m_simd, B3.m_simd);
         const uint32x4x2_t O0 = vzipq_u32(T0.val[0], T1.val[0]);
         const uint32x4x2_t O1 = vzipq_u32(T0.val[1], T1.val[1]);

         B0.m_simd = O0.val[0];
         B1.m_simd = O0.val[1];
         B2.m_simd = O1.val[0];
         B3.m_simd = O1.val[1];
#elif defined(BOTAN_SIMD_USE_NEON) && defined(BOTAN_TARGET_ARCH_IS_ARM64)
         const uint32x4_t T0 = vzip1q_u32(B0.m_simd, B2.m_simd);
         const uint32x4_t T2 = vzip2q_u32(B0.m_simd, B2.m_simd);
         const uint32x4_t T1 = vzip1q_u32(B1.m_simd, B3.m_simd);
         const uint32x4_t T3 = vzip2q_u32(B1.m_simd, B3.m_simd);

         B0.m_simd = vzip1q_u32(T0, T1);
         B1.m_simd = vzip2q_u32(T0, T1);
         B2.m_simd = vzip1q_u32(T2, T3);
         B3.m_simd = vzip2q_u32(T2, T3);
#endif
      }
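
      // Worked example (illustrative): treating each SIMD_4x32 as a matrix
      // row, transpose() swaps rows and columns:
      //
      //    B0 = { 0,  1,  2,  3}         B0 = {0, 4,  8, 12}
      //    B1 = { 4,  5,  6,  7}   ==>   B1 = {1, 5,  9, 13}
      //    B2 = { 8,  9, 10, 11}         B2 = {2, 6, 10, 14}
      //    B3 = {12, 13, 14, 15}         B3 = {3, 7, 11, 15}
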
      // Bitwise select: for each bit, take the bit of a where mask is set,
      // else the bit of b
      static SIMD_4x32 choose(const SIMD_4x32& mask, const SIMD_4x32& a, const SIMD_4x32& b) noexcept {
#if defined(BOTAN_SIMD_USE_ALTIVEC)
         return SIMD_4x32(vec_sel(b.raw(), a.raw(), mask.raw()));
#elif defined(BOTAN_SIMD_USE_NEON)
         return SIMD_4x32(vbslq_u32(mask.raw(), a.raw(), b.raw()));
#else
         // (mask & a) ^ (~mask & b)
         return (mask & a) ^ mask.andc(b);
#endif
      }

      native_simd_type raw() const noexcept { return m_simd; }

      explicit SIMD_4x32(native_simd_type x) noexcept : m_simd(x) {}

   private:
      native_simd_type m_simd;
};
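
// Illustrative usage of SIMD_4x32 (not part of the original header):
//
//    uint8_t block[16] = {0};
//    SIMD_4x32 v = SIMD_4x32::load_le(block);
//    v += SIMD_4x32::splat(0x9E3779B9);   // element-wise addition
//    v ^= v.rotl<7>();                    // rotate and XOR, as in ARX ciphers
//    v.store_le(block);
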
template <size_t R>
SIMD_4x32 rotl(SIMD_4x32 input) {
   return input.rotl<R>();
}

template <size_t R>
SIMD_4x32 rotr(SIMD_4x32 input) {
   return input.rotr<R>();
}

template <size_t S>
SIMD_4x32 shl(SIMD_4x32 input) {
   return input.shl<S>();
}

}  // namespace Botan

#endif