8#ifndef BOTAN_SIMD_32_H_
9#define BOTAN_SIMD_32_H_
11#include <botan/types.h>
13#if defined(BOTAN_TARGET_SUPPORTS_SSE2)
14 #include <emmintrin.h>
15 #define BOTAN_SIMD_USE_SSE2
17#elif defined(BOTAN_TARGET_SUPPORTS_ALTIVEC)
18 #include <botan/internal/bswap.h>
19 #include <botan/internal/loadstor.h>
23 #define BOTAN_SIMD_USE_ALTIVEC
25 #define BOTAN_SIMD_USE_VSX
28#elif defined(BOTAN_TARGET_SUPPORTS_NEON)
29 #include <botan/internal/cpuid.h>
31 #define BOTAN_SIMD_USE_NEON
34 #error "No SIMD instruction set enabled"
37#if defined(BOTAN_SIMD_USE_SSE2)
38 #define BOTAN_SIMD_ISA "sse2"
39 #define BOTAN_VPERM_ISA "ssse3"
40 #define BOTAN_CLMUL_ISA "pclmul"
41#elif defined(BOTAN_SIMD_USE_NEON)
42 #if defined(BOTAN_TARGET_ARCH_IS_ARM64)
43 #define BOTAN_SIMD_ISA "+simd"
44 #define BOTAN_CLMUL_ISA "+crypto+aes"
46 #define BOTAN_SIMD_ISA "fpu=neon"
48 #define BOTAN_VPERM_ISA BOTAN_SIMD_ISA
49#elif defined(BOTAN_SIMD_USE_ALTIVEC)
50 #define BOTAN_SIMD_ISA "altivec"
51 #define BOTAN_VPERM_ISA "altivec"
52 #define BOTAN_CLMUL_ISA "crypto"
57#if defined(BOTAN_SIMD_USE_SSE2)
58using native_simd_type = __m128i;
59#elif defined(BOTAN_SIMD_USE_ALTIVEC)
60using native_simd_type = __vector
unsigned int;
61#elif defined(BOTAN_SIMD_USE_NEON)
62using native_simd_type = uint32x4_t;
90#if defined(BOTAN_SIMD_USE_SSE2)
91 m_simd = _mm_setzero_si128();
92#elif defined(BOTAN_SIMD_USE_ALTIVEC)
93 m_simd = vec_splat_u32(0);
94#elif defined(BOTAN_SIMD_USE_NEON)
95 m_simd = vdupq_n_u32(0);
103#if defined(BOTAN_SIMD_USE_SSE2)
104 m_simd = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(B));
105#elif defined(BOTAN_SIMD_USE_ALTIVEC)
106 __vector
unsigned int val = {B[0], B[1], B[2], B[3]};
108#elif defined(BOTAN_SIMD_USE_NEON)
109 m_simd = vld1q_u32(B);
116 SIMD_4x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3)
noexcept {
117#if defined(BOTAN_SIMD_USE_SSE2)
118 m_simd = _mm_set_epi32(B3, B2, B1, B0);
119#elif defined(BOTAN_SIMD_USE_ALTIVEC)
120 __vector
unsigned int val = {B0, B1, B2, B3};
122#elif defined(BOTAN_SIMD_USE_NEON)
124 const uint32_t B[4] = {B0, B1, B2, B3};
125 m_simd = vld1q_u32(B);
133#if defined(BOTAN_SIMD_USE_SSE2)
135#elif defined(BOTAN_SIMD_USE_NEON)
146#if defined(BOTAN_SIMD_USE_SSE2)
148#elif defined(BOTAN_SIMD_USE_NEON)
149 return SIMD_4x32(vreinterpretq_u32_u8(vdupq_n_u8(B)));
160#if defined(BOTAN_SIMD_USE_SSE2)
161 return SIMD_4x32(_mm_loadu_si128(
reinterpret_cast<const __m128i*
>(in)));
162#elif defined(BOTAN_SIMD_USE_ALTIVEC)
166#elif defined(BOTAN_SIMD_USE_NEON)
167 SIMD_4x32 l(vld1q_u32(
static_cast<const uint32_t*
>(in)));
176#if defined(BOTAN_SIMD_USE_SSE2)
179#elif defined(BOTAN_SIMD_USE_ALTIVEC)
184#elif defined(BOTAN_SIMD_USE_NEON)
185 SIMD_4x32 l(vld1q_u32(
static_cast<const uint32_t*
>(in)));
190 void store_le(uint32_t out[4])
const noexcept { this->
store_le(
reinterpret_cast<uint8_t*
>(out)); }
192 void store_be(uint32_t out[4])
const noexcept { this->
store_be(
reinterpret_cast<uint8_t*
>(out)); }
194 void store_le(uint64_t out[2])
const noexcept { this->
store_le(
reinterpret_cast<uint8_t*
>(out)); }
200#if defined(BOTAN_SIMD_USE_SSE2)
202 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(out),
raw());
204#elif defined(BOTAN_SIMD_USE_ALTIVEC)
207 __vector
unsigned int V;
214#elif defined(BOTAN_SIMD_USE_NEON)
216 vst1q_u8(out, vreinterpretq_u8_u32(m_simd));
218 vst1q_u8(out, vreinterpretq_u8_u32(
bswap().m_simd));
227#if defined(BOTAN_SIMD_USE_SSE2)
231#elif defined(BOTAN_SIMD_USE_ALTIVEC)
234 __vector
unsigned int V;
241#elif defined(BOTAN_SIMD_USE_NEON)
243 vst1q_u8(out, vreinterpretq_u8_u32(
bswap().m_simd));
245 vst1q_u8(out, vreinterpretq_u8_u32(m_simd));
254#if BOTAN_COMPILER_HAS_BUILTIN(__builtin_crypto_vshasigmaw) && defined(_ARCH_PWR8)
255 return SIMD_4x32(__builtin_crypto_vshasigmaw(
raw(), 1, 0));
260 return (rot1 ^ rot2 ^ rot3);
268#if BOTAN_COMPILER_HAS_BUILTIN(__builtin_crypto_vshasigmaw) && defined(_ARCH_PWR8)
269 return SIMD_4x32(__builtin_crypto_vshasigmaw(
raw(), 1, 0xF));
274 return (rot1 ^ rot2 ^ rot3);
281 template <
size_t ROT>
283 requires(ROT > 0 && ROT < 32)
285#if defined(BOTAN_SIMD_USE_SSE2)
287 return SIMD_4x32(_mm_or_si128(_mm_slli_epi32(m_simd,
static_cast<int>(ROT)),
288 _mm_srli_epi32(m_simd,
static_cast<int>(32 - ROT))));
290#elif defined(BOTAN_SIMD_USE_ALTIVEC)
292 const unsigned int r =
static_cast<unsigned int>(ROT);
293 __vector
unsigned int rot = {r, r, r, r};
296#elif defined(BOTAN_SIMD_USE_NEON)
298 #if defined(BOTAN_TARGET_ARCH_IS_ARM64)
300 if constexpr(ROT == 8) {
301 const uint8_t maskb[16] = {3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14};
302 const uint8x16_t mask = vld1q_u8(maskb);
303 return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(m_simd), mask)));
304 }
else if constexpr(ROT == 16) {
305 return SIMD_4x32(vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(m_simd))));
309 vorrq_u32(vshlq_n_u32(m_simd,
static_cast<int>(ROT)), vshrq_n_u32(m_simd,
static_cast<int>(32 - ROT))));
316 template <
size_t ROT>
318 return this->
rotl<32 - ROT>();
367#if defined(BOTAN_SIMD_USE_SSE2)
368 m_simd = _mm_add_epi32(m_simd, other.m_simd);
369#elif defined(BOTAN_SIMD_USE_ALTIVEC)
370 m_simd = vec_add(m_simd, other.m_simd);
371#elif defined(BOTAN_SIMD_USE_NEON)
372 m_simd = vaddq_u32(m_simd, other.m_simd);
377#if defined(BOTAN_SIMD_USE_SSE2)
378 m_simd = _mm_sub_epi32(m_simd, other.m_simd);
379#elif defined(BOTAN_SIMD_USE_ALTIVEC)
380 m_simd = vec_sub(m_simd, other.m_simd);
381#elif defined(BOTAN_SIMD_USE_NEON)
382 m_simd = vsubq_u32(m_simd, other.m_simd);
387#if defined(BOTAN_SIMD_USE_SSE2)
388 m_simd = _mm_xor_si128(m_simd, other.m_simd);
389#elif defined(BOTAN_SIMD_USE_ALTIVEC)
390 m_simd = vec_xor(m_simd, other.m_simd);
391#elif defined(BOTAN_SIMD_USE_NEON)
392 m_simd = veorq_u32(m_simd, other.m_simd);
399#if defined(BOTAN_SIMD_USE_SSE2)
400 m_simd = _mm_or_si128(m_simd, other.m_simd);
401#elif defined(BOTAN_SIMD_USE_ALTIVEC)
402 m_simd = vec_or(m_simd, other.m_simd);
403#elif defined(BOTAN_SIMD_USE_NEON)
404 m_simd = vorrq_u32(m_simd, other.m_simd);
409#if defined(BOTAN_SIMD_USE_SSE2)
410 m_simd = _mm_and_si128(m_simd, other.m_simd);
411#elif defined(BOTAN_SIMD_USE_ALTIVEC)
412 m_simd = vec_and(m_simd, other.m_simd);
413#elif defined(BOTAN_SIMD_USE_NEON)
414 m_simd = vandq_u32(m_simd, other.m_simd);
420 requires(SHIFT > 0 && SHIFT < 32)
422#if defined(BOTAN_SIMD_USE_SSE2)
423 return SIMD_4x32(_mm_slli_epi32(m_simd, SHIFT));
425#elif defined(BOTAN_SIMD_USE_ALTIVEC)
426 const unsigned int s =
static_cast<unsigned int>(SHIFT);
427 const __vector
unsigned int shifts = {s, s, s, s};
428 return SIMD_4x32(vec_sl(m_simd, shifts));
429#elif defined(BOTAN_SIMD_USE_NEON)
430 return SIMD_4x32(vshlq_n_u32(m_simd, SHIFT));
436#if defined(BOTAN_SIMD_USE_SSE2)
437 return SIMD_4x32(_mm_srli_epi32(m_simd, SHIFT));
439#elif defined(BOTAN_SIMD_USE_ALTIVEC)
440 const unsigned int s =
static_cast<unsigned int>(SHIFT);
441 const __vector
unsigned int shifts = {s, s, s, s};
442 return SIMD_4x32(vec_sr(m_simd, shifts));
443#elif defined(BOTAN_SIMD_USE_NEON)
444 return SIMD_4x32(vshrq_n_u32(m_simd, SHIFT));
449#if defined(BOTAN_SIMD_USE_SSE2)
450 return SIMD_4x32(_mm_xor_si128(m_simd, _mm_set1_epi32(0xFFFFFFFF)));
451#elif defined(BOTAN_SIMD_USE_ALTIVEC)
452 return SIMD_4x32(vec_nor(m_simd, m_simd));
453#elif defined(BOTAN_SIMD_USE_NEON)
460#if defined(BOTAN_SIMD_USE_SSE2)
461 return SIMD_4x32(_mm_andnot_si128(m_simd, other.m_simd));
462#elif defined(BOTAN_SIMD_USE_ALTIVEC)
467 return SIMD_4x32(vec_andc(other.m_simd, m_simd));
468#elif defined(BOTAN_SIMD_USE_NEON)
470 return SIMD_4x32(vbicq_u32(other.m_simd, m_simd));
478#if defined(BOTAN_SIMD_USE_SSE2)
481 T = _mm_shufflehi_epi16(
T, _MM_SHUFFLE(2, 3, 0, 1));
482 T = _mm_shufflelo_epi16(
T, _MM_SHUFFLE(2, 3, 0, 1));
483 return SIMD_4x32(_mm_or_si128(_mm_srli_epi16(
T, 8), _mm_slli_epi16(
T, 8)));
485#elif defined(BOTAN_SIMD_USE_ALTIVEC)
486 #ifdef BOTAN_SIMD_USE_VSX
489 const __vector
unsigned char rev[1] = {
490 {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12},
493 return SIMD_4x32(vec_perm(m_simd, m_simd, rev[0]));
496#elif defined(BOTAN_SIMD_USE_NEON)
497 return SIMD_4x32(vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(m_simd))));
505#if defined(BOTAN_SIMD_USE_SSE2)
507#elif defined(BOTAN_SIMD_USE_NEON)
508 return SIMD_4x32(vextq_u32(vdupq_n_u32(0),
raw(), 4 - I));
509#elif defined(BOTAN_SIMD_USE_ALTIVEC)
510 const __vector
unsigned int zero = vec_splat_u32(0);
512 const __vector
unsigned char shuf[3] = {
513 {16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
514 {16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7},
515 {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3},
526#if defined(BOTAN_SIMD_USE_SSE2)
528#elif defined(BOTAN_SIMD_USE_NEON)
530#elif defined(BOTAN_SIMD_USE_ALTIVEC)
531 const __vector
unsigned int zero = vec_splat_u32(0);
533 const __vector
unsigned char shuf[3] = {
534 {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
535 {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
536 {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27},
547#if defined(BOTAN_SIMD_USE_SSE2)
548 const __m128i T0 = _mm_unpacklo_epi32(B0.m_simd, B1.m_simd);
549 const __m128i T1 = _mm_unpacklo_epi32(B2.m_simd, B3.m_simd);
550 const __m128i T2 = _mm_unpackhi_epi32(B0.m_simd, B1.m_simd);
551 const __m128i T3 = _mm_unpackhi_epi32(B2.m_simd, B3.m_simd);
553 B0.m_simd = _mm_unpacklo_epi64(T0, T1);
554 B1.m_simd = _mm_unpackhi_epi64(T0, T1);
555 B2.m_simd = _mm_unpacklo_epi64(T2, T3);
556 B3.m_simd = _mm_unpackhi_epi64(T2, T3);
557#elif defined(BOTAN_SIMD_USE_ALTIVEC)
558 const __vector
unsigned int T0 = vec_mergeh(B0.m_simd, B2.m_simd);
559 const __vector
unsigned int T1 = vec_mergeh(B1.m_simd, B3.m_simd);
560 const __vector
unsigned int T2 = vec_mergel(B0.m_simd, B2.m_simd);
561 const __vector
unsigned int T3 = vec_mergel(B1.m_simd, B3.m_simd);
563 B0.m_simd = vec_mergeh(T0, T1);
564 B1.m_simd = vec_mergel(T0, T1);
565 B2.m_simd = vec_mergeh(T2, T3);
566 B3.m_simd = vec_mergel(T2, T3);
568#elif defined(BOTAN_SIMD_USE_NEON) && defined(BOTAN_TARGET_ARCH_IS_ARM32)
569 const uint32x4x2_t T0 = vzipq_u32(B0.m_simd, B2.m_simd);
570 const uint32x4x2_t T1 = vzipq_u32(B1.m_simd, B3.m_simd);
571 const uint32x4x2_t O0 = vzipq_u32(T0.val[0], T1.val[0]);
572 const uint32x4x2_t O1 = vzipq_u32(T0.val[1], T1.val[1]);
574 B0.m_simd = O0.val[0];
575 B1.m_simd = O0.val[1];
576 B2.m_simd = O1.val[0];
577 B3.m_simd = O1.val[1];
579#elif defined(BOTAN_SIMD_USE_NEON) && defined(BOTAN_TARGET_ARCH_IS_ARM64)
580 const uint32x4_t T0 = vzip1q_u32(B0.m_simd, B2.m_simd);
581 const uint32x4_t T2 = vzip2q_u32(B0.m_simd, B2.m_simd);
582 const uint32x4_t T1 = vzip1q_u32(B1.m_simd, B3.m_simd);
583 const uint32x4_t T3 = vzip2q_u32(B1.m_simd, B3.m_simd);
585 B0.m_simd = vzip1q_u32(T0, T1);
586 B1.m_simd = vzip2q_u32(T0, T1);
587 B2.m_simd = vzip1q_u32(T2, T3);
588 B3.m_simd = vzip2q_u32(T2, T3);
593#if defined(BOTAN_SIMD_USE_ALTIVEC)
594 return SIMD_4x32(vec_sel(b.raw(), a.raw(), mask.raw()));
595#elif defined(BOTAN_SIMD_USE_NEON)
596 return SIMD_4x32(vbslq_u32(mask.raw(), a.raw(), b.raw()));
598 return (mask & a) ^ mask.andc(b);
606 native_simd_type
raw() const noexcept {
return m_simd; }
   // Wrap a raw platform vector value; explicit so a native vector is never
   // implicitly converted into a SIMD_4x32.
   explicit SIMD_4x32(native_simd_type x) noexcept : m_simd(x) {}

   // The underlying platform-specific register value (__m128i on SSE2,
   // __vector unsigned int on AltiVec, uint32x4_t on NEON).
   native_simd_type m_simd;
616 return input.
rotl<R>();
621 return input.
rotr<R>();
627 return input.
shl<S>();
static bool is_little_endian()
static bool is_big_endian()
static SIMD_4x32 load_be(const void *in) noexcept
SIMD_4x32 andc(const SIMD_4x32 &other) const noexcept
void store_le(uint8_t out[]) const noexcept
SIMD_4x32(SIMD_4x32 &&other)=default
SIMD_4x32 & operator=(SIMD_4x32 &&other)=default
SIMD_4x32 operator|(const SIMD_4x32 &other) const noexcept
SIMD_4x32 operator^(const SIMD_4x32 &other) const noexcept
SIMD_4x32(native_simd_type x) noexcept
static void transpose(SIMD_4x32 &B0, SIMD_4x32 &B1, SIMD_4x32 &B2, SIMD_4x32 &B3) noexcept
SIMD_4x32 bswap() const noexcept
SIMD_4x32 operator+(const SIMD_4x32 &other) const noexcept
SIMD_4x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3) noexcept
native_simd_type raw() const noexcept
void store_le(uint32_t out[4]) const noexcept
static SIMD_4x32 load_le(const void *in) noexcept
SIMD_4x32 sigma1() const noexcept
void store_be(uint8_t out[]) const noexcept
SIMD_4x32(const SIMD_4x32 &other)=default
void operator^=(uint32_t other) noexcept
void operator^=(const SIMD_4x32 &other) noexcept
void operator+=(const SIMD_4x32 &other) noexcept
SIMD_4x32 operator~() const noexcept
void store_le(uint64_t out[2]) const noexcept
void operator|=(const SIMD_4x32 &other) noexcept
SIMD_4x32 shift_elems_left() const noexcept
void store_be(uint32_t out[4]) const noexcept
SIMD_4x32(const uint32_t B[4]) noexcept
SIMD_4x32 sigma0() const noexcept
SIMD_4x32 operator-(const SIMD_4x32 &other) const noexcept
SIMD_4x32 shr() const noexcept
static SIMD_4x32 splat_u8(uint8_t B) noexcept
SIMD_4x32 rotr() const noexcept
SIMD_4x32 shl() const noexcept
SIMD_4x32 shift_elems_right() const noexcept
SIMD_4x32 rotl() const noexcept
void operator&=(const SIMD_4x32 &other) noexcept
SIMD_4x32 operator&(const SIMD_4x32 &other) const noexcept
SIMD_4x32 & operator=(const SIMD_4x32 &other)=default
static SIMD_4x32 choose(const SIMD_4x32 &mask, const SIMD_4x32 &a, const SIMD_4x32 &b) noexcept
void operator-=(const SIMD_4x32 &other) noexcept
static SIMD_4x32 majority(const SIMD_4x32 &x, const SIMD_4x32 &y, const SIMD_4x32 &z) noexcept
static SIMD_4x32 splat(uint32_t B) noexcept
int(* final)(unsigned char *, CTX *)
constexpr T rotl(T input)
constexpr T rotr(T input)
constexpr uint32_t make_uint32(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3)
constexpr auto store_le(ParamTs &&... params)
SIMD_4x32 shl(SIMD_4x32 input)
constexpr auto load_le(ParamTs &&... params)
constexpr auto store_be(ParamTs &&... params)
constexpr auto load_be(ParamTs &&... params)