7#ifndef BOTAN_SIMD_AVX2_H_
8#define BOTAN_SIMD_AVX2_H_
10#include <botan/compiler.h>
11#include <botan/types.h>
12#include <botan/internal/isa_extn.h>
32 m_avx2 = _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(B));
43 uint32_t B7)
noexcept {
44 m_avx2 = _mm256_set_epi32(B7, B6, B5, B4, B3, B2, B1, B0);
48 explicit SIMD_8x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3)
noexcept {
49 m_avx2 = _mm256_set_epi32(B3, B2, B1, B0, B3, B2, B1, B0);
48 explicit SIMD_8x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3)
noexcept {
…}
57 return SIMD_8x32(_mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(in)));
62 return SIMD_8x32(_mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(in)));
67 return SIMD_8x32(_mm256_broadcastsi128_si256(_mm_loadu_si128(
reinterpret_cast<const __m128i*
>(in))));
72 return SIMD_8x32(_mm256_broadcastsi128_si256(_mm_loadu_si128(
reinterpret_cast<const __m128i*
>(in))));
79 void store_le(uint8_t out[])
const noexcept { _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(out), m_avx2); }
83 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(out), _mm256_extracti128_si256(
raw(), 0));
89 _mm256_loadu2_m128i(
reinterpret_cast<const __m128i*
>(in2),
reinterpret_cast<const __m128i*
>(in1)));
// NOTE(review): extraction-mangled fragment of load_be128(in1, in2)
// (see the declaration list below). It loads in1 into the low lane and
// in2 into the high lane; the big-endian byte-swap step that presumably
// follows is truncated out of this chunk — confirm against the full file
// before editing.
 95 _mm256_loadu2_m128i(
reinterpret_cast<const __m128i*
>(in2),
reinterpret_cast<const __m128i*
>(in1)))
100 void store_le128(uint32_t out1[], uint32_t out2[])
const noexcept {
101 _mm256_storeu2_m128i(
reinterpret_cast<__m128i*
>(out2),
reinterpret_cast<__m128i*
>(out1),
raw());
107 template <
size_t ROT>
109 requires(ROT > 0 && ROT < 32)
111#if defined(__AVX512VL__)
112 return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT));
114 if constexpr(ROT == 8) {
115 const __m256i shuf_rotl_8 =
116 _mm256_set_epi64x(0x0e0d0c0f'0a09080b, 0x06050407'02010003, 0x0e0d0c0f'0a09080b, 0x06050407'02010003);
118 return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8));
119 }
else if constexpr(ROT == 16) {
120 const __m256i shuf_rotl_16 =
121 _mm256_set_epi64x(0x0d0c0f0e'09080b0a, 0x05040706'01000302, 0x0d0c0f0e'09080b0a, 0x05040706'01000302);
123 return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16));
124 }
else if constexpr(ROT == 24) {
125 const __m256i shuf_rotl_24 =
126 _mm256_set_epi64x(0x0c0f0e0d'080b0a09, 0x04070605'00030201, 0x0c0f0e0d'080b0a09, 0x04070605'00030201);
128 return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_24));
130 return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2,
static_cast<int>(ROT)),
131 _mm256_srli_epi32(m_avx2,
static_cast<int>(32 - ROT))));
136 template <
size_t ROT>
138 return this->
rotl<32 - ROT>();
// NOTE(review): two truncated return statements, apparently the tails of
// sigma0() and sigma1() (declared in the list below). The rot1/rot2/rot3
// definitions they XOR together are missing from this chunk — likely
// SHA-2-style rotation terms, but confirm against the full file.
 145 return rot1 ^ rot2 ^ rot3;
 152 return rot1 ^ rot2 ^ rot3;
210 return SIMD_8x32(_mm256_slli_epi32(m_avx2, SHIFT));
215 return SIMD_8x32(_mm256_srli_epi32(m_avx2, SHIFT));
220 return SIMD_8x32(_mm256_xor_si256(m_avx2, _mm256_set1_epi32(0xFFFFFFFF)));
226 return SIMD_8x32(_mm256_andnot_si256(m_avx2, other.m_avx2));
// NOTE(review): extraction-mangled interior of bswap(). The table maps
// each 4-byte group to its reverse (3,2,1,0 / 7,6,5,4 / ...), i.e. it
// byte-swaps every 32-bit word via pshufb. The trailing return of
// `output` is truncated out of this chunk — confirm before editing.
 231 alignas(32)
const uint8_t BSWAP_TBL[32] = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
 232 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
 234 const __m256i
bswap = _mm256_load_si256(
reinterpret_cast<const __m256i*
>(BSWAP_TBL));
 236 const __m256i output = _mm256_shuffle_epi8(m_avx2,
bswap);
// NOTE(review): extraction-mangled interior of a byte-reversal helper
// (rev_words() or reverse() — identity unclear from this chunk). The
// table reverses all 16 bytes within EACH 128-bit lane (pshufb cannot
// cross lanes); a cross-lane permute, if any, plus the return statement
// are truncated out of this chunk — confirm against the full file.
 244 alignas(32)
const uint8_t REV_TBL[32] = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
 245 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
 247 const __m256i
bswap = _mm256_load_si256(
reinterpret_cast<const __m256i*
>(REV_TBL));
 248 const __m256i output = _mm256_shuffle_epi8(m_avx2,
bswap);
257 const __m256i T0 = _mm256_unpacklo_epi32(B0.m_avx2, B1.m_avx2);
258 const __m256i T1 = _mm256_unpacklo_epi32(B2.m_avx2, B3.m_avx2);
259 const __m256i T2 = _mm256_unpackhi_epi32(B0.m_avx2, B1.m_avx2);
260 const __m256i T3 = _mm256_unpackhi_epi32(B2.m_avx2, B3.m_avx2);
262 B0.m_avx2 = _mm256_unpacklo_epi64(T0, T1);
263 B1.m_avx2 = _mm256_unpackhi_epi64(T0, T1);
264 B2.m_avx2 = _mm256_unpacklo_epi64(T2, T3);
265 B3.m_avx2 = _mm256_unpackhi_epi64(T2, T3);
288#if defined(__AVX512VL__)
289 return _mm256_ternarylogic_epi32(mask.raw(), a.raw(), b.raw(), 0xca);
291 return (mask & a) ^ mask.andc(b);
// NOTE(review): truncated fragment of majority(x, y, z). 0xe8 is the
// ternary truth table of the 3-way majority function
// (x&y) | (x&z) | (y&z). The non-AVX512VL fallback branch and #endif
// are missing from this chunk — confirm before editing.
 297#if defined(__AVX512VL__)
 298 return _mm256_ternarylogic_epi32(x.raw(), y.raw(), z.raw(), 0xe8);
314 __m256i BOTAN_FN_ISA_AVX2
raw() const noexcept {
return m_avx2; }
// NOTE(review): truncated fragment, apparently from the 8-register
// transpose helper (see declaration list below). The permutes gather
// low lanes of A/B into T0 (selector 0 + (2 << 4)) and high lanes into
// T1 (selector 1 + (3 << 4)); the rest of the routine is missing from
// this chunk — confirm before editing.
 322 SIMD_8x32 T0 = _mm256_permute2x128_si256(A.
raw(), B.
raw(), 0 + (2 << 4));
 323 SIMD_8x32 T1 = _mm256_permute2x128_si256(A.
raw(), B.
raw(), 1 + (3 << 4));
// NOTE(review): three truncated free-function bodies that forward to
// the member rotl/rotr/shl templates. Their signatures are not visible
// in this chunk (the declaration list below shows generic rotl/rotr and
// a SIMD_4x32 shl, which may belong to other overloads) — do not
// reconstruct without consulting the full file.
 333 return input.
rotl<R>();
 338 return input.
rotr<R>();
 344 return input.
shl<S>();
SIMD_8x32 BOTAN_FN_ISA_AVX2 sigma0() const noexcept
BOTAN_FN_ISA_AVX2 SIMD_8x32 operator^(const SIMD_8x32 &other) const noexcept
__m256i BOTAN_FN_ISA_AVX2 raw() const noexcept
SIMD_8x32 & operator=(SIMD_8x32 &&other)=default
BOTAN_FN_ISA_AVX2 SIMD_8x32 shl() const noexcept
SIMD_8x32(const SIMD_8x32 &other)=default
BOTAN_FN_ISA_AVX2 SIMD_8x32 shr() const noexcept
SIMD_8x32 BOTAN_FN_ISA_AVX2 sigma1() const noexcept
BOTAN_FN_ISA_AVX2 SIMD_8x32(__m256i x) noexcept
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_le128(const uint8_t *in) noexcept
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_le128(const uint32_t *in) noexcept
static BOTAN_FN_ISA_AVX2 void reset_registers() noexcept
BOTAN_FN_ISA_AVX2 SIMD_8x32 rev_words() const noexcept
BOTAN_FN_ISA_AVX2 SIMD_8x32 rotr() const noexcept
BOTAN_FN_ISA_AVX2 void operator&=(const SIMD_8x32 &other)
BOTAN_FN_ISA_AVX2 SIMD_8x32 rotl() const noexcept
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_le128(const uint32_t in1[], const uint32_t in2[]) noexcept
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_le(const uint32_t *in) noexcept
BOTAN_FN_ISA_AVX2 SIMD_8x32 reverse() const noexcept
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_le(const uint8_t *in) noexcept
BOTAN_FN_ISA_AVX2 void operator^=(uint32_t other)
BOTAN_FN_ISA_AVX2 void store_le128(uint32_t out1[], uint32_t out2[]) const noexcept
BOTAN_FN_ISA_AVX2 void operator|=(const SIMD_8x32 &other)
SIMD_8x32(SIMD_8x32 &&other)=default
BOTAN_FN_ISA_AVX2 SIMD_8x32 operator~() const noexcept
BOTAN_FN_ISA_AVX2 void operator^=(const SIMD_8x32 &other)
BOTAN_FN_ISA_AVX2 void operator-=(const SIMD_8x32 &other)
BOTAN_FN_ISA_AVX2 SIMD_8x32(const uint32_t B[8]) noexcept
BOTAN_FN_ISA_AVX2 void store_le128(uint8_t out[]) const noexcept
static BOTAN_FN_ISA_AVX2 void transpose(SIMD_8x32 &B0, SIMD_8x32 &B1, SIMD_8x32 &B2, SIMD_8x32 &B3) noexcept
static BOTAN_FN_ISA_AVX2 void zero_registers() noexcept
static BOTAN_FN_ISA_AVX2 SIMD_8x32 majority(const SIMD_8x32 &x, const SIMD_8x32 &y, const SIMD_8x32 &z) noexcept
BOTAN_FN_ISA_AVX2 void store_be(uint8_t out[]) const noexcept
static BOTAN_FN_ISA_AVX2 SIMD_8x32 splat(uint32_t B) noexcept
static BOTAN_FN_ISA_AVX2 void transpose(SIMD_8x32 &B0, SIMD_8x32 &B1, SIMD_8x32 &B2, SIMD_8x32 &B3, SIMD_8x32 &B4, SIMD_8x32 &B5, SIMD_8x32 &B6, SIMD_8x32 &B7) noexcept
BOTAN_FN_ISA_AVX2 SIMD_8x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3) noexcept
static BOTAN_FN_ISA_AVX2 SIMD_8x32 choose(const SIMD_8x32 &mask, const SIMD_8x32 &a, const SIMD_8x32 &b) noexcept
BOTAN_FN_ISA_AVX2 SIMD_8x32 andc(const SIMD_8x32 &other) const noexcept
BOTAN_FN_ISA_AVX2 SIMD_8x32 operator-(const SIMD_8x32 &other) const noexcept
SIMD_8x32 & operator=(const SIMD_8x32 &other)=default
BOTAN_FN_ISA_AVX2 void operator+=(const SIMD_8x32 &other)
static SIMD_8x32 BOTAN_FN_ISA_AVX2 byte_shuffle(const SIMD_8x32 &tbl, const SIMD_8x32 &idx)
BOTAN_FN_ISA_AVX2 void store_le(uint8_t out[]) const noexcept
BOTAN_FN_ISA_AVX2 SIMD_8x32 operator&(const SIMD_8x32 &other) const noexcept
BOTAN_FN_ISA_AVX2 SIMD_8x32 operator+(const SIMD_8x32 &other) const noexcept
BOTAN_FN_ISA_AVX2 SIMD_8x32 bswap() const noexcept
BOTAN_FN_ISA_AVX2 BOTAN_FORCE_INLINE SIMD_8x32() noexcept
BOTAN_FN_ISA_AVX2 SIMD_8x32 operator|(const SIMD_8x32 &other) const noexcept
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_be128(const uint8_t in1[], const uint8_t in2[]) noexcept
BOTAN_FN_ISA_AVX2 SIMD_8x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3, uint32_t B4, uint32_t B5, uint32_t B6, uint32_t B7) noexcept
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_be(const uint8_t *in) noexcept
#define BOTAN_FORCE_INLINE
constexpr T rotl(T input)
constexpr T rotr(T input)
SIMD_4x32 shl(SIMD_4x32 input)