requires(ROT > 0 && ROT < 32)
#if defined(__AVX512VL__)
   // AVX-512VL provides a rotate instruction directly
   return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT));
#else
   // Rotations by a multiple of 8 bits are byte permutations, so a single
   // byte shuffle (vpshufb) suffices
   if constexpr(ROT == 8) {
      const __m256i shuf_rotl_8 =
         _mm256_set_epi64x(0x0e0d0c0f'0a09080b, 0x06050407'02010003, 0x0e0d0c0f'0a09080b, 0x06050407'02010003);

      return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8));
   } else if constexpr(ROT == 16) {
      const __m256i shuf_rotl_16 =
         _mm256_set_epi64x(0x0d0c0f0e'09080b0a, 0x05040706'01000302, 0x0d0c0f0e'09080b0a, 0x05040706'01000302);

      return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16));
   } else if constexpr(ROT == 24) {
      const __m256i shuf_rotl_24 =
         _mm256_set_epi64x(0x0c0f0e0d'080b0a09, 0x04070605'00030201, 0x0c0f0e0d'080b0a09, 0x04070605'00030201);

      return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_24));
   } else {
      // Generic case: combine a left shift and a right shift
      return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)),
                                       _mm256_srli_epi32(m_avx2, static_cast<int>(32 - ROT))));
   }
#endif
void operator+=(const SIMD_8x32& other) { m_avx2 = _mm256_add_epi32(m_avx2, other.m_avx2); }

void operator-=(const SIMD_8x32& other) { m_avx2 = _mm256_sub_epi32(m_avx2, other.m_avx2); }

void operator^=(const SIMD_8x32& other) { m_avx2 = _mm256_xor_si256(m_avx2, other.m_avx2); }

void operator^=(uint32_t other) { *this ^= SIMD_8x32::splat(other); }

void operator|=(const SIMD_8x32& other) { m_avx2 = _mm256_or_si256(m_avx2, other.m_avx2); }

void operator&=(const SIMD_8x32& other) { m_avx2 = _mm256_and_si256(m_avx2, other.m_avx2); }
// Shift each 32-bit lane left by SHIFT bits
return SIMD_8x32(_mm256_slli_epi32(m_avx2, SHIFT));

// Logical right shift of each 32-bit lane by SHIFT bits
return SIMD_8x32(_mm256_srli_epi32(m_avx2, SHIFT));

// Bitwise complement, via XOR against all-ones
return SIMD_8x32(_mm256_xor_si256(m_avx2, _mm256_set1_epi32(0xFFFFFFFF)));

// And-not: computes (~m_avx2) & other
return SIMD_8x32(_mm256_andnot_si256(m_avx2, other.m_avx2));
const uint8_t BSWAP_MASK[32] = {3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,  8,  15, 14, 13, 12,
                                19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};

const __m256i bswap = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(BSWAP_MASK));

// Reverse the bytes within each 32-bit word
const __m256i output = _mm256_shuffle_epi8(m_avx2, bswap);

return SIMD_8x32(output);
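
// 4x4 transpose of 32-bit words, carried out independently within each
// 128-bit lane (the unpack intrinsics do not cross the 128-bit boundary)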
const __m256i T0 = _mm256_unpacklo_epi32(B0.m_avx2, B1.m_avx2);
const __m256i T1 = _mm256_unpacklo_epi32(B2.m_avx2, B3.m_avx2);
const __m256i T2 = _mm256_unpackhi_epi32(B0.m_avx2, B1.m_avx2);
const __m256i T3 = _mm256_unpackhi_epi32(B2.m_avx2, B3.m_avx2);

B0.m_avx2 = _mm256_unpacklo_epi64(T0, T1);
B1.m_avx2 = _mm256_unpackhi_epi64(T0, T1);
B2.m_avx2 = _mm256_unpacklo_epi64(T2, T3);
B3.m_avx2 = _mm256_unpackhi_epi64(T2, T3);
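
// The 8-register transpose begins by applying the 4x4 transpose above to
// each group of four registers; the two groups are then combined by
// exchanging 128-bit halves between register pairs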
transpose(B0, B1, B2, B3);
transpose(B4, B5, B6, B7);
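
// choose: per-bit select, returning a where the mask bit is set and b
// otherwise. Immediate 0xca is the vpternlogd truth table for
// (mask & a) | (~mask & b).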
#if defined(__AVX512VL__)
   return _mm256_ternarylogic_epi32(mask.raw(), a.raw(), b.raw(), 0xca);
#else
   return (mask & a) ^ mask.andc(b);
#endif
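
// majority: each result bit is set when at least two of the three input
// bits are set. Immediate 0xe8 is the corresponding vpternlogd truth
// table; the fallback expresses it as choose(x ^ y, z, y).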
#if defined(__AVX512VL__)
   return _mm256_ternarylogic_epi32(x.raw(), y.raw(), z.raw(), 0xe8);
#else
   return SIMD_8x32::choose(x ^ y, z, y);
#endif
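
// reset_registers (vzeroupper) clears the upper halves of the YMM registers
// to avoid AVX/SSE transition penalties; zero_registers (vzeroall) clears
// them entirely, which also scrubs any remaining data from the registers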
static void reset_registers() noexcept { _mm256_zeroupper(); }

static void zero_registers() noexcept { _mm256_zeroall(); }
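
// Exchange 128-bit halves between A and B: immediate 0x20 (0 + (2 << 4))
// gathers the low halves of A and B, while 0x31 (1 + (3 << 4)) gathers
// the high halves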
SIMD_8x32 T0 = _mm256_permute2x128_si256(A.raw(), B.raw(), 0 + (2 << 4));
SIMD_8x32 T1 = _mm256_permute2x128_si256(A.raw(), B.raw(), 1 + (3 << 4));