template <size_t ROT>
SIMD_8x32 rotl() const noexcept
   requires(ROT > 0 && ROT < 32)
{
#if defined(__AVX512VL__)
   return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT));
#else
   if constexpr(ROT == 8) {
      const __m256i shuf_rotl_8 =
         _mm256_set_epi64x(0x0e0d0c0f'0a09080b, 0x06050407'02010003, 0x0e0d0c0f'0a09080b, 0x06050407'02010003);

      return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8));
   } else if constexpr(ROT == 16) {
      const __m256i shuf_rotl_16 =
         _mm256_set_epi64x(0x0d0c0f0e'09080b0a, 0x05040706'01000302, 0x0d0c0f0e'09080b0a, 0x05040706'01000302);

      return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16));
   } else if constexpr(ROT == 24) {
      const __m256i shuf_rotl_24 =
         _mm256_set_epi64x(0x0c0f0e0d'080b0a09, 0x04070605'00030201, 0x0c0f0e0d'080b0a09, 0x04070605'00030201);

      return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_24));
   } else {
      return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)),
                                       _mm256_srli_epi32(m_avx2, static_cast<int>(32 - ROT))));
   }
#endif
}
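// Note on the specializations above: _mm256_shuffle_epi8 permutes bytes
// within each 128-bit lane, so any rotation by a whole number of bytes
// (8, 16, or 24 bits) is a single shuffle instead of the generic
// shift/shift/or sequence. For example, the ROT == 8 mask maps the bytes
// of every 32-bit word as [3,0,1,2], i.e. 0x01234567 becomes 0x23456701.
// When AVX-512VL is available, _mm256_rol_epi32 handles every rotation
// count in one instruction, so no special cases are needed.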
void operator+=(const SIMD_8x32& other) { m_avx2 = _mm256_add_epi32(m_avx2, other.m_avx2); }

void operator-=(const SIMD_8x32& other) { m_avx2 = _mm256_sub_epi32(m_avx2, other.m_avx2); }

void operator^=(const SIMD_8x32& other) { m_avx2 = _mm256_xor_si256(m_avx2, other.m_avx2); }

void operator^=(uint32_t other) { *this ^= SIMD_8x32::splat(other); }

void operator|=(const SIMD_8x32& other) { m_avx2 = _mm256_or_si256(m_avx2, other.m_avx2); }

void operator&=(const SIMD_8x32& other) { m_avx2 = _mm256_and_si256(m_avx2, other.m_avx2); }
template <int SHIFT> SIMD_8x32 shl() const noexcept { return SIMD_8x32(_mm256_slli_epi32(m_avx2, SHIFT)); }

template <int SHIFT> SIMD_8x32 shr() const noexcept { return SIMD_8x32(_mm256_srli_epi32(m_avx2, SHIFT)); }
SIMD_8x32 operator~() const noexcept { return SIMD_8x32(_mm256_xor_si256(m_avx2, _mm256_set1_epi32(0xFFFFFFFF))); }

SIMD_8x32 andc(const SIMD_8x32& other) const noexcept { return SIMD_8x32(_mm256_andnot_si256(m_avx2, other.m_avx2)); }
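// Caution on operand order: _mm256_andnot_si256(x, y) computes (~x) & y,
// complementing its first argument. Hence andc(other) returns
// ~(*this) & other, which is exactly how choose() uses it below.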
SIMD_8x32 bswap() const noexcept {
   const uint8_t BSWAP_MASK[32] = {3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,  8,  15, 14, 13, 12,
                                   19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};

   const __m256i bswap = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(BSWAP_MASK));

   const __m256i output = _mm256_shuffle_epi8(m_avx2, bswap);

   return SIMD_8x32(output);
}
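// Each group of four mask entries (3,2,1,0, then 7,6,5,4, ...) reverses
// the bytes of one 32-bit word, so bswap() flips the endianness of all
// eight words at once. _mm256_shuffle_epi8 indexes within each 128-bit
// lane using only the low four bits of a mask byte, so entries 16..31
// select the same relative positions in the upper lane.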
static void transpose(SIMD_8x32& B0, SIMD_8x32& B1, SIMD_8x32& B2, SIMD_8x32& B3) noexcept {
   const __m256i T0 = _mm256_unpacklo_epi32(B0.m_avx2, B1.m_avx2);
   const __m256i T1 = _mm256_unpacklo_epi32(B2.m_avx2, B3.m_avx2);
   const __m256i T2 = _mm256_unpackhi_epi32(B0.m_avx2, B1.m_avx2);
   const __m256i T3 = _mm256_unpackhi_epi32(B2.m_avx2, B3.m_avx2);

   B0.m_avx2 = _mm256_unpacklo_epi64(T0, T1);
   B1.m_avx2 = _mm256_unpackhi_epi64(T0, T1);
   B2.m_avx2 = _mm256_unpacklo_epi64(T2, T3);
   B3.m_avx2 = _mm256_unpackhi_epi64(T2, T3);
}
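// The two unpack stages above implement the classic 4x4 transpose:
// unpacklo/unpackhi_epi32 interleave 32-bit words from register pairs,
// then the epi64 unpacks regroup them so B0..B3 hold what were the
// columns. Because AVX2 unpacks operate within each 128-bit lane, this
// transposes two independent 4x4 word blocks, one per lane: B0 ends up
// with element 0 of each input in its low lane and element 4 of each
// input in its high lane.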
   transpose(B0, B1, B2, B3);
   transpose(B4, B5, B6, B7);
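// The eight-register transpose is built from two independent 4x4
// transposes (above); what remains is exchanging 128-bit halves between
// the Bi/B(i+4) pairs, a cross-lane move the in-lane unpacks cannot do.
// The _mm256_permute2x128_si256 statements at the end of this section
// appear to be that exchange step.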
static SIMD_8x32 choose(const SIMD_8x32& mask, const SIMD_8x32& a, const SIMD_8x32& b) noexcept {
#if defined(__AVX512VL__)
   return _mm256_ternarylogic_epi32(mask.raw(), a.raw(), b.raw(), 0xca);
#else
   return (mask & a) ^ mask.andc(b);
#endif
}
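// 0xca is the truth table for bitwise select: each result bit is a where
// mask is set and b where it is clear (imm8 bit (m<<2 | a<<1 | b) gives
// the output). The fallback computes the same thing; (mask & a) and
// (~mask & b) are disjoint, so ^ is equivalent to |.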
static SIMD_8x32 majority(const SIMD_8x32& x, const SIMD_8x32& y, const SIMD_8x32& z) noexcept {
#if defined(__AVX512VL__)
   return _mm256_ternarylogic_epi32(x.raw(), y.raw(), z.raw(), 0xe8);
#else
   return SIMD_8x32::choose(x ^ y, z, y);
#endif
}
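// 0xe8 encodes the majority function: an output bit is set iff at least
// two of x, y, z have it set (the Maj function of SHA-2). The fallback is
// equivalent: when x and y agree, x ^ y is zero and choose() returns y,
// which is the majority; when they differ, z casts the deciding vote.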
static void reset_registers() noexcept { _mm256_zeroupper(); }

static void zero_registers() noexcept { _mm256_zeroall(); }
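// _mm256_zeroupper clears the upper 128 bits of the YMM registers to
// avoid the AVX/SSE transition penalty when returning to SSE code, while
// _mm256_zeroall wipes the registers entirely, which is also useful for
// scrubbing secret values out of vector state after a cryptographic
// operation.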
   SIMD_8x32 T0 = _mm256_permute2x128_si256(A.raw(), B.raw(), 0 + (2 << 4));
   SIMD_8x32 T1 = _mm256_permute2x128_si256(A.raw(), B.raw(), 1 + (3 << 4));
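// The immediate's low nibble picks the source of the result's low 128
// bits and the high nibble picks the high 128 bits (0/1 = low/high half
// of the first operand, 2/3 = low/high half of the second). So
// 0 + (2 << 4) combines the low halves of A and B into T0, and
// 1 + (3 << 4) combines their high halves into T1: the cross-lane step
// of the 8x32 transpose.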