   // Rotate each of the eight 32-bit lanes left by a compile-time constant
   template <size_t ROT>
   SIMD_8x32 rotl() const noexcept
      requires(ROT > 0 && ROT < 32)
   {
#if defined(__AVX512VL__)
      // AVX-512VL provides a native 32-bit rotate
      return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT));
#else
      if constexpr(ROT == 8) {
         // A rotation by 8 moves whole bytes, so a single byte shuffle suffices
         const __m256i shuf_rotl_8 =
            _mm256_set_epi64x(0x0e0d0c0f'0a09080b, 0x06050407'02010003, 0x0e0d0c0f'0a09080b, 0x06050407'02010003);

         return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8));
      } else if constexpr(ROT == 16) {
         // Likewise a rotation by 16 just swaps the 16-bit halves of each lane
         const __m256i shuf_rotl_16 =
            _mm256_set_epi64x(0x0d0c0f0e'09080b0a, 0x05040706'01000302, 0x0d0c0f0e'09080b0a, 0x05040706'01000302);

         return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16));
      } else {
         // Generic case: OR together the left and right shifts
         return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)),
                                          _mm256_srli_epi32(m_avx2, static_cast<int>(32 - ROT))));
      }
#endif
   }
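
   // A possible companion (a sketch, not shown in this excerpt): rotate-right
   // follows directly from rotl, since rotating right by ROT bits equals
   // rotating left by 32 - ROT bits.
   template <size_t ROT>
   SIMD_8x32 rotr() const noexcept
      requires(ROT > 0 && ROT < 32)
   {
      return this->rotl<32 - ROT>();
   }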

   void operator+=(const SIMD_8x32& other) { m_avx2 = _mm256_add_epi32(m_avx2, other.m_avx2); }

   void operator-=(const SIMD_8x32& other) { m_avx2 = _mm256_sub_epi32(m_avx2, other.m_avx2); }

   void operator^=(const SIMD_8x32& other) { m_avx2 = _mm256_xor_si256(m_avx2, other.m_avx2); }

   void operator^=(uint32_t other) { *this ^= SIMD_8x32::splat(other); }

   void operator|=(const SIMD_8x32& other) { m_avx2 = _mm256_or_si256(m_avx2, other.m_avx2); }

   void operator&=(const SIMD_8x32& other) { m_avx2 = _mm256_and_si256(m_avx2, other.m_avx2); }
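
   // The non-mutating binary operators are not shown in this excerpt; a
   // minimal sketch of how they could be layered on the compound forms:
   SIMD_8x32 operator+(const SIMD_8x32& other) const {
      SIMD_8x32 ret(*this);
      ret += other;
      return ret;
   }

   SIMD_8x32 operator^(const SIMD_8x32& other) const {
      SIMD_8x32 ret(*this);
      ret ^= other;
      return ret;
   }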

      // Shift each 32-bit lane left by the compile-time amount SHIFT
      return SIMD_8x32(_mm256_slli_epi32(m_avx2, SHIFT));

      // Logical right shift of each 32-bit lane
      return SIMD_8x32(_mm256_srli_epi32(m_avx2, SHIFT));

      // Bitwise complement, via XOR against all-ones
      return SIMD_8x32(_mm256_xor_si256(m_avx2, _mm256_set1_epi32(0xFFFFFFFF)));

      // andc: beware the operand order of _mm256_andnot_si256(a, b), which
      // computes (~a) & b; the complement applies to *this, not to other
      return SIMD_8x32(_mm256_andnot_si256(m_avx2, other.m_avx2));
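
      // Usage sketch (hypothetical values): since andc complements the
      // receiver, SIMD_8x32::splat(0xF0F0F0F0).andc(x) evaluates to
      // 0x0F0F0F0F & x, keeping only the low nibble of every byte of x.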

      const uint8_t BSWAP_MASK[32] = {3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,  8,  15, 14, 13, 12,
                                      19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};

      const __m256i bswap = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(BSWAP_MASK));

      const __m256i output = _mm256_shuffle_epi8(m_avx2, bswap);
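
      // Note that _mm256_shuffle_epi8 shuffles each 128-bit lane
      // independently, using only the low four bits of each control byte, so
      // the entries 19, 18, 17, 16, ... reduce to 3, 2, 1, 0, ... within the
      // upper lane. Net effect per 32-bit lane: 0x00112233 -> 0x33221100.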

      const __m256i T0 = _mm256_unpacklo_epi32(B0.m_avx2, B1.m_avx2);
      const __m256i T1 = _mm256_unpacklo_epi32(B2.m_avx2, B3.m_avx2);
      const __m256i T2 = _mm256_unpackhi_epi32(B0.m_avx2, B1.m_avx2);
      const __m256i T3 = _mm256_unpackhi_epi32(B2.m_avx2, B3.m_avx2);

      B0.m_avx2 = _mm256_unpacklo_epi64(T0, T1);
      B1.m_avx2 = _mm256_unpackhi_epi64(T0, T1);
      B2.m_avx2 = _mm256_unpacklo_epi64(T2, T3);
      B3.m_avx2 = _mm256_unpackhi_epi64(T2, T3);
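
      // This is the classic unpack-based 4x4 transpose. Because the AVX2
      // unpack instructions operate within each 128-bit lane, it transposes
      // the low and high halves independently: afterwards B0 holds
      // [B0[0] B1[0] B2[0] B3[0] | B0[4] B1[4] B2[4] B3[4]], and so on.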

      transpose(B0, B1, B2, B3);
      transpose(B4, B5, B6, B7);
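
      // The remainder of the 8x8 transpose is not shown in this excerpt;
      // presumably it exchanges 128-bit halves between the Bi/B(i+4) pairs,
      // e.g. with _mm256_permute2x128_si256 as at the end of this excerpt.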

#if defined(__AVX512VL__)
      return _mm256_ternarylogic_epi32(mask.handle(), a.handle(), b.handle(), 0xca);
#else
      return (mask & a) ^ mask.andc(b);
#endif
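
      // The 0xca immediate is the truth table for bitwise select: with the
      // inputs indexing result bits as (mask << 2) | (a << 1) | b, the byte
      // 0xca = 0b11001010 yields a where mask is set and b where it is clear,
      // i.e. choose(mask, a, b) = (mask & a) | (~mask & b). The AND/XOR
      // fallback computes the same value, since the two terms are disjoint.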

#if defined(__AVX512VL__)
      return _mm256_ternarylogic_epi32(x.handle(), y.handle(), z.handle(), 0xe8);
#else
      return SIMD_8x32::choose(x ^ y, z, y);
#endif
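
      // 0xe8 = 0b11101000 is the majority truth table (set exactly where at
      // least two of the three inputs are set). The fallback relies on the
      // identity majority(x, y, z) = choose(x ^ y, z, y): where x and y agree
      // the majority is y, and where they differ it is decided by z.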

   static void reset_registers() noexcept { _mm256_zeroupper(); }

   static void zero_registers() noexcept { _mm256_zeroall(); }
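
   // vzeroupper avoids the SSE/AVX transition penalty by clearing the upper
   // halves of the YMM registers; vzeroall clears the registers entirely,
   // e.g. to scrub key-dependent data from the register file.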

   SIMD_8x32 T0 = _mm256_permute2x128_si256(A.handle(), B.handle(), 0 + (2 << 4));
   SIMD_8x32 T1 = _mm256_permute2x128_si256(A.handle(), B.handle(), 1 + (3 << 4));
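
   // Each nibble of the control byte selects a 128-bit source half
   // (0 = A.low, 1 = A.high, 2 = B.low, 3 = B.high); the low nibble feeds the
   // result's low half and the high nibble its high half. Here
   // T0 = [A.low | B.low] and T1 = [A.high | B.high].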