void operator+=(const SIMD_16x32& other) { m_avx512 = _mm512_add_epi32(m_avx512, other.m_avx512); }

void operator-=(const SIMD_16x32& other) { m_avx512 = _mm512_sub_epi32(m_avx512, other.m_avx512); }

void operator^=(const SIMD_16x32& other) { m_avx512 = _mm512_xor_si512(m_avx512, other.m_avx512); }

void operator^=(uint32_t other) { *this ^= SIMD_16x32::splat(other); }

void operator|=(const SIMD_16x32& other) { m_avx512 = _mm512_or_si512(m_avx512, other.m_avx512); }

void operator&=(const SIMD_16x32& other) { m_avx512 = _mm512_and_si512(m_avx512, other.m_avx512); }
// Logical shifts; the count is a template parameter because the immediate
// forms of vpslld/vpsrld require a compile-time constant shift amount.
template <int SHIFT>
SIMD_16x32 shl() const noexcept {
   return SIMD_16x32(_mm512_slli_epi32(m_avx512, SHIFT));
}

template <int SHIFT>
SIMD_16x32 shr() const noexcept {
   return SIMD_16x32(_mm512_srli_epi32(m_avx512, SHIFT));
}
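/*
 * Usage sketch (illustrative, not from the original file): a 32-bit
 * rotate-left by 7, built from the two shifts plus operator|= above:
 *
 *   SIMD_16x32 r = x.shl<7>();
 *   r |= x.shr<25>();
 */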
SIMD_16x32 operator~() const {
   return SIMD_16x32(_mm512_xor_si512(m_avx512, _mm512_set1_epi32(0xFFFFFFFF)));
}
// Computes (~*this) & other; note vpandnd complements its first operand
SIMD_16x32 andc(const SIMD_16x32& other) const {
   return SIMD_16x32(_mm512_andnot_si512(m_avx512, other.m_avx512));
}
// Arbitrary three-input bitwise function, selected by the truth table TBL
template <uint8_t TBL>
static SIMD_16x32 ternary_fn(const SIMD_16x32& a, const SIMD_16x32& b, const SIMD_16x32& c) {
   return SIMD_16x32(_mm512_ternarylogic_epi32(a.raw(), b.raw(), c.raw(), TBL));
}
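/*
 * How a TBL immediate is derived (explanatory sketch): for every bit
 * position, vpternlog outputs bit ((a << 2) | (b << 1) | c) of TBL.
 * Enumerating the inputs abc from 111 down to 000:
 *
 *   select(a, b, c) = a ? b : c   gives  1,1,0,0,1,0,1,0  ->  TBL = 0xCA
 *   majority(a, b, c)             gives  1,1,1,0,1,0,0,0  ->  TBL = 0xE8
 *
 * These match the constants used by choose() and majority() at the end of
 * this section.
 */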
SIMD_16x32 bswap() const noexcept {
   const uint8_t BSWAP_MASK[64] = {
      3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,  8,  15, 14, 13, 12,
      19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28,
      35, 34, 33, 32, 39, 38, 37, 36, 43, 42, 41, 40, 47, 46, 45, 44,
      51, 50, 49, 48, 55, 54, 53, 52, 59, 58, 57, 56, 63, 62, 61, 60,
   };

   const __m512i bswap = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(BSWAP_MASK));
   const __m512i output = _mm512_shuffle_epi8(m_avx512, bswap);
   return SIMD_16x32(output);
}
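/*
 * Note on the mask (explanatory): vpshufb shuffles within each 128-bit lane,
 * using only the low four bits of each index byte. An index such as 19 (0x13)
 * therefore selects byte 3 of its own lane, which is byte 19 of the full
 * register, so the table above reads naturally as whole-register byte
 * positions while still reversing each 32-bit word in place.
 */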
static void transpose(SIMD_16x32& B0, SIMD_16x32& B1, SIMD_16x32& B2, SIMD_16x32& B3) noexcept {
   const __m512i T0 = _mm512_unpacklo_epi32(B0.m_avx512, B1.m_avx512);
   const __m512i T1 = _mm512_unpacklo_epi32(B2.m_avx512, B3.m_avx512);
   const __m512i T2 = _mm512_unpackhi_epi32(B0.m_avx512, B1.m_avx512);
   const __m512i T3 = _mm512_unpackhi_epi32(B2.m_avx512, B3.m_avx512);

   B0.m_avx512 = _mm512_unpacklo_epi64(T0, T1);
   B1.m_avx512 = _mm512_unpackhi_epi64(T0, T1);
   B2.m_avx512 = _mm512_unpacklo_epi64(T2, T3);
   B3.m_avx512 = _mm512_unpackhi_epi64(T2, T3);
}
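/*
 * Sketch of the unpack network (within each 128-bit lane; lanes never mix):
 *
 *   B0 = a0 a1 a2 a3    T0 = unpacklo32(B0, B1) = a0 b0 a1 b1
 *   B1 = b0 b1 b2 b3    T1 = unpacklo32(B2, B3) = c0 d0 c1 d1
 *   B2 = c0 c1 c2 c3    unpacklo64(T0, T1)      = a0 b0 c0 d0
 *   B3 = d0 d1 d2 d3    unpackhi64(T0, T1)      = a1 b1 c1 d1
 *
 * so each 4x4 block of 32-bit words is transposed in place, four blocks per
 * call (one per 128-bit lane).
 */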
static void transpose(SIMD_16x32& B0, SIMD_16x32& B1, SIMD_16x32& B2, SIMD_16x32& B3,
                      SIMD_16x32& B4, SIMD_16x32& B5, SIMD_16x32& B6, SIMD_16x32& B7,
                      SIMD_16x32& B8, SIMD_16x32& B9, SIMD_16x32& BA, SIMD_16x32& BB,
                      SIMD_16x32& BC, SIMD_16x32& BD, SIMD_16x32& BE, SIMD_16x32& BF) noexcept {
   // Stage 1: interleave 32-bit words of adjacent register pairs
   auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());
   auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());
   auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());
   auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());
   auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());
   auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());
   auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());
   auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());
   auto t8 = _mm512_unpacklo_epi32(B8.raw(), B9.raw());
   auto t9 = _mm512_unpackhi_epi32(B8.raw(), B9.raw());
   auto ta = _mm512_unpacklo_epi32(BA.raw(), BB.raw());
   auto tb = _mm512_unpackhi_epi32(BA.raw(), BB.raw());
   auto tc = _mm512_unpacklo_epi32(BC.raw(), BD.raw());
   auto td = _mm512_unpackhi_epi32(BC.raw(), BD.raw());
   auto te = _mm512_unpacklo_epi32(BE.raw(), BF.raw());
   auto tf = _mm512_unpackhi_epi32(BE.raw(), BF.raw());
   // Stage 2: interleave 64-bit pairs, completing a 4x4 transpose per 128-bit lane
   auto r0 = _mm512_unpacklo_epi64(t0, t2);
   auto r1 = _mm512_unpackhi_epi64(t0, t2);
   auto r2 = _mm512_unpacklo_epi64(t1, t3);
   auto r3 = _mm512_unpackhi_epi64(t1, t3);
   auto r4 = _mm512_unpacklo_epi64(t4, t6);
   auto r5 = _mm512_unpackhi_epi64(t4, t6);
   auto r6 = _mm512_unpacklo_epi64(t5, t7);
   auto r7 = _mm512_unpackhi_epi64(t5, t7);
   auto r8 = _mm512_unpacklo_epi64(t8, ta);
   auto r9 = _mm512_unpackhi_epi64(t8, ta);
   auto ra = _mm512_unpacklo_epi64(t9, tb);
   auto rb = _mm512_unpackhi_epi64(t9, tb);
   auto rc = _mm512_unpacklo_epi64(tc, te);
   auto rd = _mm512_unpackhi_epi64(tc, te);
   auto re = _mm512_unpacklo_epi64(td, tf);
   auto rf = _mm512_unpackhi_epi64(td, tf);
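   /*
    * About the selectors below (explanatory): _mm512_shuffle_i32x4(a, b, imm)
    * builds its result from whole 128-bit lanes, two chosen from a and two
    * from b via 2-bit fields of imm. 0x88 picks lanes 0 and 2 of each source
    * (the "even" lanes); 0xdd picks lanes 1 and 3 (the "odd" lanes). Two
    * rounds of this transpose the 4x4 grid of 128-bit lanes across registers.
    */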
   // Stage 3: first round of 128-bit lane shuffles
   t0 = _mm512_shuffle_i32x4(r0, r4, 0x88);
   t1 = _mm512_shuffle_i32x4(r1, r5, 0x88);
   t2 = _mm512_shuffle_i32x4(r2, r6, 0x88);
   t3 = _mm512_shuffle_i32x4(r3, r7, 0x88);
   t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd);
   t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd);
   t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd);
   t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd);
   t8 = _mm512_shuffle_i32x4(r8, rc, 0x88);
   t9 = _mm512_shuffle_i32x4(r9, rd, 0x88);
   ta = _mm512_shuffle_i32x4(ra, re, 0x88);
   tb = _mm512_shuffle_i32x4(rb, rf, 0x88);
   tc = _mm512_shuffle_i32x4(r8, rc, 0xdd);
   td = _mm512_shuffle_i32x4(r9, rd, 0xdd);
   te = _mm512_shuffle_i32x4(ra, re, 0xdd);
   tf = _mm512_shuffle_i32x4(rb, rf, 0xdd);
   // Stage 4: second round of lane shuffles writes back the transposed rows
   B0.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0x88);
   B1.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0x88);
   B2.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0x88);
   B3.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0x88);
   B4.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0x88);
   B5.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0x88);
   B6.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0x88);
   B7.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0x88);
   B8.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0xdd);
   B9.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0xdd);
   BA.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0xdd);
   BB.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0xdd);
   BC.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0xdd);
   BD.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0xdd);
   BE.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0xdd);
   BF.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0xdd);
}
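/*
 * Usage sketch (hypothetical caller; loads/stores elided since this file's
 * I/O helpers are not shown here): with row i of a row-major uint32_t
 * m[16][16] held in Bi, transpose(B0, ..., BF) leaves column i of m in Bi.
 */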
// Bitwise select: mask ? a : b for each bit
static SIMD_16x32 choose(const SIMD_16x32& mask, const SIMD_16x32& a, const SIMD_16x32& b) {
   return SIMD_16x32::ternary_fn<0xca>(mask, a, b);
}
// Bitwise 2-of-3 vote
static SIMD_16x32 majority(const SIMD_16x32& x, const SIMD_16x32& y, const SIMD_16x32& z) {
   return SIMD_16x32::ternary_fn<0xe8>(x, y, z);
}
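/*
 * Usage sketch (illustrative): these correspond to the Ch and Maj functions
 * of SHA-2, each computed over 16 parallel 32-bit lanes in one vpternlog:
 *
 *   SIMD_16x32 ch  = SIMD_16x32::choose(e, f, g);    // (e & f) ^ (~e & g)
 *   SIMD_16x32 maj = SIMD_16x32::majority(a, b, c);  // (a & b) | (a & c) | (b & c)
 *
 * versus four to five bitwise instructions for the scalar formulations.
 */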