void operator+=(const SIMD_16x32& other) { m_avx512 = _mm512_add_epi32(m_avx512, other.m_avx512); }

void operator-=(const SIMD_16x32& other) { m_avx512 = _mm512_sub_epi32(m_avx512, other.m_avx512); }

void operator^=(const SIMD_16x32& other) { m_avx512 = _mm512_xor_si512(m_avx512, other.m_avx512); }

void operator^=(uint32_t other) { *this ^= SIMD_16x32::splat(other); }

void operator|=(const SIMD_16x32& other) { m_avx512 = _mm512_or_si512(m_avx512, other.m_avx512); }

void operator&=(const SIMD_16x32& other) { m_avx512 = _mm512_and_si512(m_avx512, other.m_avx512); }
template <int SHIFT>
SIMD_16x32 shl() const { return SIMD_16x32(_mm512_slli_epi32(m_avx512, SHIFT)); }

template <int SHIFT>
SIMD_16x32 shr() const { return SIMD_16x32(_mm512_srli_epi32(m_avx512, SHIFT)); }
SIMD_16x32 operator~() const { return SIMD_16x32(_mm512_xor_si512(m_avx512, _mm512_set1_epi32(0xFFFFFFFF))); }
// Computes (~this) & other in a single andnot instruction
SIMD_16x32 andc(const SIMD_16x32& other) const { return SIMD_16x32(_mm512_andnot_si512(m_avx512, other.m_avx512)); }
// Arbitrary three-input boolean function, with the 8-bit truth table
// TBL passed directly to vpternlogd
template <uint8_t TBL>
static SIMD_16x32 ternary_fn(const SIMD_16x32& a, const SIMD_16x32& b, const SIMD_16x32& c) {
   return SIMD_16x32(_mm512_ternarylogic_epi32(a.raw(), b.raw(), c.raw(), TBL));
}
// Reverse the byte order of each 32-bit word
SIMD_16x32 bswap() const {
   const uint8_t BSWAP_MASK[64] = {3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,  8,  15, 14, 13, 12,
                                   19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28,
                                   35, 34, 33, 32, 39, 38, 37, 36, 43, 42, 41, 40, 47, 46, 45, 44,
                                   51, 50, 49, 48, 55, 54, 53, 52, 59, 58, 57, 56, 63, 62, 61, 60};
   // vpshufb permutes bytes within each 128-bit lane (only the low four
   // mask bits select within a lane), byte-reversing every 32-bit word
   const __m512i bswap = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(BSWAP_MASK));
   const __m512i output = _mm512_shuffle_epi8(m_avx512, bswap);
   return SIMD_16x32(output);
}
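
// Transpose four registers as 4x4 blocks of 32-bit words, one block per
// 128-bit lane: interleave 32-bit words, then 64-bit pairs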
static void transpose(SIMD_16x32& B0, SIMD_16x32& B1, SIMD_16x32& B2, SIMD_16x32& B3) {
   const __m512i T0 = _mm512_unpacklo_epi32(B0.m_avx512, B1.m_avx512);
   const __m512i T1 = _mm512_unpacklo_epi32(B2.m_avx512, B3.m_avx512);
   const __m512i T2 = _mm512_unpackhi_epi32(B0.m_avx512, B1.m_avx512);
   const __m512i T3 = _mm512_unpackhi_epi32(B2.m_avx512, B3.m_avx512);
   B0.m_avx512 = _mm512_unpacklo_epi64(T0, T1);
   B1.m_avx512 = _mm512_unpackhi_epi64(T0, T1);
   B2.m_avx512 = _mm512_unpacklo_epi64(T2, T3);
   B3.m_avx512 = _mm512_unpackhi_epi64(T2, T3);
}
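
// Transpose a 16x16 matrix of 32-bit words spread across B0..BF in four
// passes: interleave 32-bit words, interleave 64-bit pairs, then two
// rounds of 128-bit lane shuffles to move rows across registers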
static void transpose(SIMD_16x32& B0, SIMD_16x32& B1, SIMD_16x32& B2, SIMD_16x32& B3,
                      SIMD_16x32& B4, SIMD_16x32& B5, SIMD_16x32& B6, SIMD_16x32& B7,
                      SIMD_16x32& B8, SIMD_16x32& B9, SIMD_16x32& BA, SIMD_16x32& BB,
                      SIMD_16x32& BC, SIMD_16x32& BD, SIMD_16x32& BE, SIMD_16x32& BF) {
   // Pass 1: interleave adjacent 32-bit words of each register pair
   auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());
   auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());
   auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());
   auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());
   auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());
   auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());
   auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());
   auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());
   auto t8 = _mm512_unpacklo_epi32(B8.raw(), B9.raw());
   auto t9 = _mm512_unpackhi_epi32(B8.raw(), B9.raw());
   auto ta = _mm512_unpacklo_epi32(BA.raw(), BB.raw());
   auto tb = _mm512_unpackhi_epi32(BA.raw(), BB.raw());
   auto tc = _mm512_unpacklo_epi32(BC.raw(), BD.raw());
   auto td = _mm512_unpackhi_epi32(BC.raw(), BD.raw());
   auto te = _mm512_unpacklo_epi32(BE.raw(), BF.raw());
   auto tf = _mm512_unpackhi_epi32(BE.raw(), BF.raw());
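
   // Pass 2: interleave 64-bit pairs, completing a 4x4 word transpose
   // within each 128-bit lane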
   auto r0 = _mm512_unpacklo_epi64(t0, t2);
   auto r1 = _mm512_unpackhi_epi64(t0, t2);
   auto r2 = _mm512_unpacklo_epi64(t1, t3);
   auto r3 = _mm512_unpackhi_epi64(t1, t3);
   auto r4 = _mm512_unpacklo_epi64(t4, t6);
   auto r5 = _mm512_unpackhi_epi64(t4, t6);
   auto r6 = _mm512_unpacklo_epi64(t5, t7);
   auto r7 = _mm512_unpackhi_epi64(t5, t7);
   auto r8 = _mm512_unpacklo_epi64(t8, ta);
   auto r9 = _mm512_unpackhi_epi64(t8, ta);
   auto ra = _mm512_unpacklo_epi64(t9, tb);
   auto rb = _mm512_unpackhi_epi64(t9, tb);
   auto rc = _mm512_unpacklo_epi64(tc, te);
   auto rd = _mm512_unpackhi_epi64(tc, te);
   auto re = _mm512_unpacklo_epi64(td, tf);
   auto rf = _mm512_unpackhi_epi64(td, tf);
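
   // Pass 3: shuffle 128-bit lanes across register pairs; selector 0x88
   // takes lanes 0 and 2 of each source, 0xdd takes lanes 1 and 3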
   t0 = _mm512_shuffle_i32x4(r0, r4, 0x88);
   t1 = _mm512_shuffle_i32x4(r1, r5, 0x88);
   t2 = _mm512_shuffle_i32x4(r2, r6, 0x88);
   t3 = _mm512_shuffle_i32x4(r3, r7, 0x88);
   t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd);
   t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd);
   t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd);
   t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd);
   t8 = _mm512_shuffle_i32x4(r8, rc, 0x88);
   t9 = _mm512_shuffle_i32x4(r9, rd, 0x88);
   ta = _mm512_shuffle_i32x4(ra, re, 0x88);
   tb = _mm512_shuffle_i32x4(rb, rf, 0x88);
   tc = _mm512_shuffle_i32x4(r8, rc, 0xdd);
   td = _mm512_shuffle_i32x4(r9, rd, 0xdd);
   te = _mm512_shuffle_i32x4(ra, re, 0xdd);
   tf = _mm512_shuffle_i32x4(rb, rf, 0xdd);
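
   // Pass 4: a second lane shuffle writes the fully transposed rows back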
   B0.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0x88);
   B1.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0x88);
   B2.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0x88);
   B3.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0x88);
   B4.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0x88);
   B5.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0x88);
   B6.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0x88);
   B7.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0x88);
   B8.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0xdd);
   B9.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0xdd);
   BA.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0xdd);
   BB.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0xdd);
   BC.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0xdd);
   BD.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0xdd);
   BE.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0xdd);
   BF.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0xdd);
}
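
// Bitwise select via vpternlogd: each result bit takes a where the
// corresponding mask bit is set, and b where it is clear (table 0xca)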
static SIMD_16x32 choose(const SIMD_16x32& mask, const SIMD_16x32& a, const SIMD_16x32& b) {
   return SIMD_16x32::ternary_fn<0xca>(mask, a, b);
}
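
// Bitwise majority via vpternlogd: each result bit is set iff it is set
// in at least two of x, y, z (table 0xe8)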
static SIMD_16x32 majority(const SIMD_16x32& x, const SIMD_16x32& y, const SIMD_16x32& z) {
   return SIMD_16x32::ternary_fn<0xe8>(x, y, z);
}