// Byte-index table used below as the control operand of _mm512_shuffle_epi8.
// The 64 indices are written in groups of four, each group in descending
// order (3,2,1,0, 7,6,5,4, ...), i.e. the byte order of every 32-bit word
// is reversed. _mm512_shuffle_epi8 only looks at the low 4 bits of each
// index and selects within the same 128-bit lane, so entries >= 16 behave
// like (index & 15) inside their own lane; no entry has bit 7 set, so no
// output byte is zeroed.
// NOTE(review): the closing of this initializer is not visible in this
// excerpt; the table is expected to contain exactly the 64 entries shown.
181 const uint8_t BSWAP_MASK[64] = {
182 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
183 21, 20, 27, 26, 25, 24, 31, 30, 29, 28, 35, 34, 33, 32, 39, 38, 37, 36, 43, 42, 41, 40,
184 47, 46, 45, 44, 51, 50, 49, 48, 55, 54, 53, 52, 59, 58, 57, 56, 63, 62, 61, 60,
// Load the 64-byte shuffle control from BSWAP_MASK. The unaligned load is
// used, so the table does not need 64-byte alignment.
187 const __m512i
bswap = _mm512_loadu_si512(
reinterpret_cast<const __m512i*
>(BSWAP_MASK));
// Permute the bytes of m_avx512 according to the mask: with the table
// above this reverses the bytes of each 32-bit element (endian swap).
// What is done with `output` afterwards is outside this excerpt.
189 const __m512i output = _mm512_shuffle_epi8(m_avx512,
bswap);
// In-place 16x16 transpose of 32-bit elements held in the sixteen vectors
// B0..BF (one row per vector, 16 x 32-bit lanes each). After the last stage
// Bj holds element j of every original row, in row order.
// NOTE(review): assumes raw() returns the same __m512i that is stored in
// m_avx512 -- confirm against the class definition, which is not visible.
//
// Stage 1: interleave 32-bit elements of adjacent row pairs. The unpack
// intrinsics operate independently inside each 128-bit lane: "lo" merges
// elements 0,1 of the lane, "hi" merges elements 2,3.
227 auto t0 = _mm512_unpacklo_epi32(B0.
raw(), B1.
raw());
228 auto t1 = _mm512_unpackhi_epi32(B0.
raw(), B1.
raw());
229 auto t2 = _mm512_unpacklo_epi32(B2.
raw(), B3.
raw());
230 auto t3 = _mm512_unpackhi_epi32(B2.
raw(), B3.
raw());
231 auto t4 = _mm512_unpacklo_epi32(B4.
raw(), B5.
raw());
232 auto t5 = _mm512_unpackhi_epi32(B4.
raw(), B5.
raw());
233 auto t6 = _mm512_unpacklo_epi32(B6.
raw(), B7.
raw());
234 auto t7 = _mm512_unpackhi_epi32(B6.
raw(), B7.
raw());
235 auto t8 = _mm512_unpacklo_epi32(B8.
raw(), B9.
raw());
236 auto t9 = _mm512_unpackhi_epi32(B8.
raw(), B9.
raw());
237 auto ta = _mm512_unpacklo_epi32(BA.
raw(), BB.
raw());
238 auto tb = _mm512_unpackhi_epi32(BA.
raw(), BB.
raw());
239 auto tc = _mm512_unpacklo_epi32(BC.
raw(), BD.
raw());
240 auto td = _mm512_unpackhi_epi32(BC.
raw(), BD.
raw());
241 auto te = _mm512_unpacklo_epi32(BE.
raw(), BF.
raw());
242 auto tf = _mm512_unpackhi_epi32(BE.
raw(), BF.
raw());
// Stage 2: interleave 64-bit pairs. Each 128-bit lane of r(4g+k) now holds
// column (4*lane + k) of the four rows in group g (rows 4g .. 4g+3), i.e.
// every lane contains one row of a transposed 4x4 block.
244 auto r0 = _mm512_unpacklo_epi64(t0, t2);
245 auto r1 = _mm512_unpackhi_epi64(t0, t2);
246 auto r2 = _mm512_unpacklo_epi64(t1, t3);
247 auto r3 = _mm512_unpackhi_epi64(t1, t3);
248 auto r4 = _mm512_unpacklo_epi64(t4, t6);
249 auto r5 = _mm512_unpackhi_epi64(t4, t6);
250 auto r6 = _mm512_unpacklo_epi64(t5, t7);
251 auto r7 = _mm512_unpackhi_epi64(t5, t7);
252 auto r8 = _mm512_unpacklo_epi64(t8, ta);
253 auto r9 = _mm512_unpackhi_epi64(t8, ta);
254 auto ra = _mm512_unpacklo_epi64(t9, tb);
255 auto rb = _mm512_unpackhi_epi64(t9, tb);
256 auto rc = _mm512_unpacklo_epi64(tc, te);
257 auto rd = _mm512_unpackhi_epi64(tc, te);
258 auto re = _mm512_unpacklo_epi64(td, tf);
259 auto rf = _mm512_unpackhi_epi64(td, tf);
// Stage 3: move whole 128-bit lanes between vectors (the t* names are
// reused). For _mm512_shuffle_i32x4(a, b, imm) the result lanes are
// {a[imm[1:0]], a[imm[3:2]], b[imm[5:4]], b[imm[7:6]]}, so
//   0x88 -> {a.lane0, a.lane2, b.lane0, b.lane2}  (even lanes)
//   0xdd -> {a.lane1, a.lane3, b.lane1, b.lane3}  (odd lanes)
// This merges row groups 0-3 with 4-7, and 8-11 with 12-15.
261 t0 = _mm512_shuffle_i32x4(r0, r4, 0x88);
262 t1 = _mm512_shuffle_i32x4(r1, r5, 0x88);
263 t2 = _mm512_shuffle_i32x4(r2, r6, 0x88);
264 t3 = _mm512_shuffle_i32x4(r3, r7, 0x88);
265 t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd);
266 t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd);
267 t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd);
268 t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd);
269 t8 = _mm512_shuffle_i32x4(r8, rc, 0x88);
270 t9 = _mm512_shuffle_i32x4(r9, rd, 0x88);
271 ta = _mm512_shuffle_i32x4(ra, re, 0x88);
272 tb = _mm512_shuffle_i32x4(rb, rf, 0x88);
273 tc = _mm512_shuffle_i32x4(r8, rc, 0xdd);
274 td = _mm512_shuffle_i32x4(r9, rd, 0xdd);
275 te = _mm512_shuffle_i32x4(ra, re, 0xdd);
276 tf = _mm512_shuffle_i32x4(rb, rf, 0xdd);
// Stage 4: combine the rows 0-7 halves (t0..t7) with the rows 8-15 halves
// (t8..tf) using the same even/odd lane selection, and store the results
// directly into the m_avx512 member of each vector. The 0x88 variants
// yield columns 0-7 (B0..B7), the 0xdd variants columns 8-15 (B8..BF).
278 B0.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0x88);
279 B1.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0x88);
280 B2.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0x88);
281 B3.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0x88);
282 B4.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0x88);
283 B5.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0x88);
284 B6.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0x88);
285 B7.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0x88);
286 B8.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0xdd);
287 B9.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0xdd);
288 BA.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0xdd);
289 BB.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0xdd);
290 BC.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0xdd);
291 BD.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0xdd);
292 BE.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0xdd);
293 BF.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0xdd);