// Reverse the byte order of each aligned group of four bytes in m_avx512.
//
// BSWAP_MASK is the byte-shuffle control table: within every group of four
// consecutive indices the order is reversed (3,2,1,0, 7,6,5,4, ...). It is
// read with an unaligned 512-bit load, so the table itself needs no
// particular alignment.
181 const uint8_t BSWAP_MASK[64] = {
182 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
183 21, 20, 27, 26, 25, 24, 31, 30, 29, 28, 35, 34, 33, 32, 39, 38, 37, 36, 43, 42, 41, 40,
184 47, 46, 45, 44, 51, 50, 49, 48, 55, 54, 53, 52, 59, 58, 57, 56, 63, 62, 61, 60,
187 const __m512i
bswap = _mm512_loadu_si512(
reinterpret_cast<const __m512i*
>(BSWAP_MASK));
// _mm512_shuffle_epi8 selects bytes independently inside each 128-bit lane
// and uses only the low 4 bits of each control byte as the index (bit 7,
// which would zero the byte, is never set by this table). The entries
// 16..63 therefore act like 0..15 within their own lane, and the net effect
// is a byte reversal of every 4-byte group.
189 const __m512i output = _mm512_shuffle_epi8(m_avx512,
bswap);
// Four-stage unpack/shuffle network over the sixteen operands B0..BF, whose
// results are written back to B0.m_avx512 .. BF.m_avx512.
// NOTE(review): raw() is defined outside this excerpt; assuming it returns
// each operand's m_avx512 register, the network as a whole is a 16x16
// transpose of 32-bit elements (output register j, element i == input
// register i, element j) — confirm against the definition of raw().
//
// Stage 1: interleave the 32-bit elements of each adjacent operand pair
// (B0/B1, B2/B3, ...). unpacklo takes the low two elements of every 128-bit
// lane from both operands, unpackhi the high two.
224 auto t0 = _mm512_unpacklo_epi32(B0.
raw(), B1.
raw());
225 auto t1 = _mm512_unpackhi_epi32(B0.
raw(), B1.
raw());
226 auto t2 = _mm512_unpacklo_epi32(B2.
raw(), B3.
raw());
227 auto t3 = _mm512_unpackhi_epi32(B2.
raw(), B3.
raw());
228 auto t4 = _mm512_unpacklo_epi32(B4.
raw(), B5.
raw());
229 auto t5 = _mm512_unpackhi_epi32(B4.
raw(), B5.
raw());
230 auto t6 = _mm512_unpacklo_epi32(B6.
raw(), B7.
raw());
231 auto t7 = _mm512_unpackhi_epi32(B6.
raw(), B7.
raw());
232 auto t8 = _mm512_unpacklo_epi32(B8.
raw(), B9.
raw());
233 auto t9 = _mm512_unpackhi_epi32(B8.
raw(), B9.
raw());
234 auto ta = _mm512_unpacklo_epi32(BA.
raw(), BB.
raw());
235 auto tb = _mm512_unpackhi_epi32(BA.
raw(), BB.
raw());
236 auto tc = _mm512_unpacklo_epi32(BC.
raw(), BD.
raw());
237 auto td = _mm512_unpackhi_epi32(BC.
raw(), BD.
raw());
238 auto te = _mm512_unpacklo_epi32(BE.
raw(), BF.
raw());
239 auto tf = _mm512_unpackhi_epi32(BE.
raw(), BF.
raw());
// Stage 2: interleave 64-bit pairs of the stage-1 results. Afterwards each
// 128-bit lane k of r0..r3 holds element 4k+j (j = 0..3 for r0..r3) taken
// from the four operands B0..B3; r4..r7, r8..rb and rc..rf hold the same for
// B4..B7, B8..BB and BC..BF respectively.
241 auto r0 = _mm512_unpacklo_epi64(t0, t2);
242 auto r1 = _mm512_unpackhi_epi64(t0, t2);
243 auto r2 = _mm512_unpacklo_epi64(t1, t3);
244 auto r3 = _mm512_unpackhi_epi64(t1, t3);
245 auto r4 = _mm512_unpacklo_epi64(t4, t6);
246 auto r5 = _mm512_unpackhi_epi64(t4, t6);
247 auto r6 = _mm512_unpacklo_epi64(t5, t7);
248 auto r7 = _mm512_unpackhi_epi64(t5, t7);
249 auto r8 = _mm512_unpacklo_epi64(t8, ta);
250 auto r9 = _mm512_unpackhi_epi64(t8, ta);
251 auto ra = _mm512_unpacklo_epi64(t9, tb);
252 auto rb = _mm512_unpackhi_epi64(t9, tb);
253 auto rc = _mm512_unpacklo_epi64(tc, te);
254 auto rd = _mm512_unpackhi_epi64(tc, te);
255 auto re = _mm512_unpacklo_epi64(td, tf);
256 auto rf = _mm512_unpackhi_epi64(td, tf);
// Stage 3: regroup whole 128-bit lanes, reusing t0..tf as scratch.
// Immediate 0x88 picks lanes 0 and 2 of the first operand (low half of the
// result) and lanes 0 and 2 of the second operand (high half); 0xdd picks
// lanes 1 and 3 in the same way.
258 t0 = _mm512_shuffle_i32x4(r0, r4, 0x88);
259 t1 = _mm512_shuffle_i32x4(r1, r5, 0x88);
260 t2 = _mm512_shuffle_i32x4(r2, r6, 0x88);
261 t3 = _mm512_shuffle_i32x4(r3, r7, 0x88);
262 t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd);
263 t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd);
264 t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd);
265 t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd);
266 t8 = _mm512_shuffle_i32x4(r8, rc, 0x88);
267 t9 = _mm512_shuffle_i32x4(r9, rd, 0x88);
268 ta = _mm512_shuffle_i32x4(ra, re, 0x88);
269 tb = _mm512_shuffle_i32x4(rb, rf, 0x88);
270 tc = _mm512_shuffle_i32x4(r8, rc, 0xdd);
271 td = _mm512_shuffle_i32x4(r9, rd, 0xdd);
272 te = _mm512_shuffle_i32x4(ra, re, 0xdd);
273 tf = _mm512_shuffle_i32x4(rb, rf, 0xdd);
// Stage 4: second lane regrouping, combining the halves built from B0..B7
// (t0..t7) with those built from B8..BF (t8..tf), and storing the results
// directly into the operands' m_avx512 members. 0x88 yields B0..B7, 0xdd
// yields B8..BF.
275 B0.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0x88);
276 B1.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0x88);
277 B2.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0x88);
278 B3.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0x88);
279 B4.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0x88);
280 B5.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0x88);
281 B6.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0x88);
282 B7.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0x88);
283 B8.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0xdd);
284 B9.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0xdd);
285 BA.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0xdd);
286 BB.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0xdd);
287 BC.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0xdd);
288 BD.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0xdd);
289 BE.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0xdd);
290 BF.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0xdd);