7#include <botan/internal/shacal2.h>
9#include <botan/internal/simd_avx2.h>
10#include <botan/internal/simd_avx512.h>
// Input-direction transpose for the 16-block AVX-512 path: converts eight
// SIMD_16x32 registers (each holding one 64-byte load) into word-sliced form,
// so that lane i of every register belongs to the same cipher block.
// NOTE(review): the enclosing function signature is outside this fragment --
// presumably `transpose_in(SIMD_16x32& B0 .. & B7)` in SHACAL2_AVX512_F
// (namespace name visible in avx512_decrypt_blocks below); confirm in the
// full file. The stray leading numbers ("44", "45", ...) are line-number
// residue from extraction, not code.
//
// Stage 1: 32-bit interleave of adjacent register pairs (lo/hi halves of
// each 128-bit lane group).
44 auto t0 = _mm512_unpacklo_epi32(B0.
raw(), B1.
raw());
45 auto t1 = _mm512_unpackhi_epi32(B0.
raw(), B1.
raw());
46 auto t2 = _mm512_unpacklo_epi32(B2.
raw(), B3.
raw());
47 auto t3 = _mm512_unpackhi_epi32(B2.
raw(), B3.
raw());
48 auto t4 = _mm512_unpacklo_epi32(B4.
raw(), B5.
raw());
49 auto t5 = _mm512_unpackhi_epi32(B4.
raw(), B5.
raw());
50 auto t6 = _mm512_unpacklo_epi32(B6.
raw(), B7.
raw());
51 auto t7 = _mm512_unpackhi_epi32(B6.
raw(), B7.
raw());
// Stage 2: 64-bit interleave of the stage-1 results, completing a 4x4
// word transpose inside each 128-bit lane.
53 auto r0 = _mm512_unpacklo_epi64(t0, t2);
54 auto r1 = _mm512_unpackhi_epi64(t0, t2);
55 auto r2 = _mm512_unpacklo_epi64(t1, t3);
56 auto r3 = _mm512_unpackhi_epi64(t1, t3);
57 auto r4 = _mm512_unpacklo_epi64(t4, t6);
58 auto r5 = _mm512_unpackhi_epi64(t4, t6);
59 auto r6 = _mm512_unpacklo_epi64(t5, t7);
60 auto r7 = _mm512_unpackhi_epi64(t5, t7);
// Stage 3: cross-lane gather. In _mm512_permutex2var_epi32 an index >= 16
// selects from the second operand; tbl0's low half picks lanes
// {0,8,1,9,2,10,3,11} of the first operand and its high half the matching
// lanes of the second, i.e. it interleaves the two registers' lane groups.
// tbl1 = tbl0 + 4 picks the next lane quartet for the upper four outputs.
62 const __m512i tbl0 = _mm512_set_epi32(27, 19, 26, 18, 25, 17, 24, 16, 11, 3, 10, 2, 9, 1, 8, 0);
63 const __m512i tbl1 = _mm512_add_epi32(tbl0, _mm512_set1_epi32(4));
64 B0 =
SIMD_16x32(_mm512_permutex2var_epi32(r0, tbl0, r4));
65 B1 =
SIMD_16x32(_mm512_permutex2var_epi32(r1, tbl0, r5));
66 B2 =
SIMD_16x32(_mm512_permutex2var_epi32(r2, tbl0, r6));
67 B3 =
SIMD_16x32(_mm512_permutex2var_epi32(r3, tbl0, r7));
68 B4 =
SIMD_16x32(_mm512_permutex2var_epi32(r0, tbl1, r4));
69 B5 =
SIMD_16x32(_mm512_permutex2var_epi32(r1, tbl1, r5));
70 B6 =
SIMD_16x32(_mm512_permutex2var_epi32(r2, tbl1, r6));
71 B7 =
SIMD_16x32(_mm512_permutex2var_epi32(r3, tbl1, r7));
// Output-direction transpose: undoes transpose_in, returning the eight
// word-sliced SIMD_16x32 registers to contiguous-block layout so they can
// be stored straight to the output buffer.
// NOTE(review): signature is outside this fragment -- presumably
// `transpose_out(SIMD_16x32& B0 .. & B7)`; confirm in the full file.
// Leading numbers ("83", "84", ...) are extraction residue, not code.
//
// Stages 1 and 2 are identical in shape to transpose_in: 32-bit interleave
// of register pairs, then 64-bit interleave of the results.
83 auto t0 = _mm512_unpacklo_epi32(B0.
raw(), B1.
raw());
84 auto t1 = _mm512_unpackhi_epi32(B0.
raw(), B1.
raw());
85 auto t2 = _mm512_unpacklo_epi32(B2.
raw(), B3.
raw());
86 auto t3 = _mm512_unpackhi_epi32(B2.
raw(), B3.
raw());
87 auto t4 = _mm512_unpacklo_epi32(B4.
raw(), B5.
raw());
88 auto t5 = _mm512_unpackhi_epi32(B4.
raw(), B5.
raw());
89 auto t6 = _mm512_unpacklo_epi32(B6.
raw(), B7.
raw());
90 auto t7 = _mm512_unpackhi_epi32(B6.
raw(), B7.
raw());
91 [sic]
92 auto r0 = _mm512_unpacklo_epi64(t0, t2);
93 auto r1 = _mm512_unpackhi_epi64(t0, t2);
94 auto r2 = _mm512_unpacklo_epi64(t1, t3);
95 auto r3 = _mm512_unpackhi_epi64(t1, t3);
96 auto r4 = _mm512_unpacklo_epi64(t4, t6);
97 auto r5 = _mm512_unpackhi_epi64(t4, t6);
98 auto r6 = _mm512_unpacklo_epi64(t5, t7);
99 auto r7 = _mm512_unpackhi_epi64(t5, t7);
// Stage 3 differs from transpose_in: tbl0 gathers the low lane quartets of
// both operands ({0..3, 16..19} then {4..7, 20..23}), i.e. it concatenates
// lane groups rather than interleaving them; tbl1 = tbl0 + 8 selects the
// upper halves.
101 const __m512i tbl0 = _mm512_set_epi32(23, 22, 21, 20, 7, 6, 5, 4, 19, 18, 17, 16, 3, 2, 1, 0);
102 const __m512i tbl1 = _mm512_add_epi32(tbl0, _mm512_set1_epi32(8));
104 auto s0 = _mm512_permutex2var_epi32(r0, tbl0, r4);
105 auto s1 = _mm512_permutex2var_epi32(r1, tbl0, r5);
106 auto s2 = _mm512_permutex2var_epi32(r2, tbl0, r6);
107 auto s3 = _mm512_permutex2var_epi32(r3, tbl0, r7);
108 auto s4 = _mm512_permutex2var_epi32(r0, tbl1, r4);
109 auto s5 = _mm512_permutex2var_epi32(r1, tbl1, r5);
110 auto s6 = _mm512_permutex2var_epi32(r2, tbl1, r6);
111 auto s7 = _mm512_permutex2var_epi32(r3, tbl1, r7);
// Stage 4: 128-bit-lane shuffle. Immediate 0b01000100 takes 128-bit lanes
// {0,1} from each source; 0b11101110 takes lanes {2,3} -- together the
// eight shuffles distribute the s-registers back into block order.
113 B0 =
SIMD_16x32(_mm512_shuffle_i32x4(s0, s1, 0b01000100));
114 B1 =
SIMD_16x32(_mm512_shuffle_i32x4(s2, s3, 0b01000100));
115 B2 =
SIMD_16x32(_mm512_shuffle_i32x4(s0, s1, 0b11101110));
116 B3 =
SIMD_16x32(_mm512_shuffle_i32x4(s2, s3, 0b11101110));
117 B4 =
SIMD_16x32(_mm512_shuffle_i32x4(s4, s5, 0b01000100));
118 B5 =
SIMD_16x32(_mm512_shuffle_i32x4(s6, s7, 0b01000100));
119 B6 =
SIMD_16x32(_mm512_shuffle_i32x4(s4, s5, 0b11101110));
120 B7 =
SIMD_16x32(_mm512_shuffle_i32x4(s6, s7, 0b11101110));
// Forward (encryption) round, generic over the SIMD width via SimdT
// (SIMD_16x32 in the wide path, a 32-byte type in the tail path).
// The two visible statements are the SHA-256-style state update:
//   H += Sigma1(E) + Ch(E,F,G) + RK  and  H += Sigma0(A) + Maj(A,B,C),
// with the round key RK splatted across all lanes.
// NOTE(review): this fragment is incomplete -- the signature and at least
// one statement between the two updates (presumably `D += H`, the other
// half of the SHA-256 round) are not visible here; confirm in the full file.
123template <
typename SimdT>
133 H += E.sigma1() + SimdT::choose(E, F, G) + SimdT::splat(RK);
135 H += A.sigma0() + SimdT::majority(A, B, C);
// Inverse (decryption) round: exactly undoes SHACAL2_Fwd by subtracting the
// same two terms in reverse order (Maj/Sigma0 first, then Ch/Sigma1/RK).
// NOTE(review): fragment -- signature and the statement(s) between the two
// updates (presumably the matching `D -= H`) are not visible here; confirm
// in the full file.
138template <
typename SimdT>
148 H -= A.sigma0() + SimdT::majority(A, B, C);
150 H -= E.sigma1() + SimdT::choose(E, F, G) + SimdT::splat(RK);
// Encrypt `blocks` 32-byte SHACAL-2 blocks from `in` to `out` using AVX-512.
// NOTE(review): fragment -- the loads of A..H, the in/out pointer advances,
// the `blocks` bookkeeping, the wide path's stores, the tail path's setup,
// and the return value are all in lines not visible here; comments below
// only cover what is shown.
157size_t BOTAN_FN_ISA_AVX512 SHACAL2::avx512_encrypt_blocks(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// Wide path: 16 blocks per iteration (SIMD_16x32 = 16 lanes of 32 bits).
162 while(blocks >= 16) {
// Word-slice the 16 loaded blocks so each round operates on all 16 at once.
172 transpose_in(A, B, C, D, E, F, G, H);
// 64 rounds, 8 per unrolled iteration; instead of rotating the state
// after each round, the argument order of SHACAL2_Fwd is rotated.
174 for(
size_t r = 0; r != 64; r += 8) {
175 SHACAL2_Fwd(A, B, C, D, E, F, G, H, m_RK[r + 0]);
176 SHACAL2_Fwd(H, A, B, C, D, E, F, G, m_RK[r + 1]);
177 SHACAL2_Fwd(G, H, A, B, C, D, E, F, m_RK[r + 2]);
178 SHACAL2_Fwd(F, G, H, A, B, C, D, E, m_RK[r + 3]);
179 SHACAL2_Fwd(E, F, G, H, A, B, C, D, m_RK[r + 4]);
180 SHACAL2_Fwd(D, E, F, G, H, A, B, C, m_RK[r + 5]);
181 SHACAL2_Fwd(C, D, E, F, G, H, A, B, m_RK[r + 6]);
182 SHACAL2_Fwd(B, C, D, E, F, G, H, A, m_RK[r + 7]);
// Back to contiguous block layout before storing (stores not visible here).
185 transpose_out(A, B, C, D, E, F, G, H);
// Tail path (setup not visible): same 64-round schedule on narrower
// registers -- the 32-byte store stride below implies 8 lanes of 32 bits,
// presumably SIMD_8x32 / AVX2; confirm against the missing lines.
214 for(
size_t r = 0; r != 64; r += 8) {
215 SHACAL2_Fwd(A, B, C, D, E, F, G, H, m_RK[r + 0]);
216 SHACAL2_Fwd(H, A, B, C, D, E, F, G, m_RK[r + 1]);
217 SHACAL2_Fwd(G, H, A, B, C, D, E, F, m_RK[r + 2]);
218 SHACAL2_Fwd(F, G, H, A, B, C, D, E, m_RK[r + 3]);
219 SHACAL2_Fwd(E, F, G, H, A, B, C, D, m_RK[r + 4]);
220 SHACAL2_Fwd(D, E, F, G, H, A, B, C, m_RK[r + 5]);
221 SHACAL2_Fwd(C, D, E, F, G, H, A, B, m_RK[r + 6]);
222 SHACAL2_Fwd(B, C, D, E, F, G, H, A, m_RK[r + 7]);
// Big-endian stores, 32 bytes per register in this tail path.
227 A.store_be(out + 32 * 0);
228 B.store_be(out + 32 * 1);
229 C.store_be(out + 32 * 2);
230 D.store_be(out + 32 * 3);
231 E.store_be(out + 32 * 4);
232 F.store_be(out + 32 * 5);
233 G.store_be(out + 32 * 6);
234 H.store_be(out + 32 * 7);
// Decrypt `blocks` 32-byte SHACAL-2 blocks from `in` to `out` using AVX-512:
// mirror of avx512_encrypt_blocks, applying SHACAL2_Rev with the round keys
// in reverse order (m_RK[63] down to m_RK[0]).
// NOTE(review): fragment -- loads, pointer advances, `blocks` bookkeeping,
// loop/brace closings and the return value are in lines not visible here.
245size_t BOTAN_FN_ISA_AVX512 SHACAL2::avx512_decrypt_blocks(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
246 using namespace SHACAL2_AVX512_F;
// Wide path: 16 blocks per iteration.
250 while(blocks >= 16) {
260 transpose_in(A, B, C, D, E, F, G, H);
// 64 rounds in reverse; the argument rotation runs opposite to the
// forward direction so each SHACAL2_Rev undoes the matching SHACAL2_Fwd.
262 for(
size_t r = 0; r != 64; r += 8) {
263 SHACAL2_Rev(B, C, D, E, F, G, H, A, m_RK[63 - r]);
264 SHACAL2_Rev(C, D, E, F, G, H, A, B, m_RK[62 - r]);
265 SHACAL2_Rev(D, E, F, G, H, A, B, C, m_RK[61 - r]);
266 SHACAL2_Rev(E, F, G, H, A, B, C, D, m_RK[60 - r]);
267 SHACAL2_Rev(F, G, H, A, B, C, D, E, m_RK[59 - r]);
268 SHACAL2_Rev(G, H, A, B, C, D, E, F, m_RK[58 - r]);
269 SHACAL2_Rev(H, A, B, C, D, E, F, G, m_RK[57 - r]);
270 SHACAL2_Rev(A, B, C, D, E, F, G, H, m_RK[56 - r]);
273 transpose_out(A, B, C, D, E, F, G, H);
// 64-byte stride: each SIMD_16x32 register stores 16 x 4 = 64 bytes.
275 A.store_be(out + 64 * 0);
276 B.store_be(out + 64 * 1);
277 C.store_be(out + 64 * 2);
278 D.store_be(out + 64 * 3);
279 E.store_be(out + 64 * 4);
280 F.store_be(out + 64 * 5);
281 G.store_be(out + 64 * 6);
282 H.store_be(out + 64 * 7);
// Tail path (setup not visible): same reverse schedule; the 32-byte store
// stride implies 8-lane registers, presumably SIMD_8x32 -- confirm against
// the missing lines.
302 for(
size_t r = 0; r != 64; r += 8) {
303 SHACAL2_Rev(B, C, D, E, F, G, H, A, m_RK[63 - r]);
304 SHACAL2_Rev(C, D, E, F, G, H, A, B, m_RK[62 - r]);
305 SHACAL2_Rev(D, E, F, G, H, A, B, C, m_RK[61 - r]);
306 SHACAL2_Rev(E, F, G, H, A, B, C, D, m_RK[60 - r]);
307 SHACAL2_Rev(F, G, H, A, B, C, D, E, m_RK[59 - r]);
308 SHACAL2_Rev(G, H, A, B, C, D, E, F, m_RK[58 - r]);
309 SHACAL2_Rev(H, A, B, C, D, E, F, G, m_RK[57 - r]);
310 SHACAL2_Rev(A, B, C, D, E, F, G, H, m_RK[56 - r]);
315 A.store_be(out + 32 * 0);
316 B.store_be(out + 32 * 1);
317 C.store_be(out + 32 * 2);
318 D.store_be(out + 32 * 3);
319 E.store_be(out + 32 * 4);
320 F.store_be(out + 32 * 5);
321 G.store_be(out + 32 * 6);
322 H.store_be(out + 32 * 7);
// NOTE(review): the lines below are detached declaration/prototype residue
// (no bodies, no terminating semicolons) apparently pulled in from the
// SIMD_16x32 / SIMD_8x32 headers during extraction -- they are not part of
// this translation unit's logic and should not be compiled as-is.
BOTAN_FN_ISA_AVX512 void store_be(uint8_t out[]) const
__m512i BOTAN_FN_ISA_AVX512 raw() const
static BOTAN_FN_ISA_AVX512 SIMD_16x32 load_be(const uint8_t *in)
static BOTAN_FN_ISA_AVX2 void transpose(SIMD_8x32 &B0, SIMD_8x32 &B1, SIMD_8x32 &B2, SIMD_8x32 &B3) noexcept
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_be(const uint8_t *in) noexcept
#define BOTAN_FORCE_INLINE