#include <botan/internal/shacal2.h>

#include <botan/internal/simd_avx2.h>
#include <botan/internal/simd_avx512.h>
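
   // transpose_in, stage 1: interleave the 32-bit words of the register pairs
   // (B0,B1), (B2,B3), (B4,B5) and (B6,B7) within each 128-bit lane.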
   auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());
   auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());
   auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());
   auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());
   auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());
   auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());
   auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());
   auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());
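
   // Stage 2: interleave 64-bit halves, so that r0..r3 hold four-word groups
   // drawn from B0..B3 and r4..r7 hold groups drawn from B4..B7.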
   auto r0 = _mm512_unpacklo_epi64(t0, t2);
   auto r1 = _mm512_unpackhi_epi64(t0, t2);
   auto r2 = _mm512_unpacklo_epi64(t1, t3);
   auto r3 = _mm512_unpackhi_epi64(t1, t3);
   auto r4 = _mm512_unpacklo_epi64(t4, t6);
   auto r5 = _mm512_unpackhi_epi64(t4, t6);
   auto r6 = _mm512_unpacklo_epi64(t5, t7);
   auto r7 = _mm512_unpackhi_epi64(t5, t7);
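
   // Stage 3: cross-register dword permutes complete the transpose. Index
   // entries below 16 select from the first operand, entries of 16 and above
   // from the second; tbl1 is tbl0 offset by four to pick the remaining words.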
   const __m512i tbl0 = _mm512_set_epi32(27, 19, 26, 18, 25, 17, 24, 16, 11, 3, 10, 2, 9, 1, 8, 0);
   const __m512i tbl1 = _mm512_add_epi32(tbl0, _mm512_set1_epi32(4));
   B0 = SIMD_16x32(_mm512_permutex2var_epi32(r0, tbl0, r4));
   B1 = SIMD_16x32(_mm512_permutex2var_epi32(r1, tbl0, r5));
   B2 = SIMD_16x32(_mm512_permutex2var_epi32(r2, tbl0, r6));
   B3 = SIMD_16x32(_mm512_permutex2var_epi32(r3, tbl0, r7));
   B4 = SIMD_16x32(_mm512_permutex2var_epi32(r0, tbl1, r4));
   B5 = SIMD_16x32(_mm512_permutex2var_epi32(r1, tbl1, r5));
   B6 = SIMD_16x32(_mm512_permutex2var_epi32(r2, tbl1, r6));
   B7 = SIMD_16x32(_mm512_permutex2var_epi32(r3, tbl1, r7));
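
   // transpose_out: the inverse rearrangement, from word-sliced form back to
   // consecutive blocks. Stages 1 and 2 repeat the 32-bit and 64-bit
   // interleaving used in transpose_in.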
   auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());
   auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());
   auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());
   auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());
   auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());
   auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());
   auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());
   auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());
   auto r0 = _mm512_unpacklo_epi64(t0, t2);
   auto r1 = _mm512_unpackhi_epi64(t0, t2);
   auto r2 = _mm512_unpacklo_epi64(t1, t3);
   auto r3 = _mm512_unpackhi_epi64(t1, t3);
   auto r4 = _mm512_unpacklo_epi64(t4, t6);
   auto r5 = _mm512_unpackhi_epi64(t4, t6);
   auto r6 = _mm512_unpacklo_epi64(t5, t7);
   auto r7 = _mm512_unpackhi_epi64(t5, t7);
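
   // Stage 3: tbl0 interleaves the 128-bit lanes of the low 256-bit halves of
   // its two operands; tbl1 (tbl0 + 8) does the same for the high halves.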
   const __m512i tbl0 = _mm512_set_epi32(23, 22, 21, 20, 7, 6, 5, 4, 19, 18, 17, 16, 3, 2, 1, 0);
   const __m512i tbl1 = _mm512_add_epi32(tbl0, _mm512_set1_epi32(8));

   auto s0 = _mm512_permutex2var_epi32(r0, tbl0, r4);
   auto s1 = _mm512_permutex2var_epi32(r1, tbl0, r5);
   auto s2 = _mm512_permutex2var_epi32(r2, tbl0, r6);
   auto s3 = _mm512_permutex2var_epi32(r3, tbl0, r7);
   auto s4 = _mm512_permutex2var_epi32(r0, tbl1, r4);
   auto s5 = _mm512_permutex2var_epi32(r1, tbl1, r5);
   auto s6 = _mm512_permutex2var_epi32(r2, tbl1, r6);
   auto s7 = _mm512_permutex2var_epi32(r3, tbl1, r7);
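
   // Final step: the immediate 0b01000100 selects the two low 128-bit lanes of
   // both inputs, 0b11101110 the two high lanes, putting the blocks back in
   // order.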
   B0 = SIMD_16x32(_mm512_shuffle_i32x4(s0, s1, 0b01000100));
   B1 = SIMD_16x32(_mm512_shuffle_i32x4(s2, s3, 0b01000100));
   B2 = SIMD_16x32(_mm512_shuffle_i32x4(s0, s1, 0b11101110));
   B3 = SIMD_16x32(_mm512_shuffle_i32x4(s2, s3, 0b11101110));
   B4 = SIMD_16x32(_mm512_shuffle_i32x4(s4, s5, 0b01000100));
   B5 = SIMD_16x32(_mm512_shuffle_i32x4(s6, s7, 0b01000100));
   B6 = SIMD_16x32(_mm512_shuffle_i32x4(s4, s5, 0b11101110));
   B7 = SIMD_16x32(_mm512_shuffle_i32x4(s6, s7, 0b11101110));
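
// Forward SHACAL-2 round: this is the SHA-256 compression round, with the
// round key RK taking the place of the message schedule word. D and H are
// updated in place.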
template <typename SimdT>
BOTAN_FORCE_INLINE void SHACAL2_Fwd(const SimdT& A, const SimdT& B, const SimdT& C, SimdT& D,
                                    const SimdT& E, const SimdT& F, const SimdT& G, SimdT& H,
                                    uint32_t RK) {
   H += E.sigma1() + SimdT::choose(E, F, G) + SimdT::splat(RK);
   D += H;
   H += A.sigma0() + SimdT::majority(A, B, C);
}
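
// Inverse round: undoes SHACAL2_Fwd by subtracting the same terms in reverse
// order.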
template <typename SimdT>
BOTAN_FORCE_INLINE void SHACAL2_Rev(const SimdT& A, const SimdT& B, const SimdT& C, SimdT& D,
                                    const SimdT& E, const SimdT& F, const SimdT& G, SimdT& H,
                                    uint32_t RK) {
   H -= A.sigma0() + SimdT::majority(A, B, C);
   D -= H;
   H -= E.sigma1() + SimdT::choose(E, F, G) + SimdT::splat(RK);
}
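
// AVX-512 bulk path: a SIMD_16x32 register holds 16 words, i.e. two 32-byte
// SHACAL-2 blocks, so the eight state registers A..H cover 16 blocks per
// iteration. transpose_in() word-slices them so that each register holds one
// state word of all 16 blocks for the 64 rounds; transpose_out() undoes this
// before the results are stored.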
size_t BOTAN_FN_ISA_AVX512 SHACAL2::avx512_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const {
   while(blocks >= 16) {

      transpose_in(A, B, C, D, E, F, G, H);

      for(size_t r = 0; r != 64; r += 8) {
         SHACAL2_Fwd(A, B, C, D, E, F, G, H, m_RK[r + 0]);
         SHACAL2_Fwd(H, A, B, C, D, E, F, G, m_RK[r + 1]);
         SHACAL2_Fwd(G, H, A, B, C, D, E, F, m_RK[r + 2]);
         SHACAL2_Fwd(F, G, H, A, B, C, D, E, m_RK[r + 3]);
         SHACAL2_Fwd(E, F, G, H, A, B, C, D, m_RK[r + 4]);
         SHACAL2_Fwd(D, E, F, G, H, A, B, C, m_RK[r + 5]);
         SHACAL2_Fwd(C, D, E, F, G, H, A, B, m_RK[r + 6]);
         SHACAL2_Fwd(B, C, D, E, F, G, H, A, m_RK[r + 7]);
      }

      transpose_out(A, B, C, D, E, F, G, H);
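
      // The 32-byte store stride below corresponds to a narrower pass over the
      // remaining blocks, presumably eight at a time in SIMD_8x32 registers
      // (hence the AVX2 header included above).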
      for(size_t r = 0; r != 64; r += 8) {
         SHACAL2_Fwd(A, B, C, D, E, F, G, H, m_RK[r + 0]);
         SHACAL2_Fwd(H, A, B, C, D, E, F, G, m_RK[r + 1]);
         SHACAL2_Fwd(G, H, A, B, C, D, E, F, m_RK[r + 2]);
         SHACAL2_Fwd(F, G, H, A, B, C, D, E, m_RK[r + 3]);
         SHACAL2_Fwd(E, F, G, H, A, B, C, D, m_RK[r + 4]);
         SHACAL2_Fwd(D, E, F, G, H, A, B, C, m_RK[r + 5]);
         SHACAL2_Fwd(C, D, E, F, G, H, A, B, m_RK[r + 6]);
         SHACAL2_Fwd(B, C, D, E, F, G, H, A, m_RK[r + 7]);
      }

      A.store_be(out + 32 * 0);
      B.store_be(out + 32 * 1);
      C.store_be(out + 32 * 2);
      D.store_be(out + 32 * 3);
      E.store_be(out + 32 * 4);
      F.store_be(out + 32 * 5);
      G.store_be(out + 32 * 6);
      H.store_be(out + 32 * 7);

size_t BOTAN_FN_ISA_AVX512 SHACAL2::avx512_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const {
   using namespace SHACAL2_AVX512_F;
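
   // Decryption mirrors the encrypt path: 16 blocks per iteration, with the
   // rounds run in reverse order and the round keys consumed from m_RK[63]
   // down to m_RK[0].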
   while(blocks >= 16) {

      transpose_in(A, B, C, D, E, F, G, H);

      for(size_t r = 0; r != 64; r += 8) {
         SHACAL2_Rev(B, C, D, E, F, G, H, A, m_RK[63 - r]);
         SHACAL2_Rev(C, D, E, F, G, H, A, B, m_RK[62 - r]);
         SHACAL2_Rev(D, E, F, G, H, A, B, C, m_RK[61 - r]);
         SHACAL2_Rev(E, F, G, H, A, B, C, D, m_RK[60 - r]);
         SHACAL2_Rev(F, G, H, A, B, C, D, E, m_RK[59 - r]);
         SHACAL2_Rev(G, H, A, B, C, D, E, F, m_RK[58 - r]);
         SHACAL2_Rev(H, A, B, C, D, E, F, G, m_RK[57 - r]);
         SHACAL2_Rev(A, B, C, D, E, F, G, H, m_RK[56 - r]);
      }

      transpose_out(A, B, C, D, E, F, G, H);

      A.store_be(out + 64 * 0);
      B.store_be(out + 64 * 1);
      C.store_be(out + 64 * 2);
      D.store_be(out + 64 * 3);
      E.store_be(out + 64 * 4);
      F.store_be(out + 64 * 5);
      G.store_be(out + 64 * 6);
      H.store_be(out + 64 * 7);
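
      // As in encryption, the 32-byte store stride below belongs to the
      // narrower tail pass over the remaining blocks (presumably SIMD_8x32,
      // eight blocks at a time).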
      for(size_t r = 0; r != 64; r += 8) {
         SHACAL2_Rev(B, C, D, E, F, G, H, A, m_RK[63 - r]);
         SHACAL2_Rev(C, D, E, F, G, H, A, B, m_RK[62 - r]);
         SHACAL2_Rev(D, E, F, G, H, A, B, C, m_RK[61 - r]);
         SHACAL2_Rev(E, F, G, H, A, B, C, D, m_RK[60 - r]);
         SHACAL2_Rev(F, G, H, A, B, C, D, E, m_RK[59 - r]);
         SHACAL2_Rev(G, H, A, B, C, D, E, F, m_RK[58 - r]);
         SHACAL2_Rev(H, A, B, C, D, E, F, G, m_RK[57 - r]);
         SHACAL2_Rev(A, B, C, D, E, F, G, H, m_RK[56 - r]);
      }

      A.store_be(out + 32 * 0);
      B.store_be(out + 32 * 1);
      C.store_be(out + 32 * 2);
      D.store_be(out + 32 * 3);
      E.store_be(out + 32 * 4);
      F.store_be(out + 32 * 5);
      G.store_be(out + 32 * 6);
      H.store_be(out + 32 * 7);