shacal2_avx512.cpp
/*
* (C) 2025 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/

#include <botan/internal/shacal2.h>

#include <botan/internal/simd_avx2.h>
#include <botan/internal/simd_avx512.h>

namespace Botan {

namespace SHACAL2_AVX512_F {

namespace {

// NOLINTBEGIN(portability-simd-intrinsics)

/*
* 8x16 Transpose
*
* Convert from
*
* A00 B00 C00 ... H00
* A01 B01 C01 ... H01
* ..
* A15 B15 C15 ... H15
*
* with two blocks stored in each register, into
*
* A00 A01 ... A15
* B00 B01 ... B15
* ...
* H00 H01 ... H15
*/
BOTAN_FN_ISA_AVX512
void transpose_in(SIMD_16x32& B0,
                  SIMD_16x32& B1,
                  SIMD_16x32& B2,
                  SIMD_16x32& B3,
                  SIMD_16x32& B4,
                  SIMD_16x32& B5,
                  SIMD_16x32& B6,
                  SIMD_16x32& B7) {
   auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());
   auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());
   auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());
   auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());
   auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());
   auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());
   auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());
   auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());

   auto r0 = _mm512_unpacklo_epi64(t0, t2);
   auto r1 = _mm512_unpackhi_epi64(t0, t2);
   auto r2 = _mm512_unpacklo_epi64(t1, t3);
   auto r3 = _mm512_unpackhi_epi64(t1, t3);
   auto r4 = _mm512_unpacklo_epi64(t4, t6);
   auto r5 = _mm512_unpackhi_epi64(t4, t6);
   auto r6 = _mm512_unpacklo_epi64(t5, t7);
   auto r7 = _mm512_unpackhi_epi64(t5, t7);

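   // Index vectors for _mm512_permutex2var_epi32 that merge the intermediates
   // built from blocks 0..7 (r0..r3) with those from blocks 8..15 (r4..r7),
   // so each output register ends up holding one word across all 16 blocks.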
   const __m512i tbl0 = _mm512_set_epi32(27, 19, 26, 18, 25, 17, 24, 16, 11, 3, 10, 2, 9, 1, 8, 0);
   const __m512i tbl1 = _mm512_add_epi32(tbl0, _mm512_set1_epi32(4));
   B0 = SIMD_16x32(_mm512_permutex2var_epi32(r0, tbl0, r4));
   B1 = SIMD_16x32(_mm512_permutex2var_epi32(r1, tbl0, r5));
   B2 = SIMD_16x32(_mm512_permutex2var_epi32(r2, tbl0, r6));
   B3 = SIMD_16x32(_mm512_permutex2var_epi32(r3, tbl0, r7));
   B4 = SIMD_16x32(_mm512_permutex2var_epi32(r0, tbl1, r4));
   B5 = SIMD_16x32(_mm512_permutex2var_epi32(r1, tbl1, r5));
   B6 = SIMD_16x32(_mm512_permutex2var_epi32(r2, tbl1, r6));
   B7 = SIMD_16x32(_mm512_permutex2var_epi32(r3, tbl1, r7));
}

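/*
* Inverse of the 8x16 transpose above: convert from the word-sliced layout
* (one of A..H per register, 16 blocks wide) back to the original layout
* with two blocks stored in each register, ready for store_be().
*/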
BOTAN_FN_ISA_AVX512
void transpose_out(SIMD_16x32& B0,
                   SIMD_16x32& B1,
                   SIMD_16x32& B2,
                   SIMD_16x32& B3,
                   SIMD_16x32& B4,
                   SIMD_16x32& B5,
                   SIMD_16x32& B6,
                   SIMD_16x32& B7) {
   auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());
   auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());
   auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());
   auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());
   auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());
   auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());
   auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());
   auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());

   auto r0 = _mm512_unpacklo_epi64(t0, t2);
   auto r1 = _mm512_unpackhi_epi64(t0, t2);
   auto r2 = _mm512_unpacklo_epi64(t1, t3);
   auto r3 = _mm512_unpackhi_epi64(t1, t3);
   auto r4 = _mm512_unpacklo_epi64(t4, t6);
   auto r5 = _mm512_unpackhi_epi64(t4, t6);
   auto r6 = _mm512_unpacklo_epi64(t5, t7);
   auto r7 = _mm512_unpackhi_epi64(t5, t7);

   const __m512i tbl0 = _mm512_set_epi32(23, 22, 21, 20, 7, 6, 5, 4, 19, 18, 17, 16, 3, 2, 1, 0);
   const __m512i tbl1 = _mm512_add_epi32(tbl0, _mm512_set1_epi32(8));

   auto s0 = _mm512_permutex2var_epi32(r0, tbl0, r4);
   auto s1 = _mm512_permutex2var_epi32(r1, tbl0, r5);
   auto s2 = _mm512_permutex2var_epi32(r2, tbl0, r6);
   auto s3 = _mm512_permutex2var_epi32(r3, tbl0, r7);
   auto s4 = _mm512_permutex2var_epi32(r0, tbl1, r4);
   auto s5 = _mm512_permutex2var_epi32(r1, tbl1, r5);
   auto s6 = _mm512_permutex2var_epi32(r2, tbl1, r6);
   auto s7 = _mm512_permutex2var_epi32(r3, tbl1, r7);

   B0 = SIMD_16x32(_mm512_shuffle_i32x4(s0, s1, 0b01000100));
   B1 = SIMD_16x32(_mm512_shuffle_i32x4(s2, s3, 0b01000100));
   B2 = SIMD_16x32(_mm512_shuffle_i32x4(s0, s1, 0b11101110));
   B3 = SIMD_16x32(_mm512_shuffle_i32x4(s2, s3, 0b11101110));
   B4 = SIMD_16x32(_mm512_shuffle_i32x4(s4, s5, 0b01000100));
   B5 = SIMD_16x32(_mm512_shuffle_i32x4(s6, s7, 0b01000100));
   B6 = SIMD_16x32(_mm512_shuffle_i32x4(s4, s5, 0b11101110));
   B7 = SIMD_16x32(_mm512_shuffle_i32x4(s6, s7, 0b11101110));
}

// NOLINTEND(portability-simd-intrinsics)

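/*
* One forward round of SHACAL-2 (the SHA-256 round function), applied to
* 8 or 16 blocks in parallel depending on SimdT. RK is the precomputed
* round key for this round.
*/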
template <typename SimdT>
void BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 SHACAL2_Fwd(const SimdT& A,
                                                        const SimdT& B,
                                                        const SimdT& C,
                                                        SimdT& D,
                                                        const SimdT& E,
                                                        const SimdT& F,
                                                        const SimdT& G,
                                                        SimdT& H,
                                                        uint32_t RK) {
   H += E.sigma1() + SimdT::choose(E, F, G) + SimdT::splat(RK);
   D += H;
   H += A.sigma0() + SimdT::majority(A, B, C);
}

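/*
* One inverse round of SHACAL-2, undoing the corresponding SHACAL2_Fwd step.
*/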
template <typename SimdT>
void BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 SHACAL2_Rev(const SimdT& A,
                                                        const SimdT& B,
                                                        const SimdT& C,
                                                        SimdT& D,
                                                        const SimdT& E,
                                                        const SimdT& F,
                                                        const SimdT& G,
                                                        SimdT& H,
                                                        uint32_t RK) {
   H -= A.sigma0() + SimdT::majority(A, B, C);
   D -= H;
   H -= E.sigma1() + SimdT::choose(E, F, G) + SimdT::splat(RK);
}

} // namespace

} // namespace SHACAL2_AVX512_F

size_t BOTAN_FN_ISA_AVX512 SHACAL2::avx512_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const {
   using namespace SHACAL2_AVX512_F;

   size_t consumed = 0;

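   // Encrypt 16 blocks at a time: each SIMD_16x32 register initially holds
   // two whole blocks, and after transpose_in holds one word of all 16 blocks.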
   while(blocks >= 16) {
      SIMD_16x32 A = SIMD_16x32::load_be(in + 64 * 0);
      SIMD_16x32 B = SIMD_16x32::load_be(in + 64 * 1);
      SIMD_16x32 C = SIMD_16x32::load_be(in + 64 * 2);
      SIMD_16x32 D = SIMD_16x32::load_be(in + 64 * 3);
      SIMD_16x32 E = SIMD_16x32::load_be(in + 64 * 4);
      SIMD_16x32 F = SIMD_16x32::load_be(in + 64 * 5);
      SIMD_16x32 G = SIMD_16x32::load_be(in + 64 * 6);
      SIMD_16x32 H = SIMD_16x32::load_be(in + 64 * 7);

      transpose_in(A, B, C, D, E, F, G, H);

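      // 64 rounds, unrolled eight at a time so the rotating register roles
      // return to their starting assignment after each loop iteration.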
      for(size_t r = 0; r != 64; r += 8) {
         SHACAL2_Fwd(A, B, C, D, E, F, G, H, m_RK[r + 0]);
         SHACAL2_Fwd(H, A, B, C, D, E, F, G, m_RK[r + 1]);
         SHACAL2_Fwd(G, H, A, B, C, D, E, F, m_RK[r + 2]);
         SHACAL2_Fwd(F, G, H, A, B, C, D, E, m_RK[r + 3]);
         SHACAL2_Fwd(E, F, G, H, A, B, C, D, m_RK[r + 4]);
         SHACAL2_Fwd(D, E, F, G, H, A, B, C, m_RK[r + 5]);
         SHACAL2_Fwd(C, D, E, F, G, H, A, B, m_RK[r + 6]);
         SHACAL2_Fwd(B, C, D, E, F, G, H, A, m_RK[r + 7]);
      }

      transpose_out(A, B, C, D, E, F, G, H);

      A.store_be(out + 64 * 0);
      B.store_be(out + 64 * 1);
      C.store_be(out + 64 * 2);
      D.store_be(out + 64 * 3);
      E.store_be(out + 64 * 4);
      F.store_be(out + 64 * 5);
      G.store_be(out + 64 * 6);
      H.store_be(out + 64 * 7);

      in += 16 * BLOCK_SIZE;
      out += 16 * BLOCK_SIZE;
      blocks -= 16;
      consumed += 16;
   }

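   // Handle remaining groups of 8 blocks with AVX2 (SIMD_8x32).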
   while(blocks >= 8) {
      SIMD_8x32 A = SIMD_8x32::load_be(in + 32 * 0);
      SIMD_8x32 B = SIMD_8x32::load_be(in + 32 * 1);
      SIMD_8x32 C = SIMD_8x32::load_be(in + 32 * 2);
      SIMD_8x32 D = SIMD_8x32::load_be(in + 32 * 3);
      SIMD_8x32 E = SIMD_8x32::load_be(in + 32 * 4);
      SIMD_8x32 F = SIMD_8x32::load_be(in + 32 * 5);
      SIMD_8x32 G = SIMD_8x32::load_be(in + 32 * 6);
      SIMD_8x32 H = SIMD_8x32::load_be(in + 32 * 7);

      SIMD_8x32::transpose(A, B, C, D, E, F, G, H);

      for(size_t r = 0; r != 64; r += 8) {
         SHACAL2_Fwd(A, B, C, D, E, F, G, H, m_RK[r + 0]);
         SHACAL2_Fwd(H, A, B, C, D, E, F, G, m_RK[r + 1]);
         SHACAL2_Fwd(G, H, A, B, C, D, E, F, m_RK[r + 2]);
         SHACAL2_Fwd(F, G, H, A, B, C, D, E, m_RK[r + 3]);
         SHACAL2_Fwd(E, F, G, H, A, B, C, D, m_RK[r + 4]);
         SHACAL2_Fwd(D, E, F, G, H, A, B, C, m_RK[r + 5]);
         SHACAL2_Fwd(C, D, E, F, G, H, A, B, m_RK[r + 6]);
         SHACAL2_Fwd(B, C, D, E, F, G, H, A, m_RK[r + 7]);
      }

      SIMD_8x32::transpose(A, B, C, D, E, F, G, H);

      A.store_be(out + 32 * 0);
      B.store_be(out + 32 * 1);
      C.store_be(out + 32 * 2);
      D.store_be(out + 32 * 3);
      E.store_be(out + 32 * 4);
      F.store_be(out + 32 * 5);
      G.store_be(out + 32 * 6);
      H.store_be(out + 32 * 7);

      in += 8 * BLOCK_SIZE;
      out += 8 * BLOCK_SIZE;
      blocks -= 8;
      consumed += 8;
   }

   return consumed;
}

size_t BOTAN_FN_ISA_AVX512 SHACAL2::avx512_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const {
   using namespace SHACAL2_AVX512_F;

   size_t consumed = 0;

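   // Decrypt 16 blocks at a time, using the same transposed layout as
   // avx512_encrypt_blocks.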
   while(blocks >= 16) {
      SIMD_16x32 A = SIMD_16x32::load_be(in + 64 * 0);
      SIMD_16x32 B = SIMD_16x32::load_be(in + 64 * 1);
      SIMD_16x32 C = SIMD_16x32::load_be(in + 64 * 2);
      SIMD_16x32 D = SIMD_16x32::load_be(in + 64 * 3);
      SIMD_16x32 E = SIMD_16x32::load_be(in + 64 * 4);
      SIMD_16x32 F = SIMD_16x32::load_be(in + 64 * 5);
      SIMD_16x32 G = SIMD_16x32::load_be(in + 64 * 6);
      SIMD_16x32 H = SIMD_16x32::load_be(in + 64 * 7);

      transpose_in(A, B, C, D, E, F, G, H);

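      // Apply the 64 rounds in reverse, consuming round keys from last to first.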
      for(size_t r = 0; r != 64; r += 8) {
         SHACAL2_Rev(B, C, D, E, F, G, H, A, m_RK[63 - r]);
         SHACAL2_Rev(C, D, E, F, G, H, A, B, m_RK[62 - r]);
         SHACAL2_Rev(D, E, F, G, H, A, B, C, m_RK[61 - r]);
         SHACAL2_Rev(E, F, G, H, A, B, C, D, m_RK[60 - r]);
         SHACAL2_Rev(F, G, H, A, B, C, D, E, m_RK[59 - r]);
         SHACAL2_Rev(G, H, A, B, C, D, E, F, m_RK[58 - r]);
         SHACAL2_Rev(H, A, B, C, D, E, F, G, m_RK[57 - r]);
         SHACAL2_Rev(A, B, C, D, E, F, G, H, m_RK[56 - r]);
      }

      transpose_out(A, B, C, D, E, F, G, H);

      A.store_be(out + 64 * 0);
      B.store_be(out + 64 * 1);
      C.store_be(out + 64 * 2);
      D.store_be(out + 64 * 3);
      E.store_be(out + 64 * 4);
      F.store_be(out + 64 * 5);
      G.store_be(out + 64 * 6);
      H.store_be(out + 64 * 7);

      in += 16 * BLOCK_SIZE;
      out += 16 * BLOCK_SIZE;
      blocks -= 16;
      consumed += 16;
   }

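   // Remaining groups of 8 blocks go through the AVX2 path.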
   while(blocks >= 8) {
      SIMD_8x32 A = SIMD_8x32::load_be(in + 32 * 0);
      SIMD_8x32 B = SIMD_8x32::load_be(in + 32 * 1);
      SIMD_8x32 C = SIMD_8x32::load_be(in + 32 * 2);
      SIMD_8x32 D = SIMD_8x32::load_be(in + 32 * 3);
      SIMD_8x32 E = SIMD_8x32::load_be(in + 32 * 4);
      SIMD_8x32 F = SIMD_8x32::load_be(in + 32 * 5);
      SIMD_8x32 G = SIMD_8x32::load_be(in + 32 * 6);
      SIMD_8x32 H = SIMD_8x32::load_be(in + 32 * 7);

      SIMD_8x32::transpose(A, B, C, D, E, F, G, H);

      for(size_t r = 0; r != 64; r += 8) {
         SHACAL2_Rev(B, C, D, E, F, G, H, A, m_RK[63 - r]);
         SHACAL2_Rev(C, D, E, F, G, H, A, B, m_RK[62 - r]);
         SHACAL2_Rev(D, E, F, G, H, A, B, C, m_RK[61 - r]);
         SHACAL2_Rev(E, F, G, H, A, B, C, D, m_RK[60 - r]);
         SHACAL2_Rev(F, G, H, A, B, C, D, E, m_RK[59 - r]);
         SHACAL2_Rev(G, H, A, B, C, D, E, F, m_RK[58 - r]);
         SHACAL2_Rev(H, A, B, C, D, E, F, G, m_RK[57 - r]);
         SHACAL2_Rev(A, B, C, D, E, F, G, H, m_RK[56 - r]);
      }

      SIMD_8x32::transpose(A, B, C, D, E, F, G, H);

      A.store_be(out + 32 * 0);
      B.store_be(out + 32 * 1);
      C.store_be(out + 32 * 2);
      D.store_be(out + 32 * 3);
      E.store_be(out + 32 * 4);
      F.store_be(out + 32 * 5);
      G.store_be(out + 32 * 6);
      H.store_be(out + 32 * 7);

      in += 8 * BLOCK_SIZE;
      out += 8 * BLOCK_SIZE;
      blocks -= 8;
      consumed += 8;
   }

   return consumed;
}

} // namespace Botan