Botan 3.9.0
Crypto and TLS for C++
shacal2_avx512.cpp
Go to the documentation of this file.
1/*
2* (C) 2025 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/shacal2.h>
8
9#include <botan/internal/simd_avx2.h>
10#include <botan/internal/simd_avx512.h>
11
12namespace Botan {
13
15
16namespace {
17
18/*
19* 8x16 Transpose
20*
21* Convert from
22*
23* A00 B00 C00 ... H00
24* A01 B01 C01 ... H01
25* ..
26* A15 B15 C15 ... H15
27*
28* with two blocks stored in each register, into
29*
30* A00 A01 ... A15
31* B00 B01 ... B15
32* ...
33* H00 H01 ... H15
34*/
35BOTAN_FN_ISA_AVX512
36void transpose_in(SIMD_16x32& B0,
37 SIMD_16x32& B1,
38 SIMD_16x32& B2,
39 SIMD_16x32& B3,
40 SIMD_16x32& B4,
41 SIMD_16x32& B5,
42 SIMD_16x32& B6,
43 SIMD_16x32& B7) {
44 auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());
45 auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());
46 auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());
47 auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());
48 auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());
49 auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());
50 auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());
51 auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());
52
53 auto r0 = _mm512_unpacklo_epi64(t0, t2);
54 auto r1 = _mm512_unpackhi_epi64(t0, t2);
55 auto r2 = _mm512_unpacklo_epi64(t1, t3);
56 auto r3 = _mm512_unpackhi_epi64(t1, t3);
57 auto r4 = _mm512_unpacklo_epi64(t4, t6);
58 auto r5 = _mm512_unpackhi_epi64(t4, t6);
59 auto r6 = _mm512_unpacklo_epi64(t5, t7);
60 auto r7 = _mm512_unpackhi_epi64(t5, t7);
61
62 const __m512i tbl0 = _mm512_set_epi32(27, 19, 26, 18, 25, 17, 24, 16, 11, 3, 10, 2, 9, 1, 8, 0);
63 const __m512i tbl1 = _mm512_add_epi32(tbl0, _mm512_set1_epi32(4));
64 B0 = SIMD_16x32(_mm512_permutex2var_epi32(r0, tbl0, r4));
65 B1 = SIMD_16x32(_mm512_permutex2var_epi32(r1, tbl0, r5));
66 B2 = SIMD_16x32(_mm512_permutex2var_epi32(r2, tbl0, r6));
67 B3 = SIMD_16x32(_mm512_permutex2var_epi32(r3, tbl0, r7));
68 B4 = SIMD_16x32(_mm512_permutex2var_epi32(r0, tbl1, r4));
69 B5 = SIMD_16x32(_mm512_permutex2var_epi32(r1, tbl1, r5));
70 B6 = SIMD_16x32(_mm512_permutex2var_epi32(r2, tbl1, r6));
71 B7 = SIMD_16x32(_mm512_permutex2var_epi32(r3, tbl1, r7));
72}
73
74BOTAN_FN_ISA_AVX512
75void transpose_out(SIMD_16x32& B0,
76 SIMD_16x32& B1,
77 SIMD_16x32& B2,
78 SIMD_16x32& B3,
79 SIMD_16x32& B4,
80 SIMD_16x32& B5,
81 SIMD_16x32& B6,
82 SIMD_16x32& B7) {
83 auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());
84 auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());
85 auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());
86 auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());
87 auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());
88 auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());
89 auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());
90 auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());
91
92 auto r0 = _mm512_unpacklo_epi64(t0, t2);
93 auto r1 = _mm512_unpackhi_epi64(t0, t2);
94 auto r2 = _mm512_unpacklo_epi64(t1, t3);
95 auto r3 = _mm512_unpackhi_epi64(t1, t3);
96 auto r4 = _mm512_unpacklo_epi64(t4, t6);
97 auto r5 = _mm512_unpackhi_epi64(t4, t6);
98 auto r6 = _mm512_unpacklo_epi64(t5, t7);
99 auto r7 = _mm512_unpackhi_epi64(t5, t7);
100
101 const __m512i tbl0 = _mm512_set_epi32(23, 22, 21, 20, 7, 6, 5, 4, 19, 18, 17, 16, 3, 2, 1, 0);
102 const __m512i tbl1 = _mm512_add_epi32(tbl0, _mm512_set1_epi32(8));
103
104 auto s0 = _mm512_permutex2var_epi32(r0, tbl0, r4);
105 auto s1 = _mm512_permutex2var_epi32(r1, tbl0, r5);
106 auto s2 = _mm512_permutex2var_epi32(r2, tbl0, r6);
107 auto s3 = _mm512_permutex2var_epi32(r3, tbl0, r7);
108 auto s4 = _mm512_permutex2var_epi32(r0, tbl1, r4);
109 auto s5 = _mm512_permutex2var_epi32(r1, tbl1, r5);
110 auto s6 = _mm512_permutex2var_epi32(r2, tbl1, r6);
111 auto s7 = _mm512_permutex2var_epi32(r3, tbl1, r7);
112
113 B0 = SIMD_16x32(_mm512_shuffle_i32x4(s0, s1, 0b01000100));
114 B1 = SIMD_16x32(_mm512_shuffle_i32x4(s2, s3, 0b01000100));
115 B2 = SIMD_16x32(_mm512_shuffle_i32x4(s0, s1, 0b11101110));
116 B3 = SIMD_16x32(_mm512_shuffle_i32x4(s2, s3, 0b11101110));
117 B4 = SIMD_16x32(_mm512_shuffle_i32x4(s4, s5, 0b01000100));
118 B5 = SIMD_16x32(_mm512_shuffle_i32x4(s6, s7, 0b01000100));
119 B6 = SIMD_16x32(_mm512_shuffle_i32x4(s4, s5, 0b11101110));
120 B7 = SIMD_16x32(_mm512_shuffle_i32x4(s6, s7, 0b11101110));
121}
122
123template <typename SimdT>
124void BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 SHACAL2_Fwd(const SimdT& A,
125 const SimdT& B,
126 const SimdT& C,
127 SimdT& D,
128 const SimdT& E,
129 const SimdT& F,
130 const SimdT& G,
131 SimdT& H,
132 uint32_t RK) {
133 H += E.sigma1() + SimdT::choose(E, F, G) + SimdT::splat(RK);
134 D += H;
135 H += A.sigma0() + SimdT::majority(A, B, C);
136}
137
138template <typename SimdT>
139void BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 SHACAL2_Rev(const SimdT& A,
140 const SimdT& B,
141 const SimdT& C,
142 SimdT& D,
143 const SimdT& E,
144 const SimdT& F,
145 const SimdT& G,
146 SimdT& H,
147 uint32_t RK) {
148 H -= A.sigma0() + SimdT::majority(A, B, C);
149 D -= H;
150 H -= E.sigma1() + SimdT::choose(E, F, G) + SimdT::splat(RK);
151}
152
153} // namespace
154
155} // namespace SHACAL2_AVX512_F
156
157size_t BOTAN_FN_ISA_AVX512 SHACAL2::avx512_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const {
158 using namespace SHACAL2_AVX512_F;
159
160 size_t consumed = 0;
161
162 while(blocks >= 16) {
163 SIMD_16x32 A = SIMD_16x32::load_be(in + 64 * 0);
164 SIMD_16x32 B = SIMD_16x32::load_be(in + 64 * 1);
165 SIMD_16x32 C = SIMD_16x32::load_be(in + 64 * 2);
166 SIMD_16x32 D = SIMD_16x32::load_be(in + 64 * 3);
167 SIMD_16x32 E = SIMD_16x32::load_be(in + 64 * 4);
168 SIMD_16x32 F = SIMD_16x32::load_be(in + 64 * 5);
169 SIMD_16x32 G = SIMD_16x32::load_be(in + 64 * 6);
170 SIMD_16x32 H = SIMD_16x32::load_be(in + 64 * 7);
171
172 transpose_in(A, B, C, D, E, F, G, H);
173
174 for(size_t r = 0; r != 64; r += 8) {
175 SHACAL2_Fwd(A, B, C, D, E, F, G, H, m_RK[r + 0]);
176 SHACAL2_Fwd(H, A, B, C, D, E, F, G, m_RK[r + 1]);
177 SHACAL2_Fwd(G, H, A, B, C, D, E, F, m_RK[r + 2]);
178 SHACAL2_Fwd(F, G, H, A, B, C, D, E, m_RK[r + 3]);
179 SHACAL2_Fwd(E, F, G, H, A, B, C, D, m_RK[r + 4]);
180 SHACAL2_Fwd(D, E, F, G, H, A, B, C, m_RK[r + 5]);
181 SHACAL2_Fwd(C, D, E, F, G, H, A, B, m_RK[r + 6]);
182 SHACAL2_Fwd(B, C, D, E, F, G, H, A, m_RK[r + 7]);
183 }
184
185 transpose_out(A, B, C, D, E, F, G, H);
186
187 A.store_be(out + 64 * 0);
188 B.store_be(out + 64 * 1);
189 C.store_be(out + 64 * 2);
190 D.store_be(out + 64 * 3);
191 E.store_be(out + 64 * 4);
192 F.store_be(out + 64 * 5);
193 G.store_be(out + 64 * 6);
194 H.store_be(out + 64 * 7);
195
196 in += 16 * BLOCK_SIZE;
197 out += 16 * BLOCK_SIZE;
198 blocks -= 16;
199 consumed += 16;
200 }
201
202 while(blocks >= 8) {
203 SIMD_8x32 A = SIMD_8x32::load_be(in + 32 * 0);
204 SIMD_8x32 B = SIMD_8x32::load_be(in + 32 * 1);
205 SIMD_8x32 C = SIMD_8x32::load_be(in + 32 * 2);
206 SIMD_8x32 D = SIMD_8x32::load_be(in + 32 * 3);
207 SIMD_8x32 E = SIMD_8x32::load_be(in + 32 * 4);
208 SIMD_8x32 F = SIMD_8x32::load_be(in + 32 * 5);
209 SIMD_8x32 G = SIMD_8x32::load_be(in + 32 * 6);
210 SIMD_8x32 H = SIMD_8x32::load_be(in + 32 * 7);
211
212 SIMD_8x32::transpose(A, B, C, D, E, F, G, H);
213
214 for(size_t r = 0; r != 64; r += 8) {
215 SHACAL2_Fwd(A, B, C, D, E, F, G, H, m_RK[r + 0]);
216 SHACAL2_Fwd(H, A, B, C, D, E, F, G, m_RK[r + 1]);
217 SHACAL2_Fwd(G, H, A, B, C, D, E, F, m_RK[r + 2]);
218 SHACAL2_Fwd(F, G, H, A, B, C, D, E, m_RK[r + 3]);
219 SHACAL2_Fwd(E, F, G, H, A, B, C, D, m_RK[r + 4]);
220 SHACAL2_Fwd(D, E, F, G, H, A, B, C, m_RK[r + 5]);
221 SHACAL2_Fwd(C, D, E, F, G, H, A, B, m_RK[r + 6]);
222 SHACAL2_Fwd(B, C, D, E, F, G, H, A, m_RK[r + 7]);
223 }
224
225 SIMD_8x32::transpose(A, B, C, D, E, F, G, H);
226
227 A.store_be(out + 32 * 0);
228 B.store_be(out + 32 * 1);
229 C.store_be(out + 32 * 2);
230 D.store_be(out + 32 * 3);
231 E.store_be(out + 32 * 4);
232 F.store_be(out + 32 * 5);
233 G.store_be(out + 32 * 6);
234 H.store_be(out + 32 * 7);
235
236 in += 8 * BLOCK_SIZE;
237 out += 8 * BLOCK_SIZE;
238 blocks -= 8;
239 consumed += 8;
240 }
241
242 return consumed;
243}
244
245size_t BOTAN_FN_ISA_AVX512 SHACAL2::avx512_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const {
246 using namespace SHACAL2_AVX512_F;
247
248 size_t consumed = 0;
249
250 while(blocks >= 16) {
251 SIMD_16x32 A = SIMD_16x32::load_be(in + 64 * 0);
252 SIMD_16x32 B = SIMD_16x32::load_be(in + 64 * 1);
253 SIMD_16x32 C = SIMD_16x32::load_be(in + 64 * 2);
254 SIMD_16x32 D = SIMD_16x32::load_be(in + 64 * 3);
255 SIMD_16x32 E = SIMD_16x32::load_be(in + 64 * 4);
256 SIMD_16x32 F = SIMD_16x32::load_be(in + 64 * 5);
257 SIMD_16x32 G = SIMD_16x32::load_be(in + 64 * 6);
258 SIMD_16x32 H = SIMD_16x32::load_be(in + 64 * 7);
259
260 transpose_in(A, B, C, D, E, F, G, H);
261
262 for(size_t r = 0; r != 64; r += 8) {
263 SHACAL2_Rev(B, C, D, E, F, G, H, A, m_RK[63 - r]);
264 SHACAL2_Rev(C, D, E, F, G, H, A, B, m_RK[62 - r]);
265 SHACAL2_Rev(D, E, F, G, H, A, B, C, m_RK[61 - r]);
266 SHACAL2_Rev(E, F, G, H, A, B, C, D, m_RK[60 - r]);
267 SHACAL2_Rev(F, G, H, A, B, C, D, E, m_RK[59 - r]);
268 SHACAL2_Rev(G, H, A, B, C, D, E, F, m_RK[58 - r]);
269 SHACAL2_Rev(H, A, B, C, D, E, F, G, m_RK[57 - r]);
270 SHACAL2_Rev(A, B, C, D, E, F, G, H, m_RK[56 - r]);
271 }
272
273 transpose_out(A, B, C, D, E, F, G, H);
274
275 A.store_be(out + 64 * 0);
276 B.store_be(out + 64 * 1);
277 C.store_be(out + 64 * 2);
278 D.store_be(out + 64 * 3);
279 E.store_be(out + 64 * 4);
280 F.store_be(out + 64 * 5);
281 G.store_be(out + 64 * 6);
282 H.store_be(out + 64 * 7);
283
284 in += 16 * BLOCK_SIZE;
285 out += 16 * BLOCK_SIZE;
286 blocks -= 16;
287 consumed += 16;
288 }
289
290 while(blocks >= 8) {
291 SIMD_8x32 A = SIMD_8x32::load_be(in + 32 * 0);
292 SIMD_8x32 B = SIMD_8x32::load_be(in + 32 * 1);
293 SIMD_8x32 C = SIMD_8x32::load_be(in + 32 * 2);
294 SIMD_8x32 D = SIMD_8x32::load_be(in + 32 * 3);
295 SIMD_8x32 E = SIMD_8x32::load_be(in + 32 * 4);
296 SIMD_8x32 F = SIMD_8x32::load_be(in + 32 * 5);
297 SIMD_8x32 G = SIMD_8x32::load_be(in + 32 * 6);
298 SIMD_8x32 H = SIMD_8x32::load_be(in + 32 * 7);
299
300 SIMD_8x32::transpose(A, B, C, D, E, F, G, H);
301
302 for(size_t r = 0; r != 64; r += 8) {
303 SHACAL2_Rev(B, C, D, E, F, G, H, A, m_RK[63 - r]);
304 SHACAL2_Rev(C, D, E, F, G, H, A, B, m_RK[62 - r]);
305 SHACAL2_Rev(D, E, F, G, H, A, B, C, m_RK[61 - r]);
306 SHACAL2_Rev(E, F, G, H, A, B, C, D, m_RK[60 - r]);
307 SHACAL2_Rev(F, G, H, A, B, C, D, E, m_RK[59 - r]);
308 SHACAL2_Rev(G, H, A, B, C, D, E, F, m_RK[58 - r]);
309 SHACAL2_Rev(H, A, B, C, D, E, F, G, m_RK[57 - r]);
310 SHACAL2_Rev(A, B, C, D, E, F, G, H, m_RK[56 - r]);
311 }
312
313 SIMD_8x32::transpose(A, B, C, D, E, F, G, H);
314
315 A.store_be(out + 32 * 0);
316 B.store_be(out + 32 * 1);
317 C.store_be(out + 32 * 2);
318 D.store_be(out + 32 * 3);
319 E.store_be(out + 32 * 4);
320 F.store_be(out + 32 * 5);
321 G.store_be(out + 32 * 6);
322 H.store_be(out + 32 * 7);
323
324 in += 8 * BLOCK_SIZE;
325 out += 8 * BLOCK_SIZE;
326 blocks -= 8;
327 consumed += 8;
328 }
329
330 return consumed;
331}
332
333} // namespace Botan
BOTAN_FN_ISA_AVX512 void store_be(uint8_t out[]) const
Definition simd_avx512.h:74
__m512i BOTAN_FN_ISA_AVX512 raw() const
static BOTAN_FN_ISA_AVX512 SIMD_16x32 load_be(const uint8_t *in)
Definition simd_avx512.h:68
static BOTAN_FN_ISA_AVX2 void transpose(SIMD_8x32 &B0, SIMD_8x32 &B1, SIMD_8x32 &B2, SIMD_8x32 &B3) noexcept
Definition simd_avx2.h:264
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_be(const uint8_t *in) noexcept
Definition simd_avx2.h:81
#define BOTAN_FORCE_INLINE
Definition compiler.h:87