Botan 3.11.0
Crypto and TLS for C++
camellia_avx512_gfni.cpp
Go to the documentation of this file.
1/*
2* (C) 2026 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/camellia.h>
8
9#include <botan/mem_ops.h>
10#include <botan/internal/isa_extn.h>
11#include <botan/internal/simd_8x64.h>
12#include <botan/internal/simd_avx2_gfni.h>
13
14namespace Botan {
15
16namespace Camellia_AVX512 {
17
18namespace {
19
/*
* GF(2) affine-transform matrices for evaluating the Camellia s-boxes with
* GFNI, consumed in camellia_f below.
*
* The pre* matrices (plus the constant byte pre_c) are applied on the input
* side via _mm512_gf2p8affine_epi64_epi8; the post* matrices are applied on
* the output side via _mm512_gf2p8affineinv_epi64_epi8, which performs the
* GF(2^8) inversion between the two affine layers. S1/S2/S3 share one input
* matrix (pre123_a) while S4 has its own (pre4_a); S1 and S4 share the output
* matrix post14_a. Note that post2_a and post3_a are row-rotations of
* post14_a, presumably reflecting the bit-rotation relationship between the
* Camellia s-boxes — TODO confirm against the reference s-box tables.
*/
constexpr uint64_t pre123_a = gfni_matrix(R"(
   1 1 1 0 1 1 0 1
   0 0 1 1 0 0 1 0
   1 1 0 1 0 0 0 0
   1 0 1 1 0 0 1 1
   0 0 0 0 1 1 0 0
   1 0 1 0 0 1 0 0
   0 0 1 0 1 1 0 0
   1 0 0 0 0 1 1 0)");

constexpr uint64_t pre4_a = gfni_matrix(R"(
   1 1 0 1 1 0 1 1
   0 1 1 0 0 1 0 0
   1 0 1 0 0 0 0 1
   0 1 1 0 0 1 1 1
   0 0 0 1 1 0 0 0
   0 1 0 0 1 0 0 1
   0 1 0 1 1 0 0 0
   0 0 0 0 1 1 0 1)");

// Constant byte XORed in by the input-side affine transform (shared by all
// four s-box variants; see camellia_f)
constexpr uint8_t pre_c = 0b01000101;

constexpr uint64_t post2_a = gfni_matrix(R"(
   0 0 0 1 1 1 0 0
   0 0 0 0 0 0 0 1
   0 1 1 0 0 1 1 0
   1 0 1 1 1 1 1 0
   0 0 0 1 1 0 1 1
   1 0 0 0 1 1 1 0
   0 1 0 1 1 1 1 0
   0 1 1 1 1 1 1 1)");

constexpr uint64_t post3_a = gfni_matrix(R"(
   0 1 1 0 0 1 1 0
   1 0 1 1 1 1 1 0
   0 0 0 1 1 0 1 1
   1 0 0 0 1 1 1 0
   0 1 0 1 1 1 1 0
   0 1 1 1 1 1 1 1
   0 0 0 1 1 1 0 0
   0 0 0 0 0 0 0 1)");

constexpr uint64_t post14_a = gfni_matrix(R"(
   0 0 0 0 0 0 0 1
   0 1 1 0 0 1 1 0
   1 0 1 1 1 1 1 0
   0 0 0 1 1 0 1 1
   1 0 0 0 1 1 1 0
   0 1 0 1 1 1 1 0
   0 1 1 1 1 1 1 1
   0 0 0 1 1 1 0 0)");
71
72// NOLINTBEGIN(portability-simd-intrinsics)
73
/*
* The Camellia F function applied to 8 blocks in parallel: s-box layer
* (via GFNI) followed by the byte-level linear mixing layer (via shuffles).
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI SIMD_8x64 camellia_f(SIMD_8x64 x) {
   const __m512i xr = x.raw();

   /*
   * Camellia sends different bytes of each word through different sboxes; we
   * resolve this like cavemen by computing all 4 S-box variants over the full
   * vector in parallel, then blending the results.
   */

   // Compute S1(x), S2(x), S3(x), S4(x) for all bytes
   //
   // Each s-box is expressed as (output affine) o GF(2^8)-inverse o (input
   // affine). S1/S2/S3 share the input-side affine (pre123_a); S4 uses its
   // own (pre4_a). The gf2p8affineinv step performs the field inversion and
   // the output-side affine with the per-sbox constant byte.
   const __m512i y123 = _mm512_gf2p8affine_epi64_epi8(xr, _mm512_set1_epi64(pre123_a), pre_c);
   const __m512i y4 = _mm512_gf2p8affine_epi64_epi8(xr, _mm512_set1_epi64(pre4_a), pre_c);

   const __m512i s1 = _mm512_gf2p8affineinv_epi64_epi8(y123, _mm512_set1_epi64(post14_a), 0x6E);
   const __m512i s2 = _mm512_gf2p8affineinv_epi64_epi8(y123, _mm512_set1_epi64(post2_a), 0xDC);
   const __m512i s3 = _mm512_gf2p8affineinv_epi64_epi8(y123, _mm512_set1_epi64(post3_a), 0x37);
   const __m512i s4 = _mm512_gf2p8affineinv_epi64_epi8(y4, _mm512_set1_epi64(post14_a), 0x6E);

   // Blend to find correct S(x) for each byte position
   //
   // The byte masks repeat per 64-bit lane: each 0x48 selects bit positions
   // 3 and 6, 0x24 selects 2 and 5, 0x12 selects 1 and 4; the remaining
   // bytes {0,7} keep s1.

   auto sx = s1;
   sx = _mm512_mask_blend_epi8(__mmask64(0x4848484848484848), sx, s2);  // s2 at bytes {3,6}
   sx = _mm512_mask_blend_epi8(__mmask64(0x2424242424242424), sx, s3);  // s3 at bytes {2,5}
   sx = _mm512_mask_blend_epi8(__mmask64(0x1212121212121212), sx, s4);  // s4 at bytes {1,4}

   // Linear mixing layer
   //
   // The P function XORs rotated byte-patterns of the s-box output; here it
   // is realized as the XOR of six byte shuffles. Control bytes with the top
   // bit set (the 0xFF entries in P6) produce zero bytes, per the documented
   // vpshufb semantics. The shuffle patterns repeat per 128-bit lane.
   const auto P1 = _mm512_set_epi64(0x0808080908080809,
                                    0x0000000100000001,
                                    0x0808080908080809,
                                    0x0000000100000001,
                                    0x0808080908080809,
                                    0x0000000100000001,
                                    0x0808080908080809,
                                    0x0000000100000001);
   const auto P2 = _mm512_set_epi64(0x09090A0A09090A0A,
                                    0x0101020201010202,
                                    0x09090A0A09090A0A,
                                    0x0101020201010202,
                                    0x09090A0A09090A0A,
                                    0x0101020201010202,
                                    0x09090A0A09090A0A,
                                    0x0101020201010202);
   const auto P3 = _mm512_set_epi64(0x0A0B0B0B0A0B0B0B,
                                    0x0203030302030303,
                                    0x0A0B0B0B0A0B0B0B,
                                    0x0203030302030303,
                                    0x0A0B0B0B0A0B0B0B,
                                    0x0203030302030303,
                                    0x0A0B0B0B0A0B0B0B,
                                    0x0203030302030303);
   const auto P4 = _mm512_set_epi64(0x0C0C0D0C0E0D0C0C,
                                    0x0404050406050404,
                                    0x0C0C0D0C0E0D0C0C,
                                    0x0404050406050404,
                                    0x0C0C0D0C0E0D0C0C,
                                    0x0404050406050404,
                                    0x0C0C0D0C0E0D0C0C,
                                    0x0404050406050404);
   const auto P5 = _mm512_set_epi64(0x0D0E0E0D0F0E0D0F,
                                    0x0506060507060507,
                                    0x0D0E0E0D0F0E0D0F,
                                    0x0506060507060507,
                                    0x0D0E0E0D0F0E0D0F,
                                    0x0506060507060507,
                                    0x0D0E0E0D0F0E0D0F,
                                    0x0506060507060507);
   const auto P6 = _mm512_set_epi64(0x0F0F0F0EFFFFFFFF,
                                    0x07070706FFFFFFFF,
                                    0x0F0F0F0EFFFFFFFF,
                                    0x07070706FFFFFFFF,
                                    0x0F0F0F0EFFFFFFFF,
                                    0x07070706FFFFFFFF,
                                    0x0F0F0F0EFFFFFFFF,
                                    0x07070706FFFFFFFF);

   const auto t1 = SIMD_8x64(_mm512_shuffle_epi8(sx, P1));
   const auto t2 = SIMD_8x64(_mm512_shuffle_epi8(sx, P2));
   const auto t3 = SIMD_8x64(_mm512_shuffle_epi8(sx, P3));
   const auto t4 = SIMD_8x64(_mm512_shuffle_epi8(sx, P4));
   const auto t5 = SIMD_8x64(_mm512_shuffle_epi8(sx, P5));
   const auto t6 = SIMD_8x64(_mm512_shuffle_epi8(sx, P6));

   return (t1 ^ t2 ^ t3 ^ t4 ^ t5 ^ t6);
}
158
159BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 SIMD_8x64 FL_8(SIMD_8x64 v, uint64_t K) {
160 const auto Kv = _mm512_set1_epi64(K);
161 auto vr = v.raw();
162
163 // x2 ^= rotl<1>(x1 & k1): AND, rotate 32-bit elements, shift high->low, XOR
164 vr = _mm512_xor_si512(vr, _mm512_srli_epi64(_mm512_rol_epi32(_mm512_and_si512(vr, Kv), 1), 32));
165
166 // x1 ^= (x2 | k2): OR, shift low->high, XOR
167 vr = _mm512_xor_si512(vr, _mm512_slli_epi64(_mm512_or_si512(vr, Kv), 32));
168
169 return SIMD_8x64(vr);
170}
171
172BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 SIMD_8x64 FLINV_8(SIMD_8x64 v, uint64_t K) {
173 const auto Kv = _mm512_set1_epi64(K);
174 auto vr = v.raw();
175
176 // x1 ^= (x2 | k2): OR, shift low->high, XOR
177 vr = _mm512_xor_si512(vr, _mm512_slli_epi64(_mm512_or_si512(vr, Kv), 32));
178
179 // x2 ^= rotl<1>(x1 & k1): AND, rotate 32-bit elements, shift high->low, XOR
180 vr = _mm512_xor_si512(vr, _mm512_srli_epi64(_mm512_rol_epi32(_mm512_and_si512(vr, Kv), 1), 32));
181
182 return SIMD_8x64(vr);
183}
184
185/*
186* Load 8 blocks, byte-swap, and deinterleave into L (even) and R (odd) halves
187*/
188BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 void load_and_deinterleave(const uint8_t in[], SIMD_8x64& L, SIMD_8x64& R) {
189 const auto idx_l = _mm512_set_epi64(0x0E, 0x0C, 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00);
190 const auto idx_r = _mm512_set_epi64(0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01);
191
192 auto A = SIMD_8x64::load_be(in);
193 auto B = SIMD_8x64::load_be(in + 64);
194
195 L = SIMD_8x64(_mm512_permutex2var_epi64(A.raw(), idx_l, B.raw()));
196 R = SIMD_8x64(_mm512_permutex2var_epi64(A.raw(), idx_r, B.raw()));
197}
198
199/*
200* Interleave R/L halves (note swap), byte-swap, and store 8 blocks
201*/
202BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 void interleave_and_store(uint8_t out[], SIMD_8x64 L, SIMD_8x64 R) {
203 const auto idx_lo = _mm512_set_epi64(0x0B, 0x03, 0x0A, 0x02, 0x09, 0x01, 0x08, 0x00);
204 const auto idx_hi = _mm512_set_epi64(0x0F, 0x07, 0x0E, 0x06, 0x0D, 0x05, 0x0C, 0x04);
205
206 auto A = SIMD_8x64(_mm512_permutex2var_epi64(R.raw(), idx_lo, L.raw()));
207 auto B = SIMD_8x64(_mm512_permutex2var_epi64(R.raw(), idx_hi, L.raw()));
208
209 A.store_be(out);
210 B.store_be(out + 64);
211}
212
213// NOLINTEND(portability-simd-intrinsics)
214
215// Helpers for 6 round iterations
216
217BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void six_e_rounds(SIMD_8x64& L,
218 SIMD_8x64& R,
219 std::span<const uint64_t> SK) {
220 R ^= camellia_f(L ^ SIMD_8x64::splat(SK[0]));
221 L ^= camellia_f(R ^ SIMD_8x64::splat(SK[1]));
222 R ^= camellia_f(L ^ SIMD_8x64::splat(SK[2]));
223 L ^= camellia_f(R ^ SIMD_8x64::splat(SK[3]));
224 R ^= camellia_f(L ^ SIMD_8x64::splat(SK[4]));
225 L ^= camellia_f(R ^ SIMD_8x64::splat(SK[5]));
226}
227
228BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void six_d_rounds(SIMD_8x64& L,
229 SIMD_8x64& R,
230 std::span<const uint64_t> SK) {
231 R ^= camellia_f(L ^ SIMD_8x64::splat(SK[5]));
232 L ^= camellia_f(R ^ SIMD_8x64::splat(SK[4]));
233 R ^= camellia_f(L ^ SIMD_8x64::splat(SK[3]));
234 L ^= camellia_f(R ^ SIMD_8x64::splat(SK[2]));
235 R ^= camellia_f(L ^ SIMD_8x64::splat(SK[1]));
236 L ^= camellia_f(R ^ SIMD_8x64::splat(SK[0]));
237}
238
239BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void six_e_rounds_x2(
240 SIMD_8x64& L1, SIMD_8x64& R1, SIMD_8x64& L2, SIMD_8x64& R2, std::span<const uint64_t> SK) {
241 const auto K0 = SIMD_8x64::splat(SK[0]);
242 const auto K1 = SIMD_8x64::splat(SK[1]);
243 const auto K2 = SIMD_8x64::splat(SK[2]);
244 const auto K3 = SIMD_8x64::splat(SK[3]);
245 const auto K4 = SIMD_8x64::splat(SK[4]);
246 const auto K5 = SIMD_8x64::splat(SK[5]);
247
248 R1 ^= camellia_f(L1 ^ K0);
249 R2 ^= camellia_f(L2 ^ K0);
250 L1 ^= camellia_f(R1 ^ K1);
251 L2 ^= camellia_f(R2 ^ K1);
252 R1 ^= camellia_f(L1 ^ K2);
253 R2 ^= camellia_f(L2 ^ K2);
254 L1 ^= camellia_f(R1 ^ K3);
255 L2 ^= camellia_f(R2 ^ K3);
256 R1 ^= camellia_f(L1 ^ K4);
257 R2 ^= camellia_f(L2 ^ K4);
258 L1 ^= camellia_f(R1 ^ K5);
259 L2 ^= camellia_f(R2 ^ K5);
260}
261
262BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void six_d_rounds_x2(
263 SIMD_8x64& L1, SIMD_8x64& R1, SIMD_8x64& L2, SIMD_8x64& R2, std::span<const uint64_t> SK) {
264 const auto K0 = SIMD_8x64::splat(SK[0]);
265 const auto K1 = SIMD_8x64::splat(SK[1]);
266 const auto K2 = SIMD_8x64::splat(SK[2]);
267 const auto K3 = SIMD_8x64::splat(SK[3]);
268 const auto K4 = SIMD_8x64::splat(SK[4]);
269 const auto K5 = SIMD_8x64::splat(SK[5]);
270
271 R1 ^= camellia_f(L1 ^ K5);
272 R2 ^= camellia_f(L2 ^ K5);
273 L1 ^= camellia_f(R1 ^ K4);
274 L2 ^= camellia_f(R2 ^ K4);
275 R1 ^= camellia_f(L1 ^ K3);
276 R2 ^= camellia_f(L2 ^ K3);
277 L1 ^= camellia_f(R1 ^ K2);
278 L2 ^= camellia_f(R2 ^ K2);
279 R1 ^= camellia_f(L1 ^ K1);
280 R2 ^= camellia_f(L2 ^ K1);
281 L1 ^= camellia_f(R1 ^ K0);
282 L2 ^= camellia_f(R2 ^ K0);
283}
284
285BOTAN_FN_ISA_AVX512_GFNI
286void camellia_encrypt_x16_18r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
287 SIMD_8x64 L1;
289 SIMD_8x64 L2;
291 load_and_deinterleave(in, L1, R1);
292 load_and_deinterleave(in + 128, L2, R2);
293
294 const auto K0 = SIMD_8x64::splat(SK[0]);
295 const auto K1 = SIMD_8x64::splat(SK[1]);
296 L1 ^= K0;
297 L2 ^= K0;
298 R1 ^= K1;
299 R2 ^= K1;
300
301 six_e_rounds_x2(L1, R1, L2, R2, SK.subspan(2));
302
303 L1 = FL_8(L1, SK[8]);
304 L2 = FL_8(L2, SK[8]);
305 R1 = FLINV_8(R1, SK[9]);
306 R2 = FLINV_8(R2, SK[9]);
307
308 six_e_rounds_x2(L1, R1, L2, R2, SK.subspan(10));
309
310 L1 = FL_8(L1, SK[16]);
311 L2 = FL_8(L2, SK[16]);
312 R1 = FLINV_8(R1, SK[17]);
313 R2 = FLINV_8(R2, SK[17]);
314
315 six_e_rounds_x2(L1, R1, L2, R2, SK.subspan(18));
316
317 const auto K24 = SIMD_8x64::splat(SK[24]);
318 const auto K25 = SIMD_8x64::splat(SK[25]);
319 R1 ^= K24;
320 R2 ^= K24;
321 L1 ^= K25;
322 L2 ^= K25;
323
324 interleave_and_store(out, L1, R1);
325 interleave_and_store(out + 128, L2, R2);
326}
327
328BOTAN_FN_ISA_AVX512_GFNI
329void camellia_decrypt_x16_18r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
330 SIMD_8x64 L1;
332 SIMD_8x64 L2;
334 load_and_deinterleave(in, L1, R1);
335 load_and_deinterleave(in + 128, L2, R2);
336
337 const auto K25 = SIMD_8x64::splat(SK[25]);
338 const auto K24 = SIMD_8x64::splat(SK[24]);
339 R1 ^= K25;
340 R2 ^= K25;
341 L1 ^= K24;
342 L2 ^= K24;
343
344 six_d_rounds_x2(L1, R1, L2, R2, SK.subspan(18));
345
346 L1 = FL_8(L1, SK[17]);
347 L2 = FL_8(L2, SK[17]);
348 R1 = FLINV_8(R1, SK[16]);
349 R2 = FLINV_8(R2, SK[16]);
350
351 six_d_rounds_x2(L1, R1, L2, R2, SK.subspan(10));
352
353 L1 = FL_8(L1, SK[9]);
354 L2 = FL_8(L2, SK[9]);
355 R1 = FLINV_8(R1, SK[8]);
356 R2 = FLINV_8(R2, SK[8]);
357
358 six_d_rounds_x2(L1, R1, L2, R2, SK.subspan(2));
359
360 const auto K1 = SIMD_8x64::splat(SK[1]);
361 const auto K0 = SIMD_8x64::splat(SK[0]);
362 L1 ^= K1;
363 L2 ^= K1;
364 R1 ^= K0;
365 R2 ^= K0;
366
367 interleave_and_store(out, L1, R1);
368 interleave_and_store(out + 128, L2, R2);
369}
370
371BOTAN_FN_ISA_AVX512_GFNI
372void camellia_encrypt_x16_24r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
373 SIMD_8x64 L1;
375 SIMD_8x64 L2;
377 load_and_deinterleave(in, L1, R1);
378 load_and_deinterleave(in + 128, L2, R2);
379
380 const auto K0 = SIMD_8x64::splat(SK[0]);
381 const auto K1 = SIMD_8x64::splat(SK[1]);
382 L1 ^= K0;
383 L2 ^= K0;
384 R1 ^= K1;
385 R2 ^= K1;
386
387 six_e_rounds_x2(L1, R1, L2, R2, SK.subspan(2));
388
389 L1 = FL_8(L1, SK[8]);
390 L2 = FL_8(L2, SK[8]);
391 R1 = FLINV_8(R1, SK[9]);
392 R2 = FLINV_8(R2, SK[9]);
393
394 six_e_rounds_x2(L1, R1, L2, R2, SK.subspan(10));
395
396 L1 = FL_8(L1, SK[16]);
397 L2 = FL_8(L2, SK[16]);
398 R1 = FLINV_8(R1, SK[17]);
399 R2 = FLINV_8(R2, SK[17]);
400
401 six_e_rounds_x2(L1, R1, L2, R2, SK.subspan(18));
402
403 L1 = FL_8(L1, SK[24]);
404 L2 = FL_8(L2, SK[24]);
405 R1 = FLINV_8(R1, SK[25]);
406 R2 = FLINV_8(R2, SK[25]);
407
408 six_e_rounds_x2(L1, R1, L2, R2, SK.subspan(26));
409
410 const auto K32 = SIMD_8x64::splat(SK[32]);
411 const auto K33 = SIMD_8x64::splat(SK[33]);
412 R1 ^= K32;
413 R2 ^= K32;
414 L1 ^= K33;
415 L2 ^= K33;
416
417 interleave_and_store(out, L1, R1);
418 interleave_and_store(out + 128, L2, R2);
419}
420
421BOTAN_FN_ISA_AVX512_GFNI
422void camellia_decrypt_x16_24r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
423 SIMD_8x64 L1;
425 SIMD_8x64 L2;
427 load_and_deinterleave(in, L1, R1);
428 load_and_deinterleave(in + 128, L2, R2);
429
430 const auto K33 = SIMD_8x64::splat(SK[33]);
431 const auto K32 = SIMD_8x64::splat(SK[32]);
432 R1 ^= K33;
433 R2 ^= K33;
434 L1 ^= K32;
435 L2 ^= K32;
436
437 six_d_rounds_x2(L1, R1, L2, R2, SK.subspan(26));
438
439 L1 = FL_8(L1, SK[25]);
440 L2 = FL_8(L2, SK[25]);
441 R1 = FLINV_8(R1, SK[24]);
442 R2 = FLINV_8(R2, SK[24]);
443
444 six_d_rounds_x2(L1, R1, L2, R2, SK.subspan(18));
445
446 L1 = FL_8(L1, SK[17]);
447 L2 = FL_8(L2, SK[17]);
448 R1 = FLINV_8(R1, SK[16]);
449 R2 = FLINV_8(R2, SK[16]);
450
451 six_d_rounds_x2(L1, R1, L2, R2, SK.subspan(10));
452
453 L1 = FL_8(L1, SK[9]);
454 L2 = FL_8(L2, SK[9]);
455 R1 = FLINV_8(R1, SK[8]);
456 R2 = FLINV_8(R2, SK[8]);
457
458 six_d_rounds_x2(L1, R1, L2, R2, SK.subspan(2));
459
460 const auto K1 = SIMD_8x64::splat(SK[1]);
461 const auto K0 = SIMD_8x64::splat(SK[0]);
462 L1 ^= K1;
463 L2 ^= K1;
464 R1 ^= K0;
465 R2 ^= K0;
466
467 interleave_and_store(out, L1, R1);
468 interleave_and_store(out + 128, L2, R2);
469}
470
471BOTAN_FN_ISA_AVX512_GFNI
472void camellia_encrypt_x8_18r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
473 SIMD_8x64 L;
474 SIMD_8x64 R;
475 load_and_deinterleave(in, L, R);
476
477 L ^= SIMD_8x64::splat(SK[0]);
478 R ^= SIMD_8x64::splat(SK[1]);
479
480 six_e_rounds(L, R, SK.subspan(2));
481
482 L = FL_8(L, SK[8]);
483 R = FLINV_8(R, SK[9]);
484
485 six_e_rounds(L, R, SK.subspan(10));
486
487 L = FL_8(L, SK[16]);
488 R = FLINV_8(R, SK[17]);
489
490 six_e_rounds(L, R, SK.subspan(18));
491
492 R ^= SIMD_8x64::splat(SK[24]);
493 L ^= SIMD_8x64::splat(SK[25]);
494
495 interleave_and_store(out, L, R);
496}
497
498BOTAN_FN_ISA_AVX512_GFNI
499void camellia_decrypt_x8_18r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
500 SIMD_8x64 L;
501 SIMD_8x64 R;
502 load_and_deinterleave(in, L, R);
503
504 R ^= SIMD_8x64::splat(SK[25]);
505 L ^= SIMD_8x64::splat(SK[24]);
506
507 six_d_rounds(L, R, SK.subspan(18));
508
509 L = FL_8(L, SK[17]);
510 R = FLINV_8(R, SK[16]);
511
512 six_d_rounds(L, R, SK.subspan(10));
513
514 L = FL_8(L, SK[9]);
515 R = FLINV_8(R, SK[8]);
516
517 six_d_rounds(L, R, SK.subspan(2));
518
519 L ^= SIMD_8x64::splat(SK[1]);
520 R ^= SIMD_8x64::splat(SK[0]);
521
522 interleave_and_store(out, L, R);
523}
524
525BOTAN_FN_ISA_AVX512_GFNI
526void camellia_encrypt_x8_24r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
527 SIMD_8x64 L;
528 SIMD_8x64 R;
529 load_and_deinterleave(in, L, R);
530
531 L ^= SIMD_8x64::splat(SK[0]);
532 R ^= SIMD_8x64::splat(SK[1]);
533
534 six_e_rounds(L, R, SK.subspan(2));
535
536 L = FL_8(L, SK[8]);
537 R = FLINV_8(R, SK[9]);
538
539 six_e_rounds(L, R, SK.subspan(10));
540
541 L = FL_8(L, SK[16]);
542 R = FLINV_8(R, SK[17]);
543
544 six_e_rounds(L, R, SK.subspan(18));
545
546 L = FL_8(L, SK[24]);
547 R = FLINV_8(R, SK[25]);
548
549 six_e_rounds(L, R, SK.subspan(26));
550
551 R ^= SIMD_8x64::splat(SK[32]);
552 L ^= SIMD_8x64::splat(SK[33]);
553
554 interleave_and_store(out, L, R);
555}
556
557BOTAN_FN_ISA_AVX512_GFNI
558void camellia_decrypt_x8_24r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
559 SIMD_8x64 L;
560 SIMD_8x64 R;
561 load_and_deinterleave(in, L, R);
562
563 R ^= SIMD_8x64::splat(SK[33]);
564 L ^= SIMD_8x64::splat(SK[32]);
565
566 six_d_rounds(L, R, SK.subspan(26));
567
568 L = FL_8(L, SK[25]);
569 R = FLINV_8(R, SK[24]);
570
571 six_d_rounds(L, R, SK.subspan(18));
572
573 L = FL_8(L, SK[17]);
574 R = FLINV_8(R, SK[16]);
575
576 six_d_rounds(L, R, SK.subspan(10));
577
578 L = FL_8(L, SK[9]);
579 R = FLINV_8(R, SK[8]);
580
581 six_d_rounds(L, R, SK.subspan(2));
582
583 L ^= SIMD_8x64::splat(SK[1]);
584 R ^= SIMD_8x64::splat(SK[0]);
585
586 interleave_and_store(out, L, R);
587}
588
589} // namespace
590
591} // namespace Camellia_AVX512
592
593// static
594void BOTAN_FN_ISA_AVX512_GFNI Camellia_128::avx512_gfni_encrypt(const uint8_t in[],
595 uint8_t out[],
596 size_t blocks,
597 std::span<const uint64_t> SK) {
598 while(blocks >= 16) {
599 Camellia_AVX512::camellia_encrypt_x16_18r(in, out, SK);
600 in += 16 * 16;
601 out += 16 * 16;
602 blocks -= 16;
603 }
604
605 while(blocks >= 8) {
606 Camellia_AVX512::camellia_encrypt_x8_18r(in, out, SK);
607 in += 8 * 16;
608 out += 8 * 16;
609 blocks -= 8;
610 }
611
612 if(blocks > 0) {
613 uint8_t ibuf[8 * 16] = {0};
614 uint8_t obuf[8 * 16] = {0};
615 copy_mem(ibuf, in, blocks * 16);
616 Camellia_AVX512::camellia_encrypt_x8_18r(ibuf, obuf, SK);
617 copy_mem(out, obuf, blocks * 16);
618 }
619}
620
621// static
622void BOTAN_FN_ISA_AVX512_GFNI Camellia_128::avx512_gfni_decrypt(const uint8_t in[],
623 uint8_t out[],
624 size_t blocks,
625 std::span<const uint64_t> SK) {
626 while(blocks >= 16) {
627 Camellia_AVX512::camellia_decrypt_x16_18r(in, out, SK);
628 in += 16 * 16;
629 out += 16 * 16;
630 blocks -= 16;
631 }
632
633 while(blocks >= 8) {
634 Camellia_AVX512::camellia_decrypt_x8_18r(in, out, SK);
635 in += 8 * 16;
636 out += 8 * 16;
637 blocks -= 8;
638 }
639
640 if(blocks > 0) {
641 uint8_t ibuf[8 * 16] = {0};
642 uint8_t obuf[8 * 16] = {0};
643 copy_mem(ibuf, in, blocks * 16);
644 Camellia_AVX512::camellia_decrypt_x8_18r(ibuf, obuf, SK);
645 copy_mem(out, obuf, blocks * 16);
646 }
647}
648
649// static
650void BOTAN_FN_ISA_AVX512_GFNI Camellia_192::avx512_gfni_encrypt(const uint8_t in[],
651 uint8_t out[],
652 size_t blocks,
653 std::span<const uint64_t> SK) {
654 while(blocks >= 16) {
655 Camellia_AVX512::camellia_encrypt_x16_24r(in, out, SK);
656 in += 16 * 16;
657 out += 16 * 16;
658 blocks -= 16;
659 }
660
661 while(blocks >= 8) {
662 Camellia_AVX512::camellia_encrypt_x8_24r(in, out, SK);
663 in += 8 * 16;
664 out += 8 * 16;
665 blocks -= 8;
666 }
667
668 if(blocks > 0) {
669 uint8_t ibuf[8 * 16] = {0};
670 uint8_t obuf[8 * 16] = {0};
671 copy_mem(ibuf, in, blocks * 16);
672 Camellia_AVX512::camellia_encrypt_x8_24r(ibuf, obuf, SK);
673 copy_mem(out, obuf, blocks * 16);
674 }
675}
676
677// static
678void BOTAN_FN_ISA_AVX512_GFNI Camellia_192::avx512_gfni_decrypt(const uint8_t in[],
679 uint8_t out[],
680 size_t blocks,
681 std::span<const uint64_t> SK) {
682 while(blocks >= 16) {
683 Camellia_AVX512::camellia_decrypt_x16_24r(in, out, SK);
684 in += 16 * 16;
685 out += 16 * 16;
686 blocks -= 16;
687 }
688
689 while(blocks >= 8) {
690 Camellia_AVX512::camellia_decrypt_x8_24r(in, out, SK);
691 in += 8 * 16;
692 out += 8 * 16;
693 blocks -= 8;
694 }
695
696 if(blocks > 0) {
697 uint8_t ibuf[8 * 16] = {0};
698 uint8_t obuf[8 * 16] = {0};
699 copy_mem(ibuf, in, blocks * 16);
700 Camellia_AVX512::camellia_decrypt_x8_24r(ibuf, obuf, SK);
701 copy_mem(out, obuf, blocks * 16);
702 }
703}
704
705// static
706void BOTAN_FN_ISA_AVX512_GFNI Camellia_256::avx512_gfni_encrypt(const uint8_t in[],
707 uint8_t out[],
708 size_t blocks,
709 std::span<const uint64_t> SK) {
710 while(blocks >= 16) {
711 Camellia_AVX512::camellia_encrypt_x16_24r(in, out, SK);
712 in += 16 * 16;
713 out += 16 * 16;
714 blocks -= 16;
715 }
716
717 while(blocks >= 8) {
718 Camellia_AVX512::camellia_encrypt_x8_24r(in, out, SK);
719 in += 8 * 16;
720 out += 8 * 16;
721 blocks -= 8;
722 }
723
724 if(blocks > 0) {
725 uint8_t ibuf[8 * 16] = {0};
726 uint8_t obuf[8 * 16] = {0};
727 copy_mem(ibuf, in, blocks * 16);
728 Camellia_AVX512::camellia_encrypt_x8_24r(ibuf, obuf, SK);
729 copy_mem(out, obuf, blocks * 16);
730 }
731}
732
733// static
734void BOTAN_FN_ISA_AVX512_GFNI Camellia_256::avx512_gfni_decrypt(const uint8_t in[],
735 uint8_t out[],
736 size_t blocks,
737 std::span<const uint64_t> SK) {
738 while(blocks >= 16) {
739 Camellia_AVX512::camellia_decrypt_x16_24r(in, out, SK);
740 in += 16 * 16;
741 out += 16 * 16;
742 blocks -= 16;
743 }
744
745 while(blocks >= 8) {
746 Camellia_AVX512::camellia_decrypt_x8_24r(in, out, SK);
747 in += 8 * 16;
748 out += 8 * 16;
749 blocks -= 8;
750 }
751
752 if(blocks > 0) {
753 uint8_t ibuf[8 * 16] = {0};
754 uint8_t obuf[8 * 16] = {0};
755 copy_mem(ibuf, in, blocks * 16);
756 Camellia_AVX512::camellia_decrypt_x8_24r(ibuf, obuf, SK);
757 copy_mem(out, obuf, blocks * 16);
758 }
759}
760
761} // namespace Botan
static BOTAN_FN_ISA_SIMD_8X64 SIMD_8x64 splat(uint64_t v)
Definition simd_8x64.h:154
__m512i BOTAN_FN_ISA_SIMD_8X64 raw() const noexcept
Definition simd_8x64.h:156
static BOTAN_FN_ISA_SIMD_8X64 SIMD_8x64 load_be(const void *in)
Definition simd_8x64.h:62
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
constexpr void copy_mem(T *out, const T *in, size_t n)
Definition mem_ops.h:144
uint32_t P1(uint32_t X)
Definition sm3_fn.h:65
consteval uint64_t gfni_matrix(std::string_view s)
void R2(uint32_t A, uint32_t &B, uint32_t C, uint32_t &D, uint32_t E, uint32_t &F, uint32_t G, uint32_t &H, uint32_t TJ, uint32_t Wi, uint32_t Wj)
Definition sm3_fn.h:43
void R1(uint32_t A, uint32_t &B, uint32_t C, uint32_t &D, uint32_t E, uint32_t &F, uint32_t G, uint32_t &H, uint32_t TJ, uint32_t Wi, uint32_t Wj)
Definition sm3_fn.h:21