// Fragment of an AVX-512 + GFNI routine whose signature lies outside this
// excerpt (presumably camellia_f, which the round code calls -- TODO confirm).
// Visible pipeline: two affine transforms of the input, four affine-inverse
// transforms, a per-byte blend of those results, six byte shuffles and a
// final XOR of the shuffled vectors.
//
// pre_c is the constant byte passed to both pre-transform affine calls.
40constexpr uint8_t pre_c = 0b01000101;
75 const __m512i xr = x.
raw();
// Two affine transforms of the same input; they differ only in the matrix
// (pre123_a vs pre4_a, both defined outside this excerpt).
84 const __m512i y123 = _mm512_gf2p8affine_epi64_epi8(xr, _mm512_set1_epi64(pre123_a), pre_c);
85 const __m512i y4 = _mm512_gf2p8affine_epi64_epi8(xr, _mm512_set1_epi64(pre4_a), pre_c);
// Affine-inverse transforms. s1 and s4 share the same matrix and constant
// (post14_a, 0x6E) but take different inputs (y123 vs y4); s2 and s3 use
// their own matrix/constant pairs on y123.
87 const __m512i s1 = _mm512_gf2p8affineinv_epi64_epi8(y123, _mm512_set1_epi64(post14_a), 0x6E);
88 const __m512i s2 = _mm512_gf2p8affineinv_epi64_epi8(y123, _mm512_set1_epi64(post2_a), 0xDC);
89 const __m512i s3 = _mm512_gf2p8affineinv_epi64_epi8(y123, _mm512_set1_epi64(post3_a), 0x37);
90 const __m512i s4 = _mm512_gf2p8affineinv_epi64_epi8(y4, _mm512_set1_epi64(post14_a), 0x6E);
// Byte-wise blend: where a mask bit is set the byte is taken from the last
// argument, otherwise the current sx byte is kept. Within every 8-byte group
// 0x48 selects bytes 3 and 6 (from s2), 0x24 selects bytes 2 and 5 (from s3)
// and 0x12 selects bytes 1 and 4 (from s4). The initial value of sx is not
// visible in this excerpt (presumably it supplies bytes 0 and 7 -- confirm).
95 sx = _mm512_mask_blend_epi8(__mmask64(0x4848484848484848), sx, s2);
96 sx = _mm512_mask_blend_epi8(__mmask64(0x2424242424242424), sx, s3);
97 sx = _mm512_mask_blend_epi8(__mmask64(0x1212121212121212), sx, s4);
// P1..P6 are control vectors for the six _mm512_shuffle_epi8 calls that
// produce t1..t6 from sx. Only the first argument of each _mm512_set_epi64
// call is visible here. Control bytes with the top bit set (the 0xFF bytes
// in P6) make the shuffle emit a zero byte instead of a source byte.
100 const auto P1 = _mm512_set_epi64(0x0808080908080809,
108 const auto P2 = _mm512_set_epi64(0x09090A0A09090A0A,
116 const auto P3 = _mm512_set_epi64(0x0A0B0B0B0A0B0B0B,
124 const auto P4 = _mm512_set_epi64(0x0C0C0D0C0E0D0C0C,
132 const auto P5 = _mm512_set_epi64(0x0D0E0E0D0F0E0D0F,
140 const auto P6 = _mm512_set_epi64(0x0F0F0F0EFFFFFFFF,
149 const auto t1 =
SIMD_8x64(_mm512_shuffle_epi8(sx,
P1));
150 const auto t2 =
SIMD_8x64(_mm512_shuffle_epi8(sx, P2));
151 const auto t3 =
SIMD_8x64(_mm512_shuffle_epi8(sx, P3));
152 const auto t4 =
SIMD_8x64(_mm512_shuffle_epi8(sx, P4));
153 const auto t5 =
SIMD_8x64(_mm512_shuffle_epi8(sx, P5));
154 const auto t6 =
SIMD_8x64(_mm512_shuffle_epi8(sx, P6));
// The result is the XOR of all six shuffled vectors.
156 return (t1 ^ t2 ^ t3 ^ t4 ^ t5 ^ t6);
// Fragment of a keyed mixing step applied to every 64-bit lane (presumably
// the body of FL_8; the signature and the definition of vr are outside this
// excerpt -- TODO confirm). K is broadcast to all eight 64-bit lanes.
160 const auto Kv = _mm512_set1_epi64(K);
// low32 ^= rotl32(high32 & high32(K), 1): AND with the key, rotate each
// 32-bit element left by one, then shift the 64-bit lane right by 32 so that
// only the rotated high half lands in the low half.
164 vr = _mm512_xor_si512(vr, _mm512_srli_epi64(_mm512_rol_epi32(_mm512_and_si512(vr, Kv), 1), 32));
// high32 ^= (low32 | low32(K)): OR with the key, then shift the 64-bit lane
// left by 32 so the low half is moved into the high half. This uses the low
// half as already updated by the previous statement.
167 vr = _mm512_xor_si512(vr, _mm512_slli_epi64(_mm512_or_si512(vr, Kv), 32));
// Fragment of a keyed mixing step applied to every 64-bit lane (presumably
// the body of FLINV_8; the signature and the definition of vr are outside
// this excerpt -- TODO confirm). It performs the same two operations as the
// preceding fragment, in the opposite order. K is broadcast to all lanes.
173 const auto Kv = _mm512_set1_epi64(K);
// high32 ^= (low32 | low32(K)): OR with the key, then shift each 64-bit lane
// left by 32 to move the low half into the high half.
177 vr = _mm512_xor_si512(vr, _mm512_slli_epi64(_mm512_or_si512(vr, Kv), 32));
// low32 ^= rotl32(high32 & high32(K), 1), using the high half as updated by
// the previous statement: AND, rotate each 32-bit element left by one, then
// shift each 64-bit lane right by 32.
180 vr = _mm512_xor_si512(vr, _mm512_srli_epi64(_mm512_rol_epi32(_mm512_and_si512(vr, Kv), 1), 32));
// Fragment that splits two vectors A and B into L and R (presumably the body
// of load_and_deinterleave; how A and B are loaded is outside this excerpt).
// _mm512_set_epi64 lists elements from highest to lowest, and in the
// two-source permute indices 0..7 address A while 8..15 address B. So idx_l
// gathers the even-numbered 64-bit elements of the A:B pair and idx_r the
// odd-numbered ones.
189 const auto idx_l = _mm512_set_epi64(0x0E, 0x0C, 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00);
190 const auto idx_r = _mm512_set_epi64(0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01);
// L receives elements 0, 2, 4, ..., 14 of A:B.
195 L =
SIMD_8x64(_mm512_permutex2var_epi64(A.raw(), idx_l, B.raw()));
// R receives elements 1, 3, 5, ..., 15 of A:B.
196 R =
SIMD_8x64(_mm512_permutex2var_epi64(A.raw(), idx_r, B.raw()));
// Fragment that re-interleaves L and R before storing (presumably the body
// of interleave_and_store; the signature is outside this excerpt).
// R is the first permute source (indices 0..7) and L the second (8..15), so
// idx_lo yields R0, L0, R1, L1, R2, L2, R3, L3 (lowest element first) and
// idx_hi yields R4, L4, ..., R7, L7. Each R element precedes its L partner.
203 const auto idx_lo = _mm512_set_epi64(0x0B, 0x03, 0x0A, 0x02, 0x09, 0x01, 0x08, 0x00);
204 const auto idx_hi = _mm512_set_epi64(0x0F, 0x07, 0x0E, 0x06, 0x0D, 0x05, 0x0C, 0x04);
206 auto A =
SIMD_8x64(_mm512_permutex2var_epi64(R.
raw(), idx_lo, L.
raw()));
207 auto B =
SIMD_8x64(_mm512_permutex2var_epi64(R.
raw(), idx_hi, L.
raw()));
// B (the upper four R/L pairs) is written at byte offset 64 of out; the
// store of A is not visible in this excerpt.
210 B.store_be(out + 64);
219 std::span<const uint64_t> SK) {
230 std::span<const uint64_t> SK) {
// Six rounds applied to two independent states (L1,R1) and (L2,R2) with the
// same round keys, used in ascending order K0..K5 (presumably the body of
// six_e_rounds_x2; the signature and the definitions of K0..K5 are outside
// this excerpt). Rounds alternate: first R is updated from L, then L from
// the freshly updated R. The two states are interleaved statement by
// statement and never read each other's values.
248 R1 ^= camellia_f(L1 ^ K0);
249 R2 ^= camellia_f(L2 ^ K0);
250 L1 ^= camellia_f(
R1 ^ K1);
251 L2 ^= camellia_f(
R2 ^ K1);
252 R1 ^= camellia_f(L1 ^ K2);
253 R2 ^= camellia_f(L2 ^ K2);
254 L1 ^= camellia_f(
R1 ^ K3);
255 L2 ^= camellia_f(
R2 ^ K3);
256 R1 ^= camellia_f(L1 ^ K4);
257 R2 ^= camellia_f(L2 ^ K4);
258 L1 ^= camellia_f(
R1 ^ K5);
259 L2 ^= camellia_f(
R2 ^ K5);
// Six rounds applied to two independent states (L1,R1) and (L2,R2) with the
// same round keys, used in descending order K5..K0 (presumably the body of
// six_d_rounds_x2; the signature and the definitions of K0..K5 are outside
// this excerpt). The update pattern is the same as in the ascending-key
// variant: R is updated from L, then L from the freshly updated R.
271 R1 ^= camellia_f(L1 ^ K5);
272 R2 ^= camellia_f(L2 ^ K5);
273 L1 ^= camellia_f(
R1 ^ K4);
274 L2 ^= camellia_f(
R2 ^ K4);
275 R1 ^= camellia_f(L1 ^ K3);
276 R2 ^= camellia_f(L2 ^ K3);
277 L1 ^= camellia_f(
R1 ^ K2);
278 L2 ^= camellia_f(
R2 ^ K2);
279 R1 ^= camellia_f(L1 ^ K1);
280 R2 ^= camellia_f(L2 ^ K1);
281 L1 ^= camellia_f(
R1 ^ K0);
282 L2 ^= camellia_f(
R2 ^ K0);
// Encryption with three six-round groups, run on two register sets at once:
// (L1,R1) is loaded from `in` and (L2,R2) from `in + 128`; results go to
// `out` and `out + 128`.
// Visible key schedule: rounds from SK[2], FL/FLINV with SK[8]/SK[9], rounds
// from SK[10], FL/FLINV with SK[16]/SK[17], rounds from SK[18].
// Statements between these steps (the listing has gaps after the loads and
// before the stores, presumably the key whitening -- confirm) and the
// declarations of L1/R1/L2/R2 are outside this excerpt.
285BOTAN_FN_ISA_AVX512_GFNI
286void camellia_encrypt_x16_18r(
const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
291 load_and_deinterleave(in, L1,
R1);
292 load_and_deinterleave(in + 128, L2,
R2);
// First group of six rounds.
301 six_e_rounds_x2(L1,
R1, L2,
R2, SK.subspan(2));
// Mixing layer: FL on the L halves, FLINV on the R halves.
303 L1 = FL_8(L1, SK[8]);
304 L2 = FL_8(L2, SK[8]);
305 R1 = FLINV_8(
R1, SK[9]);
306 R2 = FLINV_8(
R2, SK[9]);
// Second group of six rounds.
308 six_e_rounds_x2(L1,
R1, L2,
R2, SK.subspan(10));
310 L1 = FL_8(L1, SK[16]);
311 L2 = FL_8(L2, SK[16]);
312 R1 = FLINV_8(
R1, SK[17]);
313 R2 = FLINV_8(
R2, SK[17]);
// Third group of six rounds.
315 six_e_rounds_x2(L1,
R1, L2,
R2, SK.subspan(18));
324 interleave_and_store(out, L1,
R1);
325 interleave_and_store(out + 128, L2,
R2);
// Decryption counterpart of the 18-round, two-register-set routine:
// (L1,R1) is loaded from `in` and (L2,R2) from `in + 128`; results go to
// `out` and `out + 128`.
// Subkeys are consumed in the reverse order of encryption: rounds from
// SK[18], FL/FLINV with SK[17]/SK[16], rounds from SK[10], FL/FLINV with
// SK[9]/SK[8], rounds from SK[2]. Note the FL/FLINV key indices are swapped
// relative to the encrypt routine.
// Statements between these steps (gaps after the loads and before the
// stores) and the declarations of L1/R1/L2/R2 are outside this excerpt.
328BOTAN_FN_ISA_AVX512_GFNI
329void camellia_decrypt_x16_18r(
const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
334 load_and_deinterleave(in, L1,
R1);
335 load_and_deinterleave(in + 128, L2,
R2);
// First group of six reverse-order rounds.
344 six_d_rounds_x2(L1,
R1, L2,
R2, SK.subspan(18));
// Mixing layer: FL on the L halves, FLINV on the R halves.
346 L1 = FL_8(L1, SK[17]);
347 L2 = FL_8(L2, SK[17]);
348 R1 = FLINV_8(
R1, SK[16]);
349 R2 = FLINV_8(
R2, SK[16]);
// Second group.
351 six_d_rounds_x2(L1,
R1, L2,
R2, SK.subspan(10));
353 L1 = FL_8(L1, SK[9]);
354 L2 = FL_8(L2, SK[9]);
355 R1 = FLINV_8(
R1, SK[8]);
356 R2 = FLINV_8(
R2, SK[8]);
// Third group.
358 six_d_rounds_x2(L1,
R1, L2,
R2, SK.subspan(2));
367 interleave_and_store(out, L1,
R1);
368 interleave_and_store(out + 128, L2,
R2);
// Encryption with four six-round groups, run on two register sets at once:
// (L1,R1) is loaded from `in` and (L2,R2) from `in + 128`; results go to
// `out` and `out + 128`.
// Visible key schedule: rounds from SK[2], FL/FLINV with SK[8]/SK[9], rounds
// from SK[10], FL/FLINV with SK[16]/SK[17], rounds from SK[18], FL/FLINV
// with SK[24]/SK[25], rounds from SK[26].
// Statements between these steps (gaps after the loads and before the
// stores) and the declarations of L1/R1/L2/R2 are outside this excerpt.
371BOTAN_FN_ISA_AVX512_GFNI
372void camellia_encrypt_x16_24r(
const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
377 load_and_deinterleave(in, L1,
R1);
378 load_and_deinterleave(in + 128, L2,
R2);
// First group of six rounds.
387 six_e_rounds_x2(L1,
R1, L2,
R2, SK.subspan(2));
// Mixing layer: FL on the L halves, FLINV on the R halves.
389 L1 = FL_8(L1, SK[8]);
390 L2 = FL_8(L2, SK[8]);
391 R1 = FLINV_8(
R1, SK[9]);
392 R2 = FLINV_8(
R2, SK[9]);
// Second group.
394 six_e_rounds_x2(L1,
R1, L2,
R2, SK.subspan(10));
396 L1 = FL_8(L1, SK[16]);
397 L2 = FL_8(L2, SK[16]);
398 R1 = FLINV_8(
R1, SK[17]);
399 R2 = FLINV_8(
R2, SK[17]);
// Third group.
401 six_e_rounds_x2(L1,
R1, L2,
R2, SK.subspan(18));
403 L1 = FL_8(L1, SK[24]);
404 L2 = FL_8(L2, SK[24]);
405 R1 = FLINV_8(
R1, SK[25]);
406 R2 = FLINV_8(
R2, SK[25]);
// Fourth group.
408 six_e_rounds_x2(L1,
R1, L2,
R2, SK.subspan(26));
417 interleave_and_store(out, L1,
R1);
418 interleave_and_store(out + 128, L2,
R2);
// Decryption counterpart of the 24-round, two-register-set routine:
// (L1,R1) is loaded from `in` and (L2,R2) from `in + 128`; results go to
// `out` and `out + 128`.
// Subkeys are consumed in the reverse order of encryption: rounds from
// SK[26], FL/FLINV with SK[25]/SK[24], rounds from SK[18], FL/FLINV with
// SK[17]/SK[16], rounds from SK[10], FL/FLINV with SK[9]/SK[8], rounds from
// SK[2].
// Statements between these steps (gaps after the loads and before the
// stores) and the declarations of L1/R1/L2/R2 are outside this excerpt.
421BOTAN_FN_ISA_AVX512_GFNI
422void camellia_decrypt_x16_24r(
const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
427 load_and_deinterleave(in, L1,
R1);
428 load_and_deinterleave(in + 128, L2,
R2);
// First group of six reverse-order rounds.
437 six_d_rounds_x2(L1,
R1, L2,
R2, SK.subspan(26));
// Mixing layer: FL on the L halves, FLINV on the R halves.
439 L1 = FL_8(L1, SK[25]);
440 L2 = FL_8(L2, SK[25]);
441 R1 = FLINV_8(
R1, SK[24]);
442 R2 = FLINV_8(
R2, SK[24]);
// Second group.
444 six_d_rounds_x2(L1,
R1, L2,
R2, SK.subspan(18));
446 L1 = FL_8(L1, SK[17]);
447 L2 = FL_8(L2, SK[17]);
448 R1 = FLINV_8(
R1, SK[16]);
449 R2 = FLINV_8(
R2, SK[16]);
// Third group.
451 six_d_rounds_x2(L1,
R1, L2,
R2, SK.subspan(10));
453 L1 = FL_8(L1, SK[9]);
454 L2 = FL_8(L2, SK[9]);
455 R1 = FLINV_8(
R1, SK[8]);
456 R2 = FLINV_8(
R2, SK[8]);
// Fourth group.
458 six_d_rounds_x2(L1,
R1, L2,
R2, SK.subspan(2));
467 interleave_and_store(out, L1,
R1);
468 interleave_and_store(out + 128, L2,
R2);
// Single-register-set encryption with three six-round groups: (L,R) is
// loaded from `in` and the result is stored to `out`.
// Visible key schedule: rounds from SK[2], FLINV on R with SK[9], rounds
// from SK[10], FLINV on R with SK[17], rounds from SK[18].
// The statements paired with each FLINV_8 call (presumably FL_8 on L with
// SK[8] / SK[16] -- confirm), anything between load/store and the rounds,
// and the declarations of L and R are outside this excerpt.
471BOTAN_FN_ISA_AVX512_GFNI
472void camellia_encrypt_x8_18r(
const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
475 load_and_deinterleave(in, L, R);
480 six_e_rounds(L, R, SK.subspan(2));
483 R = FLINV_8(R, SK[9]);
485 six_e_rounds(L, R, SK.subspan(10));
488 R = FLINV_8(R, SK[17]);
490 six_e_rounds(L, R, SK.subspan(18));
495 interleave_and_store(out, L, R);
// Single-register-set decryption with three six-round groups: (L,R) is
// loaded from `in` and the result is stored to `out`.
// Subkeys are consumed in reverse order: rounds from SK[18], FLINV on R with
// SK[16], rounds from SK[10], FLINV on R with SK[8], rounds from SK[2].
// The statements paired with each FLINV_8 call (presumably FL_8 on L with
// SK[17] / SK[9] -- confirm), anything between load/store and the rounds,
// and the declarations of L and R are outside this excerpt.
498BOTAN_FN_ISA_AVX512_GFNI
499void camellia_decrypt_x8_18r(
const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
502 load_and_deinterleave(in, L, R);
507 six_d_rounds(L, R, SK.subspan(18));
510 R = FLINV_8(R, SK[16]);
512 six_d_rounds(L, R, SK.subspan(10));
515 R = FLINV_8(R, SK[8]);
517 six_d_rounds(L, R, SK.subspan(2));
522 interleave_and_store(out, L, R);
// Single-register-set encryption with four six-round groups: (L,R) is
// loaded from `in` and the result is stored to `out`.
// Visible key schedule: rounds from SK[2], FLINV on R with SK[9], rounds
// from SK[10], FLINV on R with SK[17], rounds from SK[18], FLINV on R with
// SK[25], rounds from SK[26].
// The statements paired with each FLINV_8 call (presumably FL_8 on L with
// SK[8] / SK[16] / SK[24] -- confirm), anything between load/store and the
// rounds, and the declarations of L and R are outside this excerpt.
525BOTAN_FN_ISA_AVX512_GFNI
526void camellia_encrypt_x8_24r(
const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
529 load_and_deinterleave(in, L, R);
534 six_e_rounds(L, R, SK.subspan(2));
537 R = FLINV_8(R, SK[9]);
539 six_e_rounds(L, R, SK.subspan(10));
542 R = FLINV_8(R, SK[17]);
544 six_e_rounds(L, R, SK.subspan(18));
547 R = FLINV_8(R, SK[25]);
549 six_e_rounds(L, R, SK.subspan(26));
554 interleave_and_store(out, L, R);
// Single-register-set decryption with four six-round groups: (L,R) is
// loaded from `in` and the result is stored to `out`.
// Subkeys are consumed in reverse order: rounds from SK[26], FLINV on R with
// SK[24], rounds from SK[18], FLINV on R with SK[16], rounds from SK[10],
// FLINV on R with SK[8], rounds from SK[2].
// The statements paired with each FLINV_8 call (presumably FL_8 on L with
// SK[25] / SK[17] / SK[9] -- confirm), anything between load/store and the
// rounds, and the declarations of L and R are outside this excerpt.
557BOTAN_FN_ISA_AVX512_GFNI
558void camellia_decrypt_x8_24r(
const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
561 load_and_deinterleave(in, L, R);
566 six_d_rounds(L, R, SK.subspan(26));
569 R = FLINV_8(R, SK[24]);
571 six_d_rounds(L, R, SK.subspan(18));
574 R = FLINV_8(R, SK[16]);
576 six_d_rounds(L, R, SK.subspan(10));
579 R = FLINV_8(R, SK[8]);
581 six_d_rounds(L, R, SK.subspan(2));
586 interleave_and_store(out, L, R);