Botan 3.11.0
Crypto and TLS for C++
camellia_avx2_gfni.cpp
Go to the documentation of this file.
1/*
2* (C) 2025,2026 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/camellia.h>
8
9#include <botan/mem_ops.h>
10#include <botan/internal/simd_4x64.h>
11#include <botan/internal/simd_avx2_gfni.h>
12
13namespace Botan {
14
namespace Camellia_AVX2_GFNI {
17/*
18* This follows exactly the approach used in the AVX-512+GFNI implementation
19* with only minor complications due to missing rotate and masked operations.
20*/
21
22namespace {
23
/*
* GF(2^8) affine matrices for evaluating the Camellia s-boxes with GFNI.
*
* All four Camellia s-boxes derive from s1, which (like the AES s-box) can
* be written as an affine transform, an inversion in GF(2^8), and a second
* affine transform. The related boxes are s2(x) = rotl(s1(x), 1),
* s3(x) = rotr(s1(x), 1), and s4(x) = s1(rotl(x, 1)), so all four reuse the
* same field inversion with suitably rotated input/output matrices.
*/

// Input affine matrix shared by s1, s2 and s3 (these three differ only in
// the transform applied after the field inversion)
constexpr uint64_t pre123_a = gfni_matrix(R"(
   1 1 1 0 1 1 0 1
   0 0 1 1 0 0 1 0
   1 1 0 1 0 0 0 0
   1 0 1 1 0 0 1 1
   0 0 0 0 1 1 0 0
   1 0 1 0 0 1 0 0
   0 0 1 0 1 1 0 0
   1 0 0 0 0 1 1 0)");

// Input affine matrix for s4: pre123_a composed with a 1-bit left rotation
// of the input, since s4(x) == s1(rotl(x, 1))
constexpr uint64_t pre4_a = gfni_matrix(R"(
   1 1 0 1 1 0 1 1
   0 1 1 0 0 1 0 0
   1 0 1 0 0 0 0 1
   0 1 1 0 0 1 1 1
   0 0 0 1 1 0 0 0
   0 1 0 0 1 0 0 1
   0 1 0 1 1 0 0 0
   0 0 0 0 1 1 0 1)");

// Constant addition performed together with the input affine transform
// (shared by all four s-boxes)
constexpr uint8_t pre_c = 0b01000101;

// Output affine matrix for s2: the s1 output matrix (post14_a) with its
// rows rotated by one, accounting for s2(x) == rotl(s1(x), 1)
constexpr uint64_t post2_a = gfni_matrix(R"(
   0 0 0 1 1 1 0 0
   0 0 0 0 0 0 0 1
   0 1 1 0 0 1 1 0
   1 0 1 1 1 1 1 0
   0 0 0 1 1 0 1 1
   1 0 0 0 1 1 1 0
   0 1 0 1 1 1 1 0
   0 1 1 1 1 1 1 1)");

// Output affine matrix for s3: post14_a rotated the opposite way, for
// s3(x) == rotr(s1(x), 1)
constexpr uint64_t post3_a = gfni_matrix(R"(
   0 1 1 0 0 1 1 0
   1 0 1 1 1 1 1 0
   0 0 0 1 1 0 1 1
   1 0 0 0 1 1 1 0
   0 1 0 1 1 1 1 0
   0 1 1 1 1 1 1 1
   0 0 0 1 1 1 0 0
   0 0 0 0 0 0 0 1)");

// Output affine matrix shared by s1 and s4 (both end with the s1 output
// transform)
constexpr uint64_t post14_a = gfni_matrix(R"(
   0 0 0 0 0 0 0 1
   0 1 1 0 0 1 1 0
   1 0 1 1 1 1 1 0
   0 0 0 1 1 0 1 1
   1 0 0 0 1 1 1 0
   0 1 0 1 1 1 1 0
   0 1 1 1 1 1 1 1
   0 0 0 1 1 1 0 0)");
75
76// NOLINTBEGIN(portability-simd-intrinsics)
77
/*
* Compute the Camellia F function (s-box substitution followed by the P
* linear permutation) on four 64-bit values in parallel. The round key
* addition is performed by the caller before invoking this.
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX2_GFNI SIMD_4x64 camellia_f(SIMD_4x64 x) {
   const __m256i xr = x.raw();

   // Compute S1(x), S2(x), S3(x), S4(x) for all bytes
   //
   // GF2P8AFFINEINVQB computes affine(inv(v)), so together with the input
   // affine transform each s-box costs two instructions. s1/s2/s3 share the
   // input transform y123. The output affine constants follow the same
   // rotations as the s-boxes: 0xDC == rotl(0x6E, 1), 0x37 == rotr(0x6E, 1).
   const auto y123 = _mm256_gf2p8affine_epi64_epi8(xr, _mm256_set1_epi64x(pre123_a), pre_c);
   const auto y4 = _mm256_gf2p8affine_epi64_epi8(xr, _mm256_set1_epi64x(pre4_a), pre_c);

   const auto s1 = _mm256_gf2p8affineinv_epi64_epi8(y123, _mm256_set1_epi64x(post14_a), 0x6E);
   const auto s2 = _mm256_gf2p8affineinv_epi64_epi8(y123, _mm256_set1_epi64x(post2_a), 0xDC);
   const auto s3 = _mm256_gf2p8affineinv_epi64_epi8(y123, _mm256_set1_epi64x(post3_a), 0x37);
   const auto s4 = _mm256_gf2p8affineinv_epi64_epi8(y4, _mm256_set1_epi64x(post14_a), 0x6E);

   // Blend to find correct S(x) for each byte position
   //
   // Camellia applies the s-boxes bytewise in the order s1,s2,s3,s4,s2,s3,s4,s1
   // (most significant byte first). VPBLENDVB selects from the second operand
   // wherever the mask byte has its top bit set, hence the 0xFF mask bytes.
   const auto mask_s2 = _mm256_set1_epi64x(0x00FF0000FF000000);
   const auto mask_s3 = _mm256_set1_epi64x(0x0000FF0000FF0000);
   const auto mask_s4 = _mm256_set1_epi64x(0x000000FF0000FF00);

   auto sx = s1;
   sx = _mm256_blendv_epi8(sx, s2, mask_s2);
   sx = _mm256_blendv_epi8(sx, s3, mask_s3);
   sx = _mm256_blendv_epi8(sx, s4, mask_s4);

   // Linear mixing layer
   //
   // Each output byte of Camellia's P function is the XOR of 5 or 6 of the
   // s-box output bytes. Each shuffle below routes at most one source byte to
   // each output byte; shuffle control bytes with the top bit set (0xFF)
   // produce zero, so XORing the six shuffles yields the full permutation.
   const auto P1 = _mm256_set_epi64x(0x0808080908080809, 0x0000000100000001, 0x0808080908080809, 0x0000000100000001);
   const auto P2 = _mm256_set_epi64x(0x09090A0A09090A0A, 0x0101020201010202, 0x09090A0A09090A0A, 0x0101020201010202);
   const auto P3 = _mm256_set_epi64x(0x0A0B0B0B0A0B0B0B, 0x0203030302030303, 0x0A0B0B0B0A0B0B0B, 0x0203030302030303);
   const auto P4 = _mm256_set_epi64x(0x0C0C0D0C0E0D0C0C, 0x0404050406050404, 0x0C0C0D0C0E0D0C0C, 0x0404050406050404);
   const auto P5 = _mm256_set_epi64x(0x0D0E0E0D0F0E0D0F, 0x0506060507060507, 0x0D0E0E0D0F0E0D0F, 0x0506060507060507);
   const auto P6 = _mm256_set_epi64x(0x0F0F0F0EFFFFFFFF, 0x07070706FFFFFFFF, 0x0F0F0F0EFFFFFFFF, 0x07070706FFFFFFFF);

   const auto t1 = SIMD_4x64(_mm256_shuffle_epi8(sx, P1));
   const auto t2 = SIMD_4x64(_mm256_shuffle_epi8(sx, P2));
   const auto t3 = SIMD_4x64(_mm256_shuffle_epi8(sx, P3));
   const auto t4 = SIMD_4x64(_mm256_shuffle_epi8(sx, P4));
   const auto t5 = SIMD_4x64(_mm256_shuffle_epi8(sx, P5));
   const auto t6 = SIMD_4x64(_mm256_shuffle_epi8(sx, P6));

   return (t1 ^ t2 ^ t3 ^ t4 ^ t5 ^ t6);
}
118
119BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX2 void load_and_deinterleave(const uint8_t in[], SIMD_4x64& L, SIMD_4x64& R) {
120 auto A = SIMD_4x64::load_be(in);
121 auto B = SIMD_4x64::load_be(in + 32);
122
123 auto Ap = _mm256_permute4x64_epi64(A.raw(), 0b11'01'10'00); // [L[0], L[1], R[0], R[1]]
124 auto Bp = _mm256_permute4x64_epi64(B.raw(), 0b11'01'10'00); // [L[2], L[3], R[2], R[3]]
125
126 L = SIMD_4x64(_mm256_permute2x128_si256(Ap, Bp, 0x20)); // [L[0], L[1], L[2], L[3]]
127 R = SIMD_4x64(_mm256_permute2x128_si256(Ap, Bp, 0x31)); // [R[0], R[1], R[2], R[3]]
128}
129
130BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX2 void interleave_and_store(uint8_t out[], SIMD_4x64 L, SIMD_4x64 R) {
131 auto T1 = _mm256_permute2x128_si256(R.raw(), L.raw(), 0x20); // [R[0], R[1], L[0], L[1]]
132 auto T2 = _mm256_permute2x128_si256(R.raw(), L.raw(), 0x31); // [R[2], R[3], L[2], L[3]]
133
134 auto A = SIMD_4x64(_mm256_permute4x64_epi64(T1, 0b11'01'10'00)); // [R[0], L[0], R[1], L[1]]
135 auto B = SIMD_4x64(_mm256_permute4x64_epi64(T2, 0b11'01'10'00)); // [R[2], L[2], R[3], L[3]]
136
137 A.store_be(out);
138 B.store_be(out + 32);
139}
140
141/*
142* 32-bit rotate on SIMD_4x64 helper for FL/FLINV
143*/
144BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX2 SIMD_4x64 rotl32_1(SIMD_4x64 t) {
145 return SIMD_4x64(_mm256_or_si256(_mm256_slli_epi32(t.raw(), 1), _mm256_srli_epi32(t.raw(), 31)));
146}
147
148// NOLINTEND(portability-simd-intrinsics)
149
150BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX2 SIMD_4x64 FL_4(SIMD_4x64 v, uint64_t K) {
151 const uint32_t k1 = static_cast<uint32_t>(K >> 32);
152 const uint32_t k2 = static_cast<uint32_t>(K & 0xFFFFFFFF);
153
154 auto x1 = v.shr<32>();
155 auto x2 = v & SIMD_4x64::splat(0xFFFFFFFF);
156
157 x2 ^= rotl32_1(x1 & SIMD_4x64::splat(k1));
158 x1 ^= (x2 | SIMD_4x64::splat(k2));
159
160 return x1.shl<32>() | x2;
161}
162
163BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX2 SIMD_4x64 FLINV_4(SIMD_4x64 v, uint64_t K) {
164 const uint32_t k1 = static_cast<uint32_t>(K >> 32);
165 const uint32_t k2 = static_cast<uint32_t>(K & 0xFFFFFFFF);
166
167 auto x1 = v.shr<32>();
168 auto x2 = v & SIMD_4x64::splat(0xFFFFFFFF);
169
170 x1 ^= (x2 | SIMD_4x64::splat(k2));
171 x2 ^= rotl32_1(x1 & SIMD_4x64::splat(k1));
172
173 return x1.shl<32>() | x2;
174}
175
176// Helpers for 6 round iterations
177
178BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX2_GFNI void six_e_rounds(SIMD_4x64& L, SIMD_4x64& R, std::span<const uint64_t> SK) {
179 R ^= camellia_f(L ^ SIMD_4x64::splat(SK[0]));
180 L ^= camellia_f(R ^ SIMD_4x64::splat(SK[1]));
181 R ^= camellia_f(L ^ SIMD_4x64::splat(SK[2]));
182 L ^= camellia_f(R ^ SIMD_4x64::splat(SK[3]));
183 R ^= camellia_f(L ^ SIMD_4x64::splat(SK[4]));
184 L ^= camellia_f(R ^ SIMD_4x64::splat(SK[5]));
185}
186
187BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX2_GFNI void six_d_rounds(SIMD_4x64& L, SIMD_4x64& R, std::span<const uint64_t> SK) {
188 R ^= camellia_f(L ^ SIMD_4x64::splat(SK[5]));
189 L ^= camellia_f(R ^ SIMD_4x64::splat(SK[4]));
190 R ^= camellia_f(L ^ SIMD_4x64::splat(SK[3]));
191 L ^= camellia_f(R ^ SIMD_4x64::splat(SK[2]));
192 R ^= camellia_f(L ^ SIMD_4x64::splat(SK[1]));
193 L ^= camellia_f(R ^ SIMD_4x64::splat(SK[0]));
194}
195
196BOTAN_FN_ISA_AVX2_GFNI
197void camellia_encrypt_x4_18r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
198 SIMD_4x64 L;
199 SIMD_4x64 R;
200 load_and_deinterleave(in, L, R);
201
202 L ^= SIMD_4x64::splat(SK[0]);
203 R ^= SIMD_4x64::splat(SK[1]);
204
205 six_e_rounds(L, R, SK.subspan(2));
206
207 L = FL_4(L, SK[8]);
208 R = FLINV_4(R, SK[9]);
209
210 six_e_rounds(L, R, SK.subspan(10));
211
212 L = FL_4(L, SK[16]);
213 R = FLINV_4(R, SK[17]);
214
215 six_e_rounds(L, R, SK.subspan(18));
216
217 R ^= SIMD_4x64::splat(SK[24]);
218 L ^= SIMD_4x64::splat(SK[25]);
219
220 interleave_and_store(out, L, R);
221}
222
223BOTAN_FN_ISA_AVX2_GFNI
224void camellia_decrypt_x4_18r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
225 SIMD_4x64 L;
226 SIMD_4x64 R;
227 load_and_deinterleave(in, L, R);
228
229 R ^= SIMD_4x64::splat(SK[25]);
230 L ^= SIMD_4x64::splat(SK[24]);
231
232 six_d_rounds(L, R, SK.subspan(18));
233
234 L = FL_4(L, SK[17]);
235 R = FLINV_4(R, SK[16]);
236
237 six_d_rounds(L, R, SK.subspan(10));
238
239 L = FL_4(L, SK[9]);
240 R = FLINV_4(R, SK[8]);
241
242 six_d_rounds(L, R, SK.subspan(2));
243
244 L ^= SIMD_4x64::splat(SK[1]);
245 R ^= SIMD_4x64::splat(SK[0]);
246
247 interleave_and_store(out, L, R);
248}
249
250BOTAN_FN_ISA_AVX2_GFNI
251void camellia_encrypt_x4_24r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
252 SIMD_4x64 L;
253 SIMD_4x64 R;
254 load_and_deinterleave(in, L, R);
255
256 L ^= SIMD_4x64::splat(SK[0]);
257 R ^= SIMD_4x64::splat(SK[1]);
258
259 six_e_rounds(L, R, SK.subspan(2));
260
261 L = FL_4(L, SK[8]);
262 R = FLINV_4(R, SK[9]);
263
264 six_e_rounds(L, R, SK.subspan(10));
265
266 L = FL_4(L, SK[16]);
267 R = FLINV_4(R, SK[17]);
268
269 six_e_rounds(L, R, SK.subspan(18));
270
271 L = FL_4(L, SK[24]);
272 R = FLINV_4(R, SK[25]);
273
274 six_e_rounds(L, R, SK.subspan(26));
275
276 R ^= SIMD_4x64::splat(SK[32]);
277 L ^= SIMD_4x64::splat(SK[33]);
278
279 interleave_and_store(out, L, R);
280}
281
282BOTAN_FN_ISA_AVX2_GFNI
283void camellia_decrypt_x4_24r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
284 SIMD_4x64 L;
285 SIMD_4x64 R;
286 load_and_deinterleave(in, L, R);
287
288 R ^= SIMD_4x64::splat(SK[33]);
289 L ^= SIMD_4x64::splat(SK[32]);
290
291 six_d_rounds(L, R, SK.subspan(26));
292
293 L = FL_4(L, SK[25]);
294 R = FLINV_4(R, SK[24]);
295
296 six_d_rounds(L, R, SK.subspan(18));
297
298 L = FL_4(L, SK[17]);
299 R = FLINV_4(R, SK[16]);
300
301 six_d_rounds(L, R, SK.subspan(10));
302
303 L = FL_4(L, SK[9]);
304 R = FLINV_4(R, SK[8]);
305
306 six_d_rounds(L, R, SK.subspan(2));
307
308 L ^= SIMD_4x64::splat(SK[1]);
309 R ^= SIMD_4x64::splat(SK[0]);
310
311 interleave_and_store(out, L, R);
312}
313
314} // namespace
315
316} // namespace Camellia_AVX2_GFNI
317
318// static
319void BOTAN_FN_ISA_AVX2_GFNI Camellia_128::avx2_gfni_encrypt(const uint8_t in[],
320 uint8_t out[],
321 size_t blocks,
322 std::span<const uint64_t> SK) {
323 while(blocks >= 4) {
324 Camellia_AVX2_GFNI::camellia_encrypt_x4_18r(in, out, SK);
325 in += 4 * 16;
326 out += 4 * 16;
327 blocks -= 4;
328 }
329
330 if(blocks > 0) {
331 uint8_t ibuf[4 * 16] = {0};
332 uint8_t obuf[4 * 16] = {0};
333 copy_mem(ibuf, in, blocks * 16);
334 Camellia_AVX2_GFNI::camellia_encrypt_x4_18r(ibuf, obuf, SK);
335 copy_mem(out, obuf, blocks * 16);
336 }
337}
338
339// static
340void BOTAN_FN_ISA_AVX2_GFNI Camellia_128::avx2_gfni_decrypt(const uint8_t in[],
341 uint8_t out[],
342 size_t blocks,
343 std::span<const uint64_t> SK) {
344 while(blocks >= 4) {
345 Camellia_AVX2_GFNI::camellia_decrypt_x4_18r(in, out, SK);
346 in += 4 * 16;
347 out += 4 * 16;
348 blocks -= 4;
349 }
350
351 if(blocks > 0) {
352 uint8_t ibuf[4 * 16] = {0};
353 uint8_t obuf[4 * 16] = {0};
354 copy_mem(ibuf, in, blocks * 16);
355 Camellia_AVX2_GFNI::camellia_decrypt_x4_18r(ibuf, obuf, SK);
356 copy_mem(out, obuf, blocks * 16);
357 }
358}
359
360// static
361void BOTAN_FN_ISA_AVX2_GFNI Camellia_192::avx2_gfni_encrypt(const uint8_t in[],
362 uint8_t out[],
363 size_t blocks,
364 std::span<const uint64_t> SK) {
365 while(blocks >= 4) {
366 Camellia_AVX2_GFNI::camellia_encrypt_x4_24r(in, out, SK);
367 in += 4 * 16;
368 out += 4 * 16;
369 blocks -= 4;
370 }
371
372 if(blocks > 0) {
373 uint8_t ibuf[4 * 16] = {0};
374 uint8_t obuf[4 * 16] = {0};
375 copy_mem(ibuf, in, blocks * 16);
376 Camellia_AVX2_GFNI::camellia_encrypt_x4_24r(ibuf, obuf, SK);
377 copy_mem(out, obuf, blocks * 16);
378 }
379}
380
381// static
382void BOTAN_FN_ISA_AVX2_GFNI Camellia_192::avx2_gfni_decrypt(const uint8_t in[],
383 uint8_t out[],
384 size_t blocks,
385 std::span<const uint64_t> SK) {
386 while(blocks >= 4) {
387 Camellia_AVX2_GFNI::camellia_decrypt_x4_24r(in, out, SK);
388 in += 4 * 16;
389 out += 4 * 16;
390 blocks -= 4;
391 }
392
393 if(blocks > 0) {
394 uint8_t ibuf[4 * 16] = {0};
395 uint8_t obuf[4 * 16] = {0};
396 copy_mem(ibuf, in, blocks * 16);
397 Camellia_AVX2_GFNI::camellia_decrypt_x4_24r(ibuf, obuf, SK);
398 copy_mem(out, obuf, blocks * 16);
399 }
400}
401
402// static
403void BOTAN_FN_ISA_AVX2_GFNI Camellia_256::avx2_gfni_encrypt(const uint8_t in[],
404 uint8_t out[],
405 size_t blocks,
406 std::span<const uint64_t> SK) {
407 while(blocks >= 4) {
408 Camellia_AVX2_GFNI::camellia_encrypt_x4_24r(in, out, SK);
409 in += 4 * 16;
410 out += 4 * 16;
411 blocks -= 4;
412 }
413
414 if(blocks > 0) {
415 uint8_t ibuf[4 * 16] = {0};
416 uint8_t obuf[4 * 16] = {0};
417 copy_mem(ibuf, in, blocks * 16);
418 Camellia_AVX2_GFNI::camellia_encrypt_x4_24r(ibuf, obuf, SK);
419 copy_mem(out, obuf, blocks * 16);
420 }
421}
422
423// static
424void BOTAN_FN_ISA_AVX2_GFNI Camellia_256::avx2_gfni_decrypt(const uint8_t in[],
425 uint8_t out[],
426 size_t blocks,
427 std::span<const uint64_t> SK) {
428 while(blocks >= 4) {
429 Camellia_AVX2_GFNI::camellia_decrypt_x4_24r(in, out, SK);
430 in += 4 * 16;
431 out += 4 * 16;
432 blocks -= 4;
433 }
434
435 if(blocks > 0) {
436 uint8_t ibuf[4 * 16] = {0};
437 uint8_t obuf[4 * 16] = {0};
438 copy_mem(ibuf, in, blocks * 16);
439 Camellia_AVX2_GFNI::camellia_decrypt_x4_24r(ibuf, obuf, SK);
440 copy_mem(out, obuf, blocks * 16);
441 }
442}
443
444} // namespace Botan
static BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 load_be(const void *in)
Definition simd_4x64.h:50
__m256i BOTAN_FN_ISA_SIMD_4X64 raw() const noexcept
Definition simd_4x64.h:194
SIMD_4x64 BOTAN_FN_ISA_SIMD_4X64 shr() const noexcept
Definition simd_4x64.h:153
static BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 splat(uint64_t v)
Definition simd_4x64.h:192
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
constexpr void copy_mem(T *out, const T *in, size_t n)
Definition mem_ops.h:144
uint32_t P1(uint32_t X)
Definition sm3_fn.h:65
consteval uint64_t gfni_matrix(std::string_view s)