Botan 3.11.1
Crypto and TLS for C++
camellia_hwaes.cpp
Go to the documentation of this file.
1/*
2* (C) 2026 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/camellia.h>
8
9#include <botan/mem_ops.h>
10#include <botan/internal/simd_hwaes.h>
11
12namespace Botan {
13
14namespace Camellia_HWAES {
15
16namespace {
17
18/* Helpers for 64-bit operations on SIMD_4x32 */
19
20BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 load_be64(const uint8_t* in) {
21 const auto bswap64 = SIMD_4x32(0x04050607, 0x00010203, 0x0C0D0E0F, 0x08090A0B);
23}
24
25BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void store_be64(uint8_t* out, SIMD_4x32 v) {
26 const auto bswap64 = SIMD_4x32(0x04050607, 0x00010203, 0x0C0D0E0F, 0x08090A0B);
27 SIMD_4x32::byte_shuffle(v, bswap64).store_le(out);
28}
29
30BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 splat64(uint64_t v) {
31 const uint32_t lo = static_cast<uint32_t>(v);
32 const uint32_t hi = static_cast<uint32_t>(v >> 32);
33 return SIMD_4x32(lo, hi, lo, hi);
34}
35
36/* The Camellia round function */
37BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 camellia_f(SIMD_4x32 x) {
38 // Pre-affine shared by S1/S2/S3
39 constexpr uint64_t pre123_a = gfni_matrix(R"(
40 1 1 1 0 1 1 0 1
41 0 0 1 1 0 0 1 0
42 1 1 0 1 0 0 0 0
43 1 0 1 1 0 0 1 1
44 0 0 0 0 1 1 0 0
45 1 0 1 0 0 1 0 0
46 0 0 1 0 1 1 0 0
47 1 0 0 0 0 1 1 0)");
48
49 // Pre-affine for S4
50 constexpr uint64_t pre4_a = gfni_matrix(R"(
51 1 1 0 1 1 0 1 1
52 0 1 1 0 0 1 0 0
53 1 0 1 0 0 0 0 1
54 0 1 1 0 0 1 1 1
55 0 0 0 1 1 0 0 0
56 0 1 0 0 1 0 0 1
57 0 1 0 1 1 0 0 0
58 0 0 0 0 1 1 0 1)");
59
60 constexpr uint8_t pre_c = 0x45;
61
62 // Post-affine for S1 and S4
63 constexpr uint64_t post14_a = gfni_matrix(R"(
64 0 0 0 0 0 0 0 1
65 0 1 1 0 0 1 1 0
66 1 0 1 1 1 1 1 0
67 0 0 0 1 1 0 1 1
68 1 0 0 0 1 1 1 0
69 0 1 0 1 1 1 1 0
70 0 1 1 1 1 1 1 1
71 0 0 0 1 1 1 0 0)");
72 constexpr uint8_t post14_c = 0x6E;
73
74 // Post-affine for S2
75 constexpr uint64_t post2_a = gfni_matrix(R"(
76 0 0 0 1 1 1 0 0
77 0 0 0 0 0 0 0 1
78 0 1 1 0 0 1 1 0
79 1 0 1 1 1 1 1 0
80 0 0 0 1 1 0 1 1
81 1 0 0 0 1 1 1 0
82 0 1 0 1 1 1 1 0
83 0 1 1 1 1 1 1 1)");
84 constexpr uint8_t post2_c = 0xDC;
85
86 // Post-affine for S3
87 constexpr uint64_t post3_a = gfni_matrix(R"(
88 0 1 1 0 0 1 1 0
89 1 0 1 1 1 1 1 0
90 0 0 0 1 1 0 1 1
91 1 0 0 0 1 1 1 0
92 0 1 0 1 1 1 1 0
93 0 1 1 1 1 1 1 1
94 0 0 0 1 1 1 0 0
95 0 0 0 0 0 0 0 1)");
96 constexpr uint8_t post3_c = 0x37;
97
98 constexpr auto PRE123 = Gf2AffineTransformation(pre123_a, pre_c);
99 constexpr auto PRE4 = Gf2AffineTransformation(pre4_a, pre_c);
100 constexpr auto POST14 = Gf2AffineTransformation::post_sbox(post14_a, post14_c);
101 constexpr auto POST2 = Gf2AffineTransformation::post_sbox(post2_a, post2_c);
102 constexpr auto POST3 = Gf2AffineTransformation::post_sbox(post3_a, post3_c);
103
104 const auto mask_s2 = SIMD_4x32(0xFF000000, 0x00FF0000, 0xFF000000, 0x00FF0000);
105 const auto mask_s3 = SIMD_4x32(0x00FF0000, 0x0000FF00, 0x00FF0000, 0x0000FF00);
106 const auto mask_s4 = SIMD_4x32(0x0000FF00, 0x000000FF, 0x0000FF00, 0x000000FF);
107
108 const auto pre123 = PRE123.affine_transform(x);
109 const auto pre4 = PRE4.affine_transform(x);
110
111 const auto sub = hw_aes_sbox(SIMD_4x32::byte_blend(mask_s4, pre4, pre123));
112
113 const auto s14 = POST14.affine_transform(sub);
114 const auto s2 = POST2.affine_transform(sub);
115 const auto s3 = POST3.affine_transform(sub);
116
117 // Final merged Sbox output for all bytes
118 const auto sbox = SIMD_4x32::byte_blend(mask_s3, s3, SIMD_4x32::byte_blend(mask_s2, s2, s14));
119
120 // The linear mixing step
121 const auto P1 = SIMD_4x32(0x00000001, 0x00000001, 0x08080809, 0x08080809);
122 const auto P2 = SIMD_4x32(0x01010202, 0x01010202, 0x09090A0A, 0x09090A0A);
123 const auto P3 = SIMD_4x32(0x02030303, 0x02030303, 0x0A0B0B0B, 0x0A0B0B0B);
124 const auto P4 = SIMD_4x32(0x06050404, 0x04040504, 0x0E0D0C0C, 0x0C0C0D0C);
125 const auto P5 = SIMD_4x32(0x07060507, 0x05060605, 0x0F0E0D0F, 0x0D0E0E0D);
126 const auto P6 = SIMD_4x32(0xFFFFFFFF, 0x07070706, 0xFFFFFFFF, 0x0F0F0F0E);
127
128 const auto sxp1 = SIMD_4x32::byte_shuffle(sbox, P1);
129 const auto sxp2 = SIMD_4x32::byte_shuffle(sbox, P2);
130 const auto sxp3 = SIMD_4x32::byte_shuffle(sbox, P3);
131 const auto sxp4 = SIMD_4x32::byte_shuffle(sbox, P4);
132 const auto sxp5 = SIMD_4x32::byte_shuffle(sbox, P5);
133 const auto sxp6 = SIMD_4x32::byte_shuffle(sbox, P6);
134
135 return (sxp1 ^ sxp2 ^ sxp3 ^ sxp4 ^ sxp5 ^ sxp6);
136}
137
138/*
139* FL and FL-inverse operate on 32-bit sub-halves within each 64-bit element.
140* We use byte_shuffle to broadcast each 32-bit half, then recombine with byte_blend.
141*/
142BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 FL_2(SIMD_4x32 v, uint64_t K) {
143 const uint32_t k1 = static_cast<uint32_t>(K >> 32);
144 const uint32_t k2 = static_cast<uint32_t>(K);
145
146 // Broadcast upper/lower 32-bit halves of each 64-bit element
147 const auto shuf_hi = SIMD_4x32(0x07060504, 0x07060504, 0x0F0E0D0C, 0x0F0E0D0C);
148 const auto shuf_lo = SIMD_4x32(0x03020100, 0x03020100, 0x0B0A0908, 0x0B0A0908);
149
150 auto x1 = SIMD_4x32::byte_shuffle(v, shuf_hi);
151 auto x2 = SIMD_4x32::byte_shuffle(v, shuf_lo);
152
153 x2 ^= (x1 & SIMD_4x32::splat(k1)).rotl<1>();
154 x1 ^= x2 | SIMD_4x32::splat(k2);
155
156 // Recombine: lo from x2, hi from x1
157 const auto mask_hi = SIMD_4x32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF);
158 return SIMD_4x32::byte_blend(mask_hi, x1, x2);
159}
160
161BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 FLINV_2(SIMD_4x32 v, uint64_t K) {
162 const uint32_t k1 = static_cast<uint32_t>(K >> 32);
163 const uint32_t k2 = static_cast<uint32_t>(K);
164
165 const auto shuf_hi = SIMD_4x32(0x07060504, 0x07060504, 0x0F0E0D0C, 0x0F0E0D0C);
166 const auto shuf_lo = SIMD_4x32(0x03020100, 0x03020100, 0x0B0A0908, 0x0B0A0908);
167
168 auto x1 = SIMD_4x32::byte_shuffle(v, shuf_hi);
169 auto x2 = SIMD_4x32::byte_shuffle(v, shuf_lo);
170
171 x1 ^= x2 | SIMD_4x32::splat(k2);
172 x2 ^= (x1 & SIMD_4x32::splat(k1)).rotl<1>();
173
174 const auto mask_hi = SIMD_4x32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF);
175 return SIMD_4x32::byte_blend(mask_hi, x1, x2);
176}
177
178BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void load_and_deinterleave(const uint8_t in[], SIMD_4x32& L, SIMD_4x32& R) {
179 auto A = load_be64(in); // block 0: [L0, R0]
180 auto B = load_be64(in + 16); // block 1: [L1, R1]
181 const auto mask_upper = SIMD_4x32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF);
182 L = SIMD_4x32::byte_blend(mask_upper, B.swap_halves(), A); // [L0, L1]
183 R = SIMD_4x32::byte_blend(mask_upper, B, A.swap_halves()); // [R0, R1]
184}
185
186BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void interleave_and_store(uint8_t out[], SIMD_4x32 L, SIMD_4x32 R) {
187 // Camellia output swaps L and R
188 const auto mask_upper = SIMD_4x32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF);
189 auto A = SIMD_4x32::byte_blend(mask_upper, L.swap_halves(), R); // [R0, L0]
190 auto B = SIMD_4x32::byte_blend(mask_upper, L, R.swap_halves()); // [R1, L1]
191 store_be64(out, A);
192 store_be64(out + 16, B);
193}
194
195BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void six_e_rounds(SIMD_4x32& L, SIMD_4x32& R, const uint64_t SK[]) {
196 R ^= camellia_f(L ^ splat64(SK[0]));
197 L ^= camellia_f(R ^ splat64(SK[1]));
198 R ^= camellia_f(L ^ splat64(SK[2]));
199 L ^= camellia_f(R ^ splat64(SK[3]));
200 R ^= camellia_f(L ^ splat64(SK[4]));
201 L ^= camellia_f(R ^ splat64(SK[5]));
202}
203
204BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void six_d_rounds(SIMD_4x32& L, SIMD_4x32& R, const uint64_t SK[]) {
205 R ^= camellia_f(L ^ splat64(SK[5]));
206 L ^= camellia_f(R ^ splat64(SK[4]));
207 R ^= camellia_f(L ^ splat64(SK[3]));
208 L ^= camellia_f(R ^ splat64(SK[2]));
209 R ^= camellia_f(L ^ splat64(SK[1]));
210 L ^= camellia_f(R ^ splat64(SK[0]));
211}
212
213BOTAN_FN_ISA_HWAES void camellia_encrypt_x2_18r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
214 SIMD_4x32 L;
215 SIMD_4x32 R;
216 load_and_deinterleave(in, L, R);
217
218 L ^= splat64(SK[0]);
219 R ^= splat64(SK[1]);
220
221 six_e_rounds(L, R, &SK[2]);
222 L = FL_2(L, SK[8]);
223 R = FLINV_2(R, SK[9]);
224 six_e_rounds(L, R, &SK[10]);
225 L = FL_2(L, SK[16]);
226 R = FLINV_2(R, SK[17]);
227 six_e_rounds(L, R, &SK[18]);
228
229 R ^= splat64(SK[24]);
230 L ^= splat64(SK[25]);
231
232 interleave_and_store(out, L, R);
233}
234
235BOTAN_FN_ISA_HWAES void camellia_decrypt_x2_18r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
236 SIMD_4x32 L;
237 SIMD_4x32 R;
238 load_and_deinterleave(in, L, R);
239
240 R ^= splat64(SK[25]);
241 L ^= splat64(SK[24]);
242
243 six_d_rounds(L, R, &SK[18]);
244 L = FL_2(L, SK[17]);
245 R = FLINV_2(R, SK[16]);
246 six_d_rounds(L, R, &SK[10]);
247 L = FL_2(L, SK[9]);
248 R = FLINV_2(R, SK[8]);
249 six_d_rounds(L, R, &SK[2]);
250
251 L ^= splat64(SK[1]);
252 R ^= splat64(SK[0]);
253
254 interleave_and_store(out, L, R);
255}
256
257BOTAN_FN_ISA_HWAES void camellia_encrypt_x2_24r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
258 SIMD_4x32 L;
259 SIMD_4x32 R;
260 load_and_deinterleave(in, L, R);
261
262 L ^= splat64(SK[0]);
263 R ^= splat64(SK[1]);
264
265 six_e_rounds(L, R, &SK[2]);
266 L = FL_2(L, SK[8]);
267 R = FLINV_2(R, SK[9]);
268 six_e_rounds(L, R, &SK[10]);
269 L = FL_2(L, SK[16]);
270 R = FLINV_2(R, SK[17]);
271 six_e_rounds(L, R, &SK[18]);
272 L = FL_2(L, SK[24]);
273 R = FLINV_2(R, SK[25]);
274 six_e_rounds(L, R, &SK[26]);
275
276 R ^= splat64(SK[32]);
277 L ^= splat64(SK[33]);
278
279 interleave_and_store(out, L, R);
280}
281
282BOTAN_FN_ISA_HWAES void camellia_decrypt_x2_24r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
283 SIMD_4x32 L;
284 SIMD_4x32 R;
285 load_and_deinterleave(in, L, R);
286
287 R ^= splat64(SK[33]);
288 L ^= splat64(SK[32]);
289
290 six_d_rounds(L, R, &SK[26]);
291 L = FL_2(L, SK[25]);
292 R = FLINV_2(R, SK[24]);
293 six_d_rounds(L, R, &SK[18]);
294 L = FL_2(L, SK[17]);
295 R = FLINV_2(R, SK[16]);
296 six_d_rounds(L, R, &SK[10]);
297 L = FL_2(L, SK[9]);
298 R = FLINV_2(R, SK[8]);
299 six_d_rounds(L, R, &SK[2]);
300
301 L ^= splat64(SK[1]);
302 R ^= splat64(SK[0]);
303
304 interleave_and_store(out, L, R);
305}
306
307} // namespace
308
309} // namespace Camellia_HWAES
310
311// static
312void BOTAN_FN_ISA_HWAES Camellia_128::hwaes_encrypt(const uint8_t in[],
313 uint8_t out[],
314 size_t blocks,
315 std::span<const uint64_t> SK) {
316 while(blocks >= 2) {
317 Camellia_HWAES::camellia_encrypt_x2_18r(in, out, SK);
318 in += 2 * 16;
319 out += 2 * 16;
320 blocks -= 2;
321 }
322
323 if(blocks > 0) {
324 uint8_t ibuf[2 * 16] = {0};
325 uint8_t obuf[2 * 16] = {0};
326 copy_mem(ibuf, in, 16);
327 Camellia_HWAES::camellia_encrypt_x2_18r(ibuf, obuf, SK);
328 copy_mem(out, obuf, 16);
329 }
330}
331
332// static
333void BOTAN_FN_ISA_HWAES Camellia_128::hwaes_decrypt(const uint8_t in[],
334 uint8_t out[],
335 size_t blocks,
336 std::span<const uint64_t> SK) {
337 while(blocks >= 2) {
338 Camellia_HWAES::camellia_decrypt_x2_18r(in, out, SK);
339 in += 2 * 16;
340 out += 2 * 16;
341 blocks -= 2;
342 }
343
344 if(blocks > 0) {
345 uint8_t ibuf[2 * 16] = {0};
346 uint8_t obuf[2 * 16] = {0};
347 copy_mem(ibuf, in, 16);
348 Camellia_HWAES::camellia_decrypt_x2_18r(ibuf, obuf, SK);
349 copy_mem(out, obuf, 16);
350 }
351}
352
353// static
354void BOTAN_FN_ISA_HWAES Camellia_192::hwaes_encrypt(const uint8_t in[],
355 uint8_t out[],
356 size_t blocks,
357 std::span<const uint64_t> SK) {
358 while(blocks >= 2) {
359 Camellia_HWAES::camellia_encrypt_x2_24r(in, out, SK);
360 in += 2 * 16;
361 out += 2 * 16;
362 blocks -= 2;
363 }
364
365 if(blocks > 0) {
366 uint8_t ibuf[2 * 16] = {0};
367 uint8_t obuf[2 * 16] = {0};
368 copy_mem(ibuf, in, 16);
369 Camellia_HWAES::camellia_encrypt_x2_24r(ibuf, obuf, SK);
370 copy_mem(out, obuf, 16);
371 }
372}
373
374// static
375void BOTAN_FN_ISA_HWAES Camellia_192::hwaes_decrypt(const uint8_t in[],
376 uint8_t out[],
377 size_t blocks,
378 std::span<const uint64_t> SK) {
379 while(blocks >= 2) {
380 Camellia_HWAES::camellia_decrypt_x2_24r(in, out, SK);
381 in += 2 * 16;
382 out += 2 * 16;
383 blocks -= 2;
384 }
385
386 if(blocks > 0) {
387 uint8_t ibuf[2 * 16] = {0};
388 uint8_t obuf[2 * 16] = {0};
389 copy_mem(ibuf, in, 16);
390 Camellia_HWAES::camellia_decrypt_x2_24r(ibuf, obuf, SK);
391 copy_mem(out, obuf, 16);
392 }
393}
394
395// static
396void BOTAN_FN_ISA_HWAES Camellia_256::hwaes_encrypt(const uint8_t in[],
397 uint8_t out[],
398 size_t blocks,
399 std::span<const uint64_t> SK) {
400 while(blocks >= 2) {
401 Camellia_HWAES::camellia_encrypt_x2_24r(in, out, SK);
402 in += 2 * 16;
403 out += 2 * 16;
404 blocks -= 2;
405 }
406
407 if(blocks > 0) {
408 uint8_t ibuf[2 * 16] = {0};
409 uint8_t obuf[2 * 16] = {0};
410 copy_mem(ibuf, in, 16);
411 Camellia_HWAES::camellia_encrypt_x2_24r(ibuf, obuf, SK);
412 copy_mem(out, obuf, 16);
413 }
414}
415
416// static
417void BOTAN_FN_ISA_HWAES Camellia_256::hwaes_decrypt(const uint8_t in[],
418 uint8_t out[],
419 size_t blocks,
420 std::span<const uint64_t> SK) {
421 while(blocks >= 2) {
422 Camellia_HWAES::camellia_decrypt_x2_24r(in, out, SK);
423 in += 2 * 16;
424 out += 2 * 16;
425 blocks -= 2;
426 }
427
428 if(blocks > 0) {
429 uint8_t ibuf[2 * 16] = {0};
430 uint8_t obuf[2 * 16] = {0};
431 copy_mem(ibuf, in, 16);
432 Camellia_HWAES::camellia_decrypt_x2_24r(ibuf, obuf, SK);
433 copy_mem(out, obuf, 16);
434 }
435}
436
437} // namespace Botan
static consteval Gf2AffineTransformation post_sbox(uint64_t M, uint8_t c)
Definition simd_hwaes.h:139
void BOTAN_FN_ISA_SIMD_4X32 store_le(uint32_t out[4]) const noexcept
Definition simd_4x32.h:219
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 load_le(const void *in) noexcept
Definition simd_4x32.h:162
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 byte_shuffle(const SIMD_4x32 &tbl, const SIMD_4x32 &idx)
Definition simd_4x32.h:803
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 byte_blend(const SIMD_4x32 &mask, const SIMD_4x32 &a, const SIMD_4x32 &b) noexcept
Definition simd_4x32.h:772
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 splat(uint32_t B) noexcept
Definition simd_4x32.h:127
SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 swap_halves() const
Definition simd_4x32.h:934
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
constexpr void copy_mem(T *out, const T *in, size_t n)
Definition mem_ops.h:144
uint32_t P1(uint32_t X)
Definition sm3_fn.h:65
consteval uint64_t gfni_matrix(std::string_view s)
Definition gfni_utils.h:17
BOTAN_FORCE_INLINE constexpr T rotl(T input)
Definition rotate.h:23
SIMD_4x32 BOTAN_FN_ISA_HWAES hw_aes_sbox(SIMD_4x32 x)
Definition simd_hwaes.h:19