Botan 3.9.0
Crypto and TLS for C++
aes.cpp
1/*
2* (C) 1999-2010,2015,2017,2018,2020 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/aes.h>
8
9#include <botan/internal/bit_ops.h>
10#include <botan/internal/bswap.h>
11#include <botan/internal/ct_utils.h>
12#include <botan/internal/loadstor.h>
13#include <botan/internal/rotate.h>
14
15#if defined(BOTAN_HAS_CPUID)
16 #include <botan/internal/cpuid.h>
17#endif
18
19#if defined(BOTAN_HAS_AES_POWER8) || defined(BOTAN_HAS_AES_ARMV8) || defined(BOTAN_HAS_AES_NI)
20 #define BOTAN_HAS_HW_AES_SUPPORT
21#endif
22
23#if defined(BOTAN_HAS_HW_AES_SUPPORT)
24 #include <bit>
25#endif
26
27namespace Botan {
28
29/*
30* One of three AES implementation strategies is used to get a constant time
31* implementation which is immune to common cache/timing based side channels:
32*
33* - If AES hardware support is available (AES-NI, POWER8, Aarch64) use that
34*
35* - If 128-bit SIMD with byte shuffles is available (SSSE3, NEON, or Altivec),
36* use the vperm technique published by Mike Hamburg at CHES 2009.
37*
38* - If no hardware or SIMD support, fall back to a constant time bitsliced
39* implementation. This uses 32-bit words resulting in 2 blocks being processed
40* in parallel. Moving to 4 blocks (with 64-bit words) would approximately
41* double performance on 64-bit CPUs. Likewise moving to 128 bit SIMD would
42* again approximately double performance vs 64-bit. However the assumption is
43* that most 64-bit CPUs either have hardware AES or SIMD shuffle support and
44* that the majority of users falling back to this code will be 32-bit cores.
45* If this assumption proves to be unsound, the bitsliced code can easily be
46* extended to operate on either 32 or 64 bit words depending on the native
47* wordsize of the target processor.
48*
49* Useful references
50*
51* - "Accelerating AES with Vector Permute Instructions" Mike Hamburg
52* https://www.shiftleft.org/papers/vector_aes/vector_aes.pdf
53*
54* - "Faster and Timing-Attack Resistant AES-GCM" Käsper and Schwabe
55* https://eprint.iacr.org/2009/129.pdf
56*
57* - "A new combinational logic minimization technique with applications to cryptology."
58* Boyar and Peralta https://eprint.iacr.org/2009/191.pdf
59*
60* - "A depth-16 circuit for the AES S-box" Boyar and Peralta
61* https://eprint.iacr.org/2011/332.pdf
62*
63* - "A Very Compact S-box for AES" Canright
64* https://www.iacr.org/archive/ches2005/032.pdf
65* https://core.ac.uk/download/pdf/36694529.pdf (extended)
66*/
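
As a usage sketch (not part of aes.cpp), the standalone program below shows how a caller can observe which of the strategies above gets selected at runtime via the public BlockCipher interface; create_or_throw(), provider(), set_key() and encrypt() are the public API, while the key and block contents are arbitrary placeholder values.

#include <botan/block_cipher.h>

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
   auto aes = Botan::BlockCipher::create_or_throw("AES-128");

   // Reports "base" for the bitsliced fallback implemented in this file;
   // hardware or vperm builds report their CPUID feature name instead.
   std::cout << "AES-128 provider: " << aes->provider() << "\n";

   const std::vector<uint8_t> key(16, 0xAB);  // placeholder 128-bit key
   std::vector<uint8_t> block(16, 0x00);      // one 16-byte block, encrypted in place

   aes->set_key(key);
   aes->encrypt(block.data());

   return 0;
}
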
67
68namespace {
69
70/*
71This is an AES sbox circuit which can execute in bitsliced mode up to 32x in
72parallel.
73
74The circuit is from the "Circuit Minimization Team" group
75http://www.cs.yale.edu/homes/peralta/CircuitStuff/CMT.html
76http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt
77
78This circuit has size 113 and depth 27. In software it is much faster than
79circuits which are considered faster for hardware purposes (where circuit depth
80is the critical constraint), because unlike in hardware, on common CPUs we can
81only execute - at best - 3 or 4 logic operations per cycle. So a smaller circuit
82is superior. On an x86-64 machine this circuit is about 15% faster than the
83circuit of size 128 and depth 16 given in "A depth-16 circuit for the AES S-box".
84
85Another circuit for the AES Sbox of size 102 and depth 24 is described in "New
86Circuit Minimization Techniques for Smaller and Faster AES SBoxes"
87[https://eprint.iacr.org/2019/802]. However it relies on "non-standard" gates
88like MUX, NOR, NAND, etc., and so in practice in bitsliced software its size is
89actually a bit larger than this circuit, as few CPUs have such instructions and
90they must instead be emulated using a sequence of available bit operations.
91*/
92void AES_SBOX(uint32_t V[8]) {
93 const uint32_t U0 = V[0];
94 const uint32_t U1 = V[1];
95 const uint32_t U2 = V[2];
96 const uint32_t U3 = V[3];
97 const uint32_t U4 = V[4];
98 const uint32_t U5 = V[5];
99 const uint32_t U6 = V[6];
100 const uint32_t U7 = V[7];
101
102 const uint32_t y14 = U3 ^ U5;
103 const uint32_t y13 = U0 ^ U6;
104 const uint32_t y9 = U0 ^ U3;
105 const uint32_t y8 = U0 ^ U5;
106 const uint32_t t0 = U1 ^ U2;
107 const uint32_t y1 = t0 ^ U7;
108 const uint32_t y4 = y1 ^ U3;
109 const uint32_t y12 = y13 ^ y14;
110 const uint32_t y2 = y1 ^ U0;
111 const uint32_t y5 = y1 ^ U6;
112 const uint32_t y3 = y5 ^ y8;
113 const uint32_t t1 = U4 ^ y12;
114 const uint32_t y15 = t1 ^ U5;
115 const uint32_t y20 = t1 ^ U1;
116 const uint32_t y6 = y15 ^ U7;
117 const uint32_t y10 = y15 ^ t0;
118 const uint32_t y11 = y20 ^ y9;
119 const uint32_t y7 = U7 ^ y11;
120 const uint32_t y17 = y10 ^ y11;
121 const uint32_t y19 = y10 ^ y8;
122 const uint32_t y16 = t0 ^ y11;
123 const uint32_t y21 = y13 ^ y16;
124 const uint32_t y18 = U0 ^ y16;
125 const uint32_t t2 = y12 & y15;
126 const uint32_t t3 = y3 & y6;
127 const uint32_t t4 = t3 ^ t2;
128 const uint32_t t5 = y4 & U7;
129 const uint32_t t6 = t5 ^ t2;
130 const uint32_t t7 = y13 & y16;
131 const uint32_t t8 = y5 & y1;
132 const uint32_t t9 = t8 ^ t7;
133 const uint32_t t10 = y2 & y7;
134 const uint32_t t11 = t10 ^ t7;
135 const uint32_t t12 = y9 & y11;
136 const uint32_t t13 = y14 & y17;
137 const uint32_t t14 = t13 ^ t12;
138 const uint32_t t15 = y8 & y10;
139 const uint32_t t16 = t15 ^ t12;
140 const uint32_t t17 = t4 ^ y20;
141 const uint32_t t18 = t6 ^ t16;
142 const uint32_t t19 = t9 ^ t14;
143 const uint32_t t20 = t11 ^ t16;
144 const uint32_t t21 = t17 ^ t14;
145 const uint32_t t22 = t18 ^ y19;
146 const uint32_t t23 = t19 ^ y21;
147 const uint32_t t24 = t20 ^ y18;
148 const uint32_t t25 = t21 ^ t22;
149 const uint32_t t26 = t21 & t23;
150 const uint32_t t27 = t24 ^ t26;
151 const uint32_t t28 = t25 & t27;
152 const uint32_t t29 = t28 ^ t22;
153 const uint32_t t30 = t23 ^ t24;
154 const uint32_t t31 = t22 ^ t26;
155 const uint32_t t32 = t31 & t30;
156 const uint32_t t33 = t32 ^ t24;
157 const uint32_t t34 = t23 ^ t33;
158 const uint32_t t35 = t27 ^ t33;
159 const uint32_t t36 = t24 & t35;
160 const uint32_t t37 = t36 ^ t34;
161 const uint32_t t38 = t27 ^ t36;
162 const uint32_t t39 = t29 & t38;
163 const uint32_t t40 = t25 ^ t39;
164 const uint32_t t41 = t40 ^ t37;
165 const uint32_t t42 = t29 ^ t33;
166 const uint32_t t43 = t29 ^ t40;
167 const uint32_t t44 = t33 ^ t37;
168 const uint32_t t45 = t42 ^ t41;
169 const uint32_t z0 = t44 & y15;
170 const uint32_t z1 = t37 & y6;
171 const uint32_t z2 = t33 & U7;
172 const uint32_t z3 = t43 & y16;
173 const uint32_t z4 = t40 & y1;
174 const uint32_t z5 = t29 & y7;
175 const uint32_t z6 = t42 & y11;
176 const uint32_t z7 = t45 & y17;
177 const uint32_t z8 = t41 & y10;
178 const uint32_t z9 = t44 & y12;
179 const uint32_t z10 = t37 & y3;
180 const uint32_t z11 = t33 & y4;
181 const uint32_t z12 = t43 & y13;
182 const uint32_t z13 = t40 & y5;
183 const uint32_t z14 = t29 & y2;
184 const uint32_t z15 = t42 & y9;
185 const uint32_t z16 = t45 & y14;
186 const uint32_t z17 = t41 & y8;
187 const uint32_t tc1 = z15 ^ z16;
188 const uint32_t tc2 = z10 ^ tc1;
189 const uint32_t tc3 = z9 ^ tc2;
190 const uint32_t tc4 = z0 ^ z2;
191 const uint32_t tc5 = z1 ^ z0;
192 const uint32_t tc6 = z3 ^ z4;
193 const uint32_t tc7 = z12 ^ tc4;
194 const uint32_t tc8 = z7 ^ tc6;
195 const uint32_t tc9 = z8 ^ tc7;
196 const uint32_t tc10 = tc8 ^ tc9;
197 const uint32_t tc11 = tc6 ^ tc5;
198 const uint32_t tc12 = z3 ^ z5;
199 const uint32_t tc13 = z13 ^ tc1;
200 const uint32_t tc14 = tc4 ^ tc12;
201 const uint32_t S3 = tc3 ^ tc11;
202 const uint32_t tc16 = z6 ^ tc8;
203 const uint32_t tc17 = z14 ^ tc10;
204 const uint32_t tc18 = ~tc13 ^ tc14;
205 const uint32_t S7 = z12 ^ tc18;
206 const uint32_t tc20 = z15 ^ tc16;
207 const uint32_t tc21 = tc2 ^ z11;
208 const uint32_t S0 = tc3 ^ tc16;
209 const uint32_t S6 = tc10 ^ tc18;
210 const uint32_t S4 = tc14 ^ S3;
211 const uint32_t S1 = ~(S3 ^ tc16);
212 const uint32_t tc26 = tc17 ^ tc20;
213 const uint32_t S2 = ~(tc26 ^ z17);
214 const uint32_t S5 = tc21 ^ tc17;
215
216 V[0] = S0;
217 V[1] = S1;
218 V[2] = S2;
219 V[3] = S3;
220 V[4] = S4;
221 V[5] = S5;
222 V[6] = S6;
223 V[7] = S7;
224}
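
To make the bit ordering concrete, here is a small editorial sketch (not part of aes.cpp) that runs the circuit above on a single byte: bit i of the input, most significant bit first, is placed in lane 0 of V[i], the same packing SE_word() further below uses across four bytes. The helper name sbox_one_byte is illustrative only; 0x63 and 0x7C are the first two entries of the standard AES S-box table.

uint8_t sbox_one_byte(uint8_t b) {
   uint32_t V[8];
   for(size_t i = 0; i != 8; ++i) {
      V[i] = (b >> (7 - i)) & 1;  // each word carries one bit of the single evaluation
   }
   AES_SBOX(V);

   uint8_t r = 0;
   for(size_t i = 0; i != 8; ++i) {
      r = static_cast<uint8_t>(r | ((V[i] & 1) << (7 - i)));
   }
   return r;
}

// sbox_one_byte(0x00) == 0x63 and sbox_one_byte(0x01) == 0x7C
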
225
226/*
227A circuit for inverse AES Sbox of size 121 and depth 21 from
228http://www.cs.yale.edu/homes/peralta/CircuitStuff/CMT.html
229http://www.cs.yale.edu/homes/peralta/CircuitStuff/Sinv.txt
230*/
231void AES_INV_SBOX(uint32_t V[8]) {
232 const uint32_t U0 = V[0];
233 const uint32_t U1 = V[1];
234 const uint32_t U2 = V[2];
235 const uint32_t U3 = V[3];
236 const uint32_t U4 = V[4];
237 const uint32_t U5 = V[5];
238 const uint32_t U6 = V[6];
239 const uint32_t U7 = V[7];
240
241 const uint32_t Y0 = U0 ^ U3;
242 const uint32_t Y2 = ~(U1 ^ U3);
243 const uint32_t Y4 = U0 ^ Y2;
244 const uint32_t RTL0 = U6 ^ U7;
245 const uint32_t Y1 = Y2 ^ RTL0;
246 const uint32_t Y7 = ~(U2 ^ Y1);
247 const uint32_t RTL1 = U3 ^ U4;
248 const uint32_t Y6 = ~(U7 ^ RTL1);
249 const uint32_t Y3 = Y1 ^ RTL1;
250 const uint32_t RTL2 = ~(U0 ^ U2);
251 const uint32_t Y5 = U5 ^ RTL2;
252 const uint32_t sa1 = Y0 ^ Y2;
253 const uint32_t sa0 = Y1 ^ Y3;
254 const uint32_t sb1 = Y4 ^ Y6;
255 const uint32_t sb0 = Y5 ^ Y7;
256 const uint32_t ah = Y0 ^ Y1;
257 const uint32_t al = Y2 ^ Y3;
258 const uint32_t aa = sa0 ^ sa1;
259 const uint32_t bh = Y4 ^ Y5;
260 const uint32_t bl = Y6 ^ Y7;
261 const uint32_t bb = sb0 ^ sb1;
262 const uint32_t ab20 = sa0 ^ sb0;
263 const uint32_t ab22 = al ^ bl;
264 const uint32_t ab23 = Y3 ^ Y7;
265 const uint32_t ab21 = sa1 ^ sb1;
266 const uint32_t abcd1 = ah & bh;
267 const uint32_t rr1 = Y0 & Y4;
268 const uint32_t ph11 = ab20 ^ abcd1;
269 const uint32_t t01 = Y1 & Y5;
270 const uint32_t ph01 = t01 ^ abcd1;
271 const uint32_t abcd2 = al & bl;
272 const uint32_t r1 = Y2 & Y6;
273 const uint32_t pl11 = ab22 ^ abcd2;
274 const uint32_t r2 = Y3 & Y7;
275 const uint32_t pl01 = r2 ^ abcd2;
276 const uint32_t r3 = sa0 & sb0;
277 const uint32_t vr1 = aa & bb;
278 const uint32_t pr1 = vr1 ^ r3;
279 const uint32_t wr1 = sa1 & sb1;
280 const uint32_t qr1 = wr1 ^ r3;
281 const uint32_t ab0 = ph11 ^ rr1;
282 const uint32_t ab1 = ph01 ^ ab21;
283 const uint32_t ab2 = pl11 ^ r1;
284 const uint32_t ab3 = pl01 ^ qr1;
285 const uint32_t cp1 = ab0 ^ pr1;
286 const uint32_t cp2 = ab1 ^ qr1;
287 const uint32_t cp3 = ab2 ^ pr1;
288 const uint32_t cp4 = ab3 ^ ab23;
289 const uint32_t tinv1 = cp3 ^ cp4;
290 const uint32_t tinv2 = cp3 & cp1;
291 const uint32_t tinv3 = cp2 ^ tinv2;
292 const uint32_t tinv4 = cp1 ^ cp2;
293 const uint32_t tinv5 = cp4 ^ tinv2;
294 const uint32_t tinv6 = tinv5 & tinv4;
295 const uint32_t tinv7 = tinv3 & tinv1;
296 const uint32_t d2 = cp4 ^ tinv7;
297 const uint32_t d0 = cp2 ^ tinv6;
298 const uint32_t tinv8 = cp1 & cp4;
299 const uint32_t tinv9 = tinv4 & tinv8;
300 const uint32_t tinv10 = tinv4 ^ tinv2;
301 const uint32_t d1 = tinv9 ^ tinv10;
302 const uint32_t tinv11 = cp2 & cp3;
303 const uint32_t tinv12 = tinv1 & tinv11;
304 const uint32_t tinv13 = tinv1 ^ tinv2;
305 const uint32_t d3 = tinv12 ^ tinv13;
306 const uint32_t sd1 = d1 ^ d3;
307 const uint32_t sd0 = d0 ^ d2;
308 const uint32_t dl = d0 ^ d1; // NOLINT(misc-confusable-identifiers)
309 const uint32_t dh = d2 ^ d3;
310 const uint32_t dd = sd0 ^ sd1;
311 const uint32_t abcd3 = dh & bh;
312 const uint32_t rr2 = d3 & Y4;
313 const uint32_t t02 = d2 & Y5;
314 const uint32_t abcd4 = dl & bl;
315 const uint32_t r4 = d1 & Y6;
316 const uint32_t r5 = d0 & Y7;
317 const uint32_t r6 = sd0 & sb0;
318 const uint32_t vr2 = dd & bb;
319 const uint32_t wr2 = sd1 & sb1;
320 const uint32_t abcd5 = dh & ah;
321 const uint32_t r7 = d3 & Y0;
322 const uint32_t r8 = d2 & Y1;
323 const uint32_t abcd6 = dl & al;
324 const uint32_t r9 = d1 & Y2;
325 const uint32_t r10 = d0 & Y3;
326 const uint32_t r11 = sd0 & sa0;
327 const uint32_t vr3 = dd & aa;
328 const uint32_t wr3 = sd1 & sa1;
329 const uint32_t ph12 = rr2 ^ abcd3;
330 const uint32_t ph02 = t02 ^ abcd3;
331 const uint32_t pl12 = r4 ^ abcd4;
332 const uint32_t pl02 = r5 ^ abcd4;
333 const uint32_t pr2 = vr2 ^ r6;
334 const uint32_t qr2 = wr2 ^ r6;
335 const uint32_t p0 = ph12 ^ pr2;
336 const uint32_t p1 = ph02 ^ qr2;
337 const uint32_t p2 = pl12 ^ pr2;
338 const uint32_t p3 = pl02 ^ qr2;
339 const uint32_t ph13 = r7 ^ abcd5;
340 const uint32_t ph03 = r8 ^ abcd5;
341 const uint32_t pl13 = r9 ^ abcd6;
342 const uint32_t pl03 = r10 ^ abcd6;
343 const uint32_t pr3 = vr3 ^ r11;
344 const uint32_t qr3 = wr3 ^ r11;
345 const uint32_t p4 = ph13 ^ pr3;
346 const uint32_t S7 = ph03 ^ qr3;
347 const uint32_t p6 = pl13 ^ pr3;
348 const uint32_t p7 = pl03 ^ qr3;
349 const uint32_t S3 = p1 ^ p6;
350 const uint32_t S6 = p2 ^ p6;
351 const uint32_t S0 = p3 ^ p6;
352 const uint32_t X11 = p0 ^ p2;
353 const uint32_t S5 = S0 ^ X11;
354 const uint32_t X13 = p4 ^ p7;
355 const uint32_t X14 = X11 ^ X13;
356 const uint32_t S1 = S3 ^ X14;
357 const uint32_t X16 = p1 ^ S7;
358 const uint32_t S2 = X14 ^ X16;
359 const uint32_t X18 = p0 ^ p4;
360 const uint32_t X19 = S5 ^ X16;
361 const uint32_t S4 = X18 ^ X19;
362
363 V[0] = S0;
364 V[1] = S1;
365 V[2] = S2;
366 V[3] = S3;
367 V[4] = S4;
368 V[5] = S5;
369 V[6] = S6;
370 V[7] = S7;
371}
372
373inline void bit_transpose(uint32_t B[8]) {
374 swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1);
375 swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1);
376 swap_bits<uint32_t>(B[5], B[4], 0x55555555, 1);
377 swap_bits<uint32_t>(B[7], B[6], 0x55555555, 1);
378
379 swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2);
380 swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2);
381 swap_bits<uint32_t>(B[6], B[4], 0x33333333, 2);
382 swap_bits<uint32_t>(B[7], B[5], 0x33333333, 2);
383
384 swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4);
385 swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4);
386 swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4);
387 swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4);
388}
389
390inline void ks_expand(uint32_t B[8], const uint32_t K[], size_t r) {
391 /*
392 This is bit_transpose of K[r..r+4] || K[r..r+4]; we can save some computation
393 because the first and second halves are known to be the same data.
394 */
395 for(size_t i = 0; i != 4; ++i) {
396 B[i] = K[r + i];
397 }
398
399 swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1);
400 swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1);
401
402 swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2);
403 swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2);
404
405 B[4] = B[0];
406 B[5] = B[1];
407 B[6] = B[2];
408 B[7] = B[3];
409
410 swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4);
411 swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4);
412 swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4);
413 swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4);
414}
415
416inline void shift_rows(uint32_t B[8]) {
417 // 3 0 1 2 7 4 5 6 10 11 8 9 14 15 12 13 17 18 19 16 21 22 23 20 24 25 26 27 28 29 30 31
418 if constexpr(HasNative64BitRegisters) {
419 for(size_t i = 0; i != 8; i += 2) {
420 uint64_t x = (static_cast<uint64_t>(B[i]) << 32) | B[i + 1];
421 x = bit_permute_step<uint64_t>(x, 0x0022331100223311, 2);
422 x = bit_permute_step<uint64_t>(x, 0x0055005500550055, 1);
423 B[i] = static_cast<uint32_t>(x >> 32);
424 B[i + 1] = static_cast<uint32_t>(x);
425 }
426 } else {
427 for(size_t i = 0; i != 8; ++i) {
428 uint32_t x = B[i];
429 x = bit_permute_step<uint32_t>(x, 0x00223311, 2);
430 x = bit_permute_step<uint32_t>(x, 0x00550055, 1);
431 B[i] = x;
432 }
433 }
434}
435
436inline void inv_shift_rows(uint32_t B[8]) {
437 // Inverse of shift_rows, just inverting the steps
438
439 if constexpr(HasNative64BitRegisters) {
440 for(size_t i = 0; i != 8; i += 2) {
441 uint64_t x = (static_cast<uint64_t>(B[i]) << 32) | B[i + 1];
442 x = bit_permute_step<uint64_t>(x, 0x0055005500550055, 1);
443 x = bit_permute_step<uint64_t>(x, 0x0022331100223311, 2);
444 B[i] = static_cast<uint32_t>(x >> 32);
445 B[i + 1] = static_cast<uint32_t>(x);
446 }
447 } else {
448 for(size_t i = 0; i != 8; ++i) {
449 uint32_t x = B[i];
450 x = bit_permute_step<uint32_t>(x, 0x00550055, 1);
451 x = bit_permute_step<uint32_t>(x, 0x00223311, 2);
452 B[i] = x;
453 }
454 }
455}
456
457inline void mix_columns(uint32_t B[8]) {
458 // carry high bits in B[0] to positions in 0x1b == 0b11011
459 const uint32_t X2[8] = {
460 B[1],
461 B[2],
462 B[3],
463 B[4] ^ B[0],
464 B[5] ^ B[0],
465 B[6],
466 B[7] ^ B[0],
467 B[0],
468 };
469
470 for(size_t i = 0; i != 8; i++) {
471 const uint32_t X3 = B[i] ^ X2[i];
472 B[i] = X2[i] ^ rotr<8>(B[i]) ^ rotr<16>(B[i]) ^ rotr<24>(X3);
473 }
474}
475
476void inv_mix_columns(uint32_t B[8]) {
477 /*
478 OpenSSL's bsaes implementation credits Jussi Kivilinna with the lovely
479 matrix decomposition
480
481 | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
482 | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
483 | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
484 | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
485
486 Notice the first component is simply the MixColumns matrix. So we can
487 multiply first by (05,00,04,00) then perform MixColumns to get the equivalent
488 of InvMixColumn.
489 */
490 const uint32_t X4[8] = {
491 B[2],
492 B[3],
493 B[4] ^ B[0],
494 B[5] ^ B[0] ^ B[1],
495 B[6] ^ B[1],
496 B[7] ^ B[0],
497 B[0] ^ B[1],
498 B[1],
499 };
500
501 for(size_t i = 0; i != 8; i++) {
502 const uint32_t X5 = X4[i] ^ B[i];
503 B[i] = X5 ^ rotr<16>(X4[i]);
504 }
505
506 mix_columns(B);
507}
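
As a quick editorial check of the decomposition quoted above (not part of aes.cpp), multiplying the first MixColumns row (02,03,01,01) into the first two columns of the (05,00,04,00) circulant in GF(2^8) reproduces the first two InvMixColumns coefficients 0e and 0b; gf_mul below is an illustrative shift-and-xor multiply reduced by the AES polynomial.

constexpr uint8_t gf_mul(uint8_t a, uint8_t b) {
   unsigned x = a;
   unsigned y = b;
   unsigned r = 0;
   for(int i = 0; i != 8; ++i) {
      if(y & 1) {
         r ^= x;
      }
      const bool carry = (x & 0x80) != 0;
      x = (x << 1) & 0xFF;
      if(carry) {
         x ^= 0x1B;  // reduce by x^8 + x^4 + x^3 + x + 1
      }
      y >>= 1;
   }
   return static_cast<uint8_t>(r);
}

// Row (02,03,01,01) times columns (05,00,04,00) and (00,05,00,04):
static_assert((gf_mul(0x02, 0x05) ^ gf_mul(0x03, 0x00) ^ gf_mul(0x01, 0x04) ^ gf_mul(0x01, 0x00)) == 0x0E);
static_assert((gf_mul(0x02, 0x00) ^ gf_mul(0x03, 0x05) ^ gf_mul(0x01, 0x00) ^ gf_mul(0x01, 0x04)) == 0x0B);
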
508
509/*
510* AES Encryption
511*/
512void aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, const secure_vector<uint32_t>& EK) {
513 BOTAN_ASSERT(EK.size() == 44 || EK.size() == 52 || EK.size() == 60, "Key was set");
514
515 const size_t rounds = (EK.size() - 4) / 4;
516
517 uint32_t KS[13 * 8] = {0}; // actual maximum is (rounds - 1) * 8
518 for(size_t i = 0; i < rounds - 1; i += 1) {
519 ks_expand(&KS[8 * i], EK.data(), 4 * i + 4);
520 }
521
522 const size_t BLOCK_SIZE = 16;
523 const size_t BITSLICED_BLOCKS = 8 * sizeof(uint32_t) / BLOCK_SIZE;
524
525 while(blocks > 0) {
526 const size_t this_loop = std::min(blocks, BITSLICED_BLOCKS);
527
528 uint32_t B[8] = {0};
529
530 load_be(B, in, this_loop * 4);
531
532 CT::poison(B, 8);
533
534 for(size_t i = 0; i != 8; ++i) {
535 B[i] ^= EK[i % 4];
536 }
537
538 bit_transpose(B);
539
540 for(size_t r = 0; r != rounds - 1; ++r) {
541 AES_SBOX(B);
542 shift_rows(B);
543 mix_columns(B);
544
545 for(size_t i = 0; i != 8; ++i) {
546 B[i] ^= KS[8 * r + i];
547 }
548 }
549
550 // Final round:
551 AES_SBOX(B);
552 shift_rows(B);
553 bit_transpose(B);
554
555 for(size_t i = 0; i != 8; ++i) {
556 B[i] ^= EK[4 * rounds + i % 4];
557 }
558
559 CT::unpoison(B, 8);
560
561 copy_out_be(std::span(out, this_loop * 4 * sizeof(uint32_t)), B);
562
563 in += this_loop * BLOCK_SIZE;
564 out += this_loop * BLOCK_SIZE;
565 blocks -= this_loop;
566 }
567}
568
569/*
570* AES Decryption
571*/
572void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, const secure_vector<uint32_t>& DK) {
573 BOTAN_ASSERT(DK.size() == 44 || DK.size() == 52 || DK.size() == 60, "Key was set");
574
575 const size_t rounds = (DK.size() - 4) / 4;
576
577 uint32_t KS[13 * 8] = {0}; // actual maximum is (rounds - 1) * 8
578 for(size_t i = 0; i < rounds - 1; i += 1) {
579 ks_expand(&KS[8 * i], DK.data(), 4 * i + 4);
580 }
581
582 const size_t BLOCK_SIZE = 16;
583 const size_t BITSLICED_BLOCKS = 8 * sizeof(uint32_t) / BLOCK_SIZE;
584
585 while(blocks > 0) {
586 const size_t this_loop = std::min(blocks, BITSLICED_BLOCKS);
587
588 uint32_t B[8] = {0};
589
590 CT::poison(B, 8);
591
592 load_be(B, in, this_loop * 4);
593
594 for(size_t i = 0; i != 8; ++i) {
595 B[i] ^= DK[i % 4];
596 }
597
598 bit_transpose(B);
599
600 for(size_t r = 0; r != rounds - 1; ++r) {
601 AES_INV_SBOX(B);
602 inv_shift_rows(B);
603 inv_mix_columns(B);
604
605 for(size_t i = 0; i != 8; ++i) {
606 B[i] ^= KS[8 * r + i];
607 }
608 }
609
610 // Final round:
611 AES_INV_SBOX(B);
612 inv_shift_rows(B);
613 bit_transpose(B);
614
615 for(size_t i = 0; i != 8; ++i) {
616 B[i] ^= DK[4 * rounds + i % 4];
617 }
618
619 CT::unpoison(B, 8);
620
621 copy_out_be(std::span(out, this_loop * 4 * sizeof(uint32_t)), B);
622
623 in += this_loop * BLOCK_SIZE;
624 out += this_loop * BLOCK_SIZE;
625 blocks -= this_loop;
626 }
627}
628
629inline uint32_t xtime32(uint32_t s) {
630 const uint32_t lo_bit = 0x01010101;
631 const uint32_t mask = 0x7F7F7F7F;
632 const uint32_t poly = 0x1B;
633
634 return ((s & mask) << 1) ^ (((s >> 7) & lo_bit) * poly);
635}
636
637inline uint32_t InvMixColumn(uint32_t s1) {
638 const uint32_t s2 = xtime32(s1);
639 const uint32_t s4 = xtime32(s2);
640 const uint32_t s8 = xtime32(s4);
641 const uint32_t s9 = s8 ^ s1;
642 const uint32_t s11 = s9 ^ s2;
643 const uint32_t s13 = s9 ^ s4;
644 const uint32_t s14 = s8 ^ s4 ^ s2;
645
646 return s14 ^ rotr<8>(s9) ^ rotr<16>(s13) ^ rotr<24>(s11);
647}
648
649void InvMixColumn_x4(uint32_t x[4]) {
650 x[0] = InvMixColumn(x[0]);
651 x[1] = InvMixColumn(x[1]);
652 x[2] = InvMixColumn(x[2]);
653 x[3] = InvMixColumn(x[3]);
654}
655
656uint32_t SE_word(uint32_t x) {
657 uint32_t I[8] = {0};
658
659 for(size_t i = 0; i != 8; ++i) {
660 I[i] = (x >> (7 - i)) & 0x01010101;
661 }
662
663 AES_SBOX(I);
664
665 x = 0;
666
667 for(size_t i = 0; i != 8; ++i) {
668 x |= ((I[i] & 0x01010101) << (7 - i));
669 }
670
671 return x;
672}
673
674void aes_key_schedule(const uint8_t key[],
675 size_t length,
676 secure_vector<uint32_t>& EK,
677 secure_vector<uint32_t>& DK,
678 bool bswap_keys = false) {
679 static const uint32_t RC[10] = {0x01000000,
680 0x02000000,
681 0x04000000,
682 0x08000000,
683 0x10000000,
684 0x20000000,
685 0x40000000,
686 0x80000000,
687 0x1B000000,
688 0x36000000};
689
690 const size_t X = length / 4;
691
692 // Can't happen, but make static analyzers happy
693 BOTAN_ASSERT_NOMSG(X == 4 || X == 6 || X == 8);
694
695 const size_t rounds = (length / 4) + 6;
696
697 // Help the optimizer
698 BOTAN_ASSERT_NOMSG(rounds == 10 || rounds == 12 || rounds == 14);
699
700 CT::poison(key, length);
701
702 const size_t KS_len = length + 28;
703 EK.resize(KS_len);
704 DK.resize(KS_len);
705
706 for(size_t i = 0; i != X; ++i) {
707 EK[i] = load_be<uint32_t>(key, i);
708 }
709
710 for(size_t i = X; i < 4 * (rounds + 1); i += X) {
711 EK[i] = EK[i - X] ^ RC[(i - X) / X] ^ rotl<8>(SE_word(EK[i - 1]));
712
713 for(size_t j = 1; j != X && (i + j) < EK.size(); ++j) {
714 EK[i + j] = EK[i + j - X];
715
716 if(X == 8 && j == 4) {
717 EK[i + j] ^= SE_word(EK[i + j - 1]);
718 } else {
719 EK[i + j] ^= EK[i + j - 1];
720 }
721 }
722 }
723
724 for(size_t i = 0; i != 4 * (rounds + 1); i += 4) {
725 DK[i] = EK[4 * rounds - i];
726 DK[i + 1] = EK[4 * rounds - i + 1];
727 DK[i + 2] = EK[4 * rounds - i + 2];
728 DK[i + 3] = EK[4 * rounds - i + 3];
729 }
730
731 for(size_t i = 4; i != 4 * rounds; i += 4) {
732 InvMixColumn_x4(&DK[i]);
733 }
734
735 if(bswap_keys) {
736 // HW AES on little endian needs the subkeys to be byte reversed
737 for(size_t i = 0; i != KS_len; ++i) {
738 EK[i] = reverse_bytes(EK[i]);
739 DK[i] = reverse_bytes(DK[i]);
740 }
741 }
742
743 CT::unpoison(EK.data(), EK.size());
744 CT::unpoison(DK.data(), DK.size());
745 CT::unpoison(key, length);
746}
747
748size_t aes_parallelism() {
749#if defined(BOTAN_HAS_AES_VAES)
750 if(CPUID::has(CPUID::Feature::AVX2_AES)) {
751 return 8; // pipelined
752 }
753#endif
754
755#if defined(BOTAN_HAS_HW_AES_SUPPORT)
756 if(CPUID::has(CPUID::Feature::HW_AES)) {
757 return 4; // pipelined
758 }
759#endif
760
761#if defined(BOTAN_HAS_AES_VPERM)
762 if(CPUID::has(CPUID::Feature::SIMD_4X32)) {
763 return 2; // pipelined
764 }
765#endif
766
767 // bitsliced:
768 return 2;
769}
770
771std::string aes_provider() {
772#if defined(BOTAN_HAS_AES_VAES)
773 if(auto feat = CPUID::check(CPUID::Feature::AVX2_AES)) {
774 return *feat;
775 }
776#endif
777
778#if defined(BOTAN_HAS_HW_AES_SUPPORT)
779 if(auto feat = CPUID::check(CPUID::Feature::HW_AES)) {
780 return *feat;
781 }
782#endif
783
784#if defined(BOTAN_HAS_AES_VPERM)
785 if(auto feat = CPUID::check(CPUID::Feature::SIMD_4X32)) {
786 return *feat;
787 }
788#endif
789
790 return "base";
791}
792
793} // namespace
794
795std::string AES_128::provider() const {
796 return aes_provider();
797}
798
799std::string AES_192::provider() const {
800 return aes_provider();
801}
802
803std::string AES_256::provider() const {
804 return aes_provider();
805}
806
807size_t AES_128::parallelism() const {
808 return aes_parallelism();
809}
810
811size_t AES_192::parallelism() const {
812 return aes_parallelism();
813}
814
815size_t AES_256::parallelism() const {
816 return aes_parallelism();
817}
818
819bool AES_128::has_keying_material() const {
820 return !m_EK.empty();
821}
822
823bool AES_192::has_keying_material() const {
824 return !m_EK.empty();
825}
826
827bool AES_256::has_keying_material() const {
828 return !m_EK.empty();
829}
830
831void AES_128::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
832 assert_key_material_set();
833
834#if defined(BOTAN_HAS_AES_VAES)
835 if(CPUID::has(CPUID::Feature::AVX2_AES)) {
836 return x86_vaes_encrypt_n(in, out, blocks);
837 }
838#endif
839
840#if defined(BOTAN_HAS_HW_AES_SUPPORT)
841 if(CPUID::has(CPUID::Feature::HW_AES)) {
842 return hw_aes_encrypt_n(in, out, blocks);
843 }
844#endif
845
846#if defined(BOTAN_HAS_AES_VPERM)
847 if(CPUID::has(CPUID::Feature::SIMD_4X32)) {
848 return vperm_encrypt_n(in, out, blocks);
849 }
850#endif
851
852 aes_encrypt_n(in, out, blocks, m_EK);
853}
854
855void AES_128::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
856 assert_key_material_set();
857
858#if defined(BOTAN_HAS_AES_VAES)
859 if(CPUID::has(CPUID::Feature::AVX2_AES)) {
860 return x86_vaes_decrypt_n(in, out, blocks);
861 }
862#endif
863
864#if defined(BOTAN_HAS_HW_AES_SUPPORT)
865 if(CPUID::has(CPUID::Feature::HW_AES)) {
866 return hw_aes_decrypt_n(in, out, blocks);
867 }
868#endif
869
870#if defined(BOTAN_HAS_AES_VPERM)
871 if(CPUID::has(CPUID::Feature::SIMD_4X32)) {
872 return vperm_decrypt_n(in, out, blocks);
873 }
874#endif
875
876 aes_decrypt_n(in, out, blocks, m_DK);
877}
878
879void AES_128::key_schedule(std::span<const uint8_t> key) {
880#if defined(BOTAN_HAS_AES_NI)
881 if(CPUID::has(CPUID::Feature::AESNI)) {
882 return aesni_key_schedule(key.data(), key.size());
883 }
884#endif
885
886#if defined(BOTAN_HAS_AES_VAES)
887 if(CPUID::has(CPUID::Feature::AVX2_AES)) {
888 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, true);
889 }
890#endif
891
892#if defined(BOTAN_HAS_HW_AES_SUPPORT)
893 if(CPUID::has(CPUID::Feature::HW_AES)) {
894 constexpr bool is_little_endian = std::endian::native == std::endian::little;
895 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, is_little_endian);
896 }
897#endif
898
899#if defined(BOTAN_HAS_AES_VPERM)
900 if(CPUID::has(CPUID::Feature::SIMD_4X32)) {
901 return vperm_key_schedule(key.data(), key.size());
902 }
903#endif
904
905 aes_key_schedule(key.data(), key.size(), m_EK, m_DK);
906}
907
908void AES_128::clear() {
909 zap(m_EK);
910 zap(m_DK);
911}
912
913void AES_192::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
914 assert_key_material_set();
915
916#if defined(BOTAN_HAS_AES_VAES)
917 if(CPUID::has(CPUID::Feature::AVX2_AES)) {
918 return x86_vaes_encrypt_n(in, out, blocks);
919 }
920#endif
921
922#if defined(BOTAN_HAS_HW_AES_SUPPORT)
923 if(CPUID::has(CPUID::Feature::HW_AES)) {
924 return hw_aes_encrypt_n(in, out, blocks);
925 }
926#endif
927
928#if defined(BOTAN_HAS_AES_VPERM)
929 if(CPUID::has(CPUID::Feature::SIMD_4X32)) {
930 return vperm_encrypt_n(in, out, blocks);
931 }
932#endif
933
934 aes_encrypt_n(in, out, blocks, m_EK);
935}
936
937void AES_192::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
938 assert_key_material_set();
939
940#if defined(BOTAN_HAS_AES_VAES)
941 if(CPUID::has(CPUID::Feature::AVX2_AES)) {
942 return x86_vaes_decrypt_n(in, out, blocks);
943 }
944#endif
945
946#if defined(BOTAN_HAS_HW_AES_SUPPORT)
947 if(CPUID::has(CPUID::Feature::HW_AES)) {
948 return hw_aes_decrypt_n(in, out, blocks);
949 }
950#endif
951
952#if defined(BOTAN_HAS_AES_VPERM)
953 if(CPUID::has(CPUID::Feature::SIMD_4X32)) {
954 return vperm_decrypt_n(in, out, blocks);
955 }
956#endif
957
958 aes_decrypt_n(in, out, blocks, m_DK);
959}
960
961void AES_192::key_schedule(std::span<const uint8_t> key) {
962#if defined(BOTAN_HAS_AES_NI)
963 if(CPUID::has(CPUID::Feature::AESNI)) {
964 return aesni_key_schedule(key.data(), key.size());
965 }
966#endif
967
968#if defined(BOTAN_HAS_AES_VAES)
969 if(CPUID::has(CPUID::Feature::AVX2_AES)) {
970 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, true);
971 }
972#endif
973
974#if defined(BOTAN_HAS_HW_AES_SUPPORT)
975 if(CPUID::has(CPUID::Feature::HW_AES)) {
976 constexpr bool is_little_endian = std::endian::native == std::endian::little;
977 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, is_little_endian);
978 }
979#endif
980
981#if defined(BOTAN_HAS_AES_VPERM)
982 if(CPUID::has(CPUID::Feature::SIMD_4X32)) {
983 return vperm_key_schedule(key.data(), key.size());
984 }
985#endif
986
987 aes_key_schedule(key.data(), key.size(), m_EK, m_DK);
988}
989
990void AES_192::clear() {
991 zap(m_EK);
992 zap(m_DK);
993}
994
995void AES_256::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
996 assert_key_material_set();
997
998#if defined(BOTAN_HAS_AES_VAES)
999 if(CPUID::has(CPUID::Feature::AVX2_AES)) {
1000 return x86_vaes_encrypt_n(in, out, blocks);
1001 }
1002#endif
1003
1004#if defined(BOTAN_HAS_HW_AES_SUPPORT)
1005 if(CPUID::has(CPUID::Feature::HW_AES)) {
1006 return hw_aes_encrypt_n(in, out, blocks);
1007 }
1008#endif
1009
1010#if defined(BOTAN_HAS_AES_VPERM)
1011 if(CPUID::has(CPUID::Feature::SIMD_4X32)) {
1012 return vperm_encrypt_n(in, out, blocks);
1013 }
1014#endif
1015
1016 aes_encrypt_n(in, out, blocks, m_EK);
1017}
1018
1019void AES_256::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
1020 assert_key_material_set();
1021
1022#if defined(BOTAN_HAS_AES_VAES)
1023 if(CPUID::has(CPUID::Feature::AVX2_AES)) {
1024 return x86_vaes_decrypt_n(in, out, blocks);
1025 }
1026#endif
1027
1028#if defined(BOTAN_HAS_HW_AES_SUPPORT)
1029 if(CPUID::has(CPUID::Feature::HW_AES)) {
1030 return hw_aes_decrypt_n(in, out, blocks);
1031 }
1032#endif
1033
1034#if defined(BOTAN_HAS_AES_VPERM)
1035 if(CPUID::has(CPUID::Feature::SIMD_4X32)) {
1036 return vperm_decrypt_n(in, out, blocks);
1037 }
1038#endif
1039
1040 aes_decrypt_n(in, out, blocks, m_DK);
1041}
1042
1043void AES_256::key_schedule(std::span<const uint8_t> key) {
1044#if defined(BOTAN_HAS_AES_NI)
1045 if(CPUID::has(CPUID::Feature::AESNI)) {
1046 return aesni_key_schedule(key.data(), key.size());
1047 }
1048#endif
1049
1050#if defined(BOTAN_HAS_AES_VAES)
1051 if(CPUID::has(CPUID::Feature::AVX2_AES)) {
1052 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, true);
1053 }
1054#endif
1055
1056#if defined(BOTAN_HAS_HW_AES_SUPPORT)
1057 if(CPUID::has(CPUID::Feature::HW_AES)) {
1058 constexpr bool is_little_endian = std::endian::native == std::endian::little;
1059 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, is_little_endian);
1060 }
1061#endif
1062
1063#if defined(BOTAN_HAS_AES_VPERM)
1064 if(CPUID::has(CPUID::Feature::SIMD_4X32)) {
1065 return vperm_key_schedule(key.data(), key.size());
1066 }
1067#endif
1068
1069 aes_key_schedule(key.data(), key.size(), m_EK, m_DK);
1070}
1071
1072void AES_256::clear() {
1073 zap(m_EK);
1074 zap(m_DK);
1075}
1076
1077} // namespace Botan