Botan 3.5.0
Crypto and TLS for C&&
aes.cpp
Go to the documentation of this file.
1/*
2* (C) 1999-2010,2015,2017,2018,2020 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/aes.h>
8
9#include <botan/internal/bit_ops.h>
10#include <botan/internal/bswap.h>
11#include <botan/internal/cpuid.h>
12#include <botan/internal/ct_utils.h>
13#include <botan/internal/loadstor.h>
14#include <botan/internal/rotate.h>
15
16namespace Botan {
17
18#if defined(BOTAN_HAS_AES_POWER8) || defined(BOTAN_HAS_AES_ARMV8) || defined(BOTAN_HAS_AES_NI)
19 #define BOTAN_HAS_HW_AES_SUPPORT
20#endif
21
22/*
23* One of three AES implementation strategies is used to get a constant time
24* implementation which is immune to common cache/timing based side channels:
25*
26* - If AES hardware support is available (AES-NI, POWER8, Aarch64) use that
27*
28* - If 128-bit SIMD with byte shuffles are available (SSSE3, NEON, or Altivec),
29* use the vperm technique published by Mike Hamburg at CHES 2009.
30*
31* - If no hardware or SIMD support, fall back to a constant time bitsliced
32* implementation. This uses 32-bit words resulting in 2 blocks being processed
33* in parallel. Moving to 4 blocks (with 64-bit words) would approximately
34* double performance on 64-bit CPUs. Likewise moving to 128 bit SIMD would
35* again approximately double performance vs 64-bit. However the assumption is
36* that most 64-bit CPUs either have hardware AES or SIMD shuffle support and
37* that the majority of users falling back to this code will be 32-bit cores.
38* If this assumption proves to be unsound, the bitsliced code can easily be
39* extended to operate on either 32 or 64 bit words depending on the native
40* wordsize of the target processor.
41*
42* Useful references
43*
44* - "Accelerating AES with Vector Permute Instructions" Mike Hamburg
45* https://www.shiftleft.org/papers/vector_aes/vector_aes.pdf
46*
47* - "Faster and Timing-Attack Resistant AES-GCM" Käsper and Schwabe
48* https://eprint.iacr.org/2009/129.pdf
49*
50* - "A new combinational logic minimization technique with applications to cryptology."
51* Boyar and Peralta https://eprint.iacr.org/2009/191.pdf
52*
53* - "A depth-16 circuit for the AES S-box" Boyar and Peralta
54* https://eprint.iacr.org/2011/332.pdf
55*
56* - "A Very Compact S-box for AES" Canright
57* https://www.iacr.org/archive/ches2005/032.pdf
58* https://core.ac.uk/download/pdf/36694529.pdf (extended)
59*/
60
61namespace {
62
63/*
64This is an AES sbox circuit which can execute in bitsliced mode up to 32x in
65parallel.
66
67The circuit is from the "Circuit Minimization Team" group
68http://www.cs.yale.edu/homes/peralta/CircuitStuff/CMT.html
69http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt
70
71This circuit has size 113 and depth 27. In software it is much faster than
72circuits which are considered faster for hardware purposes (where circuit depth
73is the critical constraint), because unlike in hardware, on common CPUs we can
74only execute - at best - 3 or 4 logic operations per cycle. So a smaller circuit
75is superior. On an x86-64 machine this circuit is about 15% faster than the
76circuit of size 128 and depth 16 given in "A depth-16 circuit for the AES S-box".
77
78Another circuit for AES Sbox of size 102 and depth 24 is described in "New
79Circuit Minimization Techniques for Smaller and Faster AES SBoxes"
80[https://eprint.iacr.org/2019/802] however it relies on "non-standard" gates
81like MUX, NOR, NAND, etc and so in practice in bitsliced software, its size is
82actually a bit larger than this circuit, as few CPUs have such instructions and
83otherwise they must be emulated using a sequence of available bit operations.
84*/
// Evaluates the AES S-box circuit on eight 32-bit bit-planes, in place.
//
// V[0..7] are read once into U0..U7, pushed through a fixed sequence of
// XOR / AND / NOT operations, and V[0..7] are overwritten with S0..S7.
// Every operation is bitwise, so each of the 32 bit positions is an
// independent evaluation of the circuit. There are no branches or table
// lookups that depend on the data.
//
// SE_word in this file places bit 7 of each byte in V[0] and bit 0 in V[7].
85void AES_SBOX(uint32_t V[8]) {
 // Snapshot the inputs; V is not written until the very end.
86 const uint32_t U0 = V[0];
87 const uint32_t U1 = V[1];
88 const uint32_t U2 = V[2];
89 const uint32_t U3 = V[3];
90 const uint32_t U4 = V[4];
91 const uint32_t U5 = V[5];
92 const uint32_t U6 = V[6];
93 const uint32_t U7 = V[7];
94
 // XOR-only input layer: the y* signals (plus temporaries t0 and t1) are
 // XOR combinations of the inputs.
95 const uint32_t y14 = U3 ^ U5;
96 const uint32_t y13 = U0 ^ U6;
97 const uint32_t y9 = U0 ^ U3;
98 const uint32_t y8 = U0 ^ U5;
99 const uint32_t t0 = U1 ^ U2;
100 const uint32_t y1 = t0 ^ U7;
101 const uint32_t y4 = y1 ^ U3;
102 const uint32_t y12 = y13 ^ y14;
103 const uint32_t y2 = y1 ^ U0;
104 const uint32_t y5 = y1 ^ U6;
105 const uint32_t y3 = y5 ^ y8;
106 const uint32_t t1 = U4 ^ y12;
107 const uint32_t y15 = t1 ^ U5;
108 const uint32_t y20 = t1 ^ U1;
109 const uint32_t y6 = y15 ^ U7;
110 const uint32_t y10 = y15 ^ t0;
111 const uint32_t y11 = y20 ^ y9;
112 const uint32_t y7 = U7 ^ y11;
113 const uint32_t y17 = y10 ^ y11;
114 const uint32_t y19 = y10 ^ y8;
115 const uint32_t y16 = t0 ^ y11;
116 const uint32_t y21 = y13 ^ y16;
117 const uint32_t y18 = U0 ^ y16;
 // Middle section: t2..t45 mix AND and XOR over the input-layer signals.
118 const uint32_t t2 = y12 & y15;
119 const uint32_t t3 = y3 & y6;
120 const uint32_t t4 = t3 ^ t2;
121 const uint32_t t5 = y4 & U7;
122 const uint32_t t6 = t5 ^ t2;
123 const uint32_t t7 = y13 & y16;
124 const uint32_t t8 = y5 & y1;
125 const uint32_t t9 = t8 ^ t7;
126 const uint32_t t10 = y2 & y7;
127 const uint32_t t11 = t10 ^ t7;
128 const uint32_t t12 = y9 & y11;
129 const uint32_t t13 = y14 & y17;
130 const uint32_t t14 = t13 ^ t12;
131 const uint32_t t15 = y8 & y10;
132 const uint32_t t16 = t15 ^ t12;
133 const uint32_t t17 = t4 ^ y20;
134 const uint32_t t18 = t6 ^ t16;
135 const uint32_t t19 = t9 ^ t14;
136 const uint32_t t20 = t11 ^ t16;
137 const uint32_t t21 = t17 ^ t14;
138 const uint32_t t22 = t18 ^ y19;
139 const uint32_t t23 = t19 ^ y21;
140 const uint32_t t24 = t20 ^ y18;
141 const uint32_t t25 = t21 ^ t22;
142 const uint32_t t26 = t21 & t23;
143 const uint32_t t27 = t24 ^ t26;
144 const uint32_t t28 = t25 & t27;
145 const uint32_t t29 = t28 ^ t22;
146 const uint32_t t30 = t23 ^ t24;
147 const uint32_t t31 = t22 ^ t26;
148 const uint32_t t32 = t31 & t30;
149 const uint32_t t33 = t32 ^ t24;
150 const uint32_t t34 = t23 ^ t33;
151 const uint32_t t35 = t27 ^ t33;
152 const uint32_t t36 = t24 & t35;
153 const uint32_t t37 = t36 ^ t34;
154 const uint32_t t38 = t27 ^ t36;
155 const uint32_t t39 = t29 & t38;
156 const uint32_t t40 = t25 ^ t39;
157 const uint32_t t41 = t40 ^ t37;
158 const uint32_t t42 = t29 ^ t33;
159 const uint32_t t43 = t29 ^ t40;
160 const uint32_t t44 = t33 ^ t37;
161 const uint32_t t45 = t42 ^ t41;
 // z0..z17: each is the AND of one middle-section result with one
 // input-layer signal (or with U7 directly).
162 const uint32_t z0 = t44 & y15;
163 const uint32_t z1 = t37 & y6;
164 const uint32_t z2 = t33 & U7;
165 const uint32_t z3 = t43 & y16;
166 const uint32_t z4 = t40 & y1;
167 const uint32_t z5 = t29 & y7;
168 const uint32_t z6 = t42 & y11;
169 const uint32_t z7 = t45 & y17;
170 const uint32_t z8 = t41 & y10;
171 const uint32_t z9 = t44 & y12;
172 const uint32_t z10 = t37 & y3;
173 const uint32_t z11 = t33 & y4;
174 const uint32_t z12 = t43 & y13;
175 const uint32_t z13 = t40 & y5;
176 const uint32_t z14 = t29 & y2;
177 const uint32_t z15 = t42 & y9;
178 const uint32_t z16 = t45 & y14;
179 const uint32_t z17 = t41 & y8;
 // Output layer: XOR combinations of the z* terms, with complements on
 // tc18 (which feeds S6 and S7), S1 and S2. For an all-zero input every
 // AND term is zero, so only those complements survive and the planes
 // V[0]..V[7] come out as 0,1,1,0,0,0,1,1 in every lane (0x63 with V[0]
 // as the most significant bit).
180 const uint32_t tc1 = z15 ^ z16;
181 const uint32_t tc2 = z10 ^ tc1;
182 const uint32_t tc3 = z9 ^ tc2;
183 const uint32_t tc4 = z0 ^ z2;
184 const uint32_t tc5 = z1 ^ z0;
185 const uint32_t tc6 = z3 ^ z4;
186 const uint32_t tc7 = z12 ^ tc4;
187 const uint32_t tc8 = z7 ^ tc6;
188 const uint32_t tc9 = z8 ^ tc7;
189 const uint32_t tc10 = tc8 ^ tc9;
190 const uint32_t tc11 = tc6 ^ tc5;
191 const uint32_t tc12 = z3 ^ z5;
192 const uint32_t tc13 = z13 ^ tc1;
193 const uint32_t tc14 = tc4 ^ tc12;
194 const uint32_t S3 = tc3 ^ tc11;
195 const uint32_t tc16 = z6 ^ tc8;
196 const uint32_t tc17 = z14 ^ tc10;
197 const uint32_t tc18 = ~tc13 ^ tc14;
198 const uint32_t S7 = z12 ^ tc18;
199 const uint32_t tc20 = z15 ^ tc16;
200 const uint32_t tc21 = tc2 ^ z11;
201 const uint32_t S0 = tc3 ^ tc16;
202 const uint32_t S6 = tc10 ^ tc18;
203 const uint32_t S4 = tc14 ^ S3;
204 const uint32_t S1 = ~(S3 ^ tc16);
205 const uint32_t tc26 = tc17 ^ tc20;
206 const uint32_t S2 = ~(tc26 ^ z17);
207 const uint32_t S5 = tc21 ^ tc17;
208
 // Write the results back over the inputs.
209 V[0] = S0;
210 V[1] = S1;
211 V[2] = S2;
212 V[3] = S3;
213 V[4] = S4;
214 V[5] = S5;
215 V[6] = S6;
216 V[7] = S7;
217}
218
219/*
220A circuit for inverse AES Sbox of size 121 and depth 21 from
221http://www.cs.yale.edu/homes/peralta/CircuitStuff/CMT.html
222http://www.cs.yale.edu/homes/peralta/CircuitStuff/Sinv.txt
223*/
// Evaluates the inverse AES S-box circuit on eight 32-bit bit-planes, in
// place.
//
// V[0..7] are read once into U0..U7, pushed through a fixed sequence of
// XOR / AND / NOT operations, and V[0..7] are overwritten with S0..S7.
// Every operation is bitwise, so each of the 32 bit positions is an
// independent evaluation. There are no data-dependent branches or table
// lookups.
224void AES_INV_SBOX(uint32_t V[8]) {
 // Snapshot the inputs; V is not written until the very end.
225 const uint32_t U0 = V[0];
226 const uint32_t U1 = V[1];
227 const uint32_t U2 = V[2];
228 const uint32_t U3 = V[3];
229 const uint32_t U4 = V[4];
230 const uint32_t U5 = V[5];
231 const uint32_t U6 = V[6];
232 const uint32_t U7 = V[7];
233
 // Input layer: Y0..Y7 are XOR combinations of the inputs, several of
 // them complemented (RTL0..RTL2 are shared temporaries).
234 const uint32_t Y0 = U0 ^ U3;
235 const uint32_t Y2 = ~(U1 ^ U3);
236 const uint32_t Y4 = U0 ^ Y2;
237 const uint32_t RTL0 = U6 ^ U7;
238 const uint32_t Y1 = Y2 ^ RTL0;
239 const uint32_t Y7 = ~(U2 ^ Y1);
240 const uint32_t RTL1 = U3 ^ U4;
241 const uint32_t Y6 = ~(U7 ^ RTL1);
242 const uint32_t Y3 = Y1 ^ RTL1;
243 const uint32_t RTL2 = ~(U0 ^ U2);
244 const uint32_t Y5 = U5 ^ RTL2;
 // XOR sums over Y0..Y3 (sa*, ah, al, aa) and Y4..Y7 (sb*, bh, bl, bb),
 // plus cross terms ab20..ab23. These are reused as AND operands below.
245 const uint32_t sa1 = Y0 ^ Y2;
246 const uint32_t sa0 = Y1 ^ Y3;
247 const uint32_t sb1 = Y4 ^ Y6;
248 const uint32_t sb0 = Y5 ^ Y7;
249 const uint32_t ah = Y0 ^ Y1;
250 const uint32_t al = Y2 ^ Y3;
251 const uint32_t aa = sa0 ^ sa1;
252 const uint32_t bh = Y4 ^ Y5;
253 const uint32_t bl = Y6 ^ Y7;
254 const uint32_t bb = sb0 ^ sb1;
255 const uint32_t ab20 = sa0 ^ sb0;
256 const uint32_t ab22 = al ^ bl;
257 const uint32_t ab23 = Y3 ^ Y7;
258 const uint32_t ab21 = sa1 ^ sb1;
 // First group of AND products between the Y0..Y3 side and the Y4..Y7
 // side, folded by XOR into cp1..cp4.
259 const uint32_t abcd1 = ah & bh;
260 const uint32_t rr1 = Y0 & Y4;
261 const uint32_t ph11 = ab20 ^ abcd1;
262 const uint32_t t01 = Y1 & Y5;
263 const uint32_t ph01 = t01 ^ abcd1;
264 const uint32_t abcd2 = al & bl;
265 const uint32_t r1 = Y2 & Y6;
266 const uint32_t pl11 = ab22 ^ abcd2;
267 const uint32_t r2 = Y3 & Y7;
268 const uint32_t pl01 = r2 ^ abcd2;
269 const uint32_t r3 = sa0 & sb0;
270 const uint32_t vr1 = aa & bb;
271 const uint32_t pr1 = vr1 ^ r3;
272 const uint32_t wr1 = sa1 & sb1;
273 const uint32_t qr1 = wr1 ^ r3;
274 const uint32_t ab0 = ph11 ^ rr1;
275 const uint32_t ab1 = ph01 ^ ab21;
276 const uint32_t ab2 = pl11 ^ r1;
277 const uint32_t ab3 = pl01 ^ qr1;
278 const uint32_t cp1 = ab0 ^ pr1;
279 const uint32_t cp2 = ab1 ^ qr1;
280 const uint32_t cp3 = ab2 ^ pr1;
281 const uint32_t cp4 = ab3 ^ ab23;
 // d0..d3 are derived from cp1..cp4 alone via the tinv* intermediates.
282 const uint32_t tinv1 = cp3 ^ cp4;
283 const uint32_t tinv2 = cp3 & cp1;
284 const uint32_t tinv3 = cp2 ^ tinv2;
285 const uint32_t tinv4 = cp1 ^ cp2;
286 const uint32_t tinv5 = cp4 ^ tinv2;
287 const uint32_t tinv6 = tinv5 & tinv4;
288 const uint32_t tinv7 = tinv3 & tinv1;
289 const uint32_t d2 = cp4 ^ tinv7;
290 const uint32_t d0 = cp2 ^ tinv6;
291 const uint32_t tinv8 = cp1 & cp4;
292 const uint32_t tinv9 = tinv4 & tinv8;
293 const uint32_t tinv10 = tinv4 ^ tinv2;
294 const uint32_t d1 = tinv9 ^ tinv10;
295 const uint32_t tinv11 = cp2 & cp3;
296 const uint32_t tinv12 = tinv1 & tinv11;
297 const uint32_t tinv13 = tinv1 ^ tinv2;
298 const uint32_t d3 = tinv12 ^ tinv13;
 // XOR sums over d0..d3, mirroring the sa*/ah/al/aa sums above.
299 const uint32_t sd1 = d1 ^ d3;
300 const uint32_t sd0 = d0 ^ d2;
301 const uint32_t dl = d0 ^ d1;
302 const uint32_t dh = d2 ^ d3;
303 const uint32_t dd = sd0 ^ sd1;
 // Second group of AND products: the d-side signals against the
 // Y4..Y7-side signals ...
304 const uint32_t abcd3 = dh & bh;
305 const uint32_t rr2 = d3 & Y4;
306 const uint32_t t02 = d2 & Y5;
307 const uint32_t abcd4 = dl & bl;
308 const uint32_t r4 = d1 & Y6;
309 const uint32_t r5 = d0 & Y7;
310 const uint32_t r6 = sd0 & sb0;
311 const uint32_t vr2 = dd & bb;
312 const uint32_t wr2 = sd1 & sb1;
 // ... and against the Y0..Y3-side signals.
313 const uint32_t abcd5 = dh & ah;
314 const uint32_t r7 = d3 & Y0;
315 const uint32_t r8 = d2 & Y1;
316 const uint32_t abcd6 = dl & al;
317 const uint32_t r9 = d1 & Y2;
318 const uint32_t r10 = d0 & Y3;
319 const uint32_t r11 = sd0 & sa0;
320 const uint32_t vr3 = dd & aa;
321 const uint32_t wr3 = sd1 & sa1;
 // Fold the products into p0..p4, p6, p7 by XOR. There is no p5: that
 // slot is assigned to S7 directly.
322 const uint32_t ph12 = rr2 ^ abcd3;
323 const uint32_t ph02 = t02 ^ abcd3;
324 const uint32_t pl12 = r4 ^ abcd4;
325 const uint32_t pl02 = r5 ^ abcd4;
326 const uint32_t pr2 = vr2 ^ r6;
327 const uint32_t qr2 = wr2 ^ r6;
328 const uint32_t p0 = ph12 ^ pr2;
329 const uint32_t p1 = ph02 ^ qr2;
330 const uint32_t p2 = pl12 ^ pr2;
331 const uint32_t p3 = pl02 ^ qr2;
332 const uint32_t ph13 = r7 ^ abcd5;
333 const uint32_t ph03 = r8 ^ abcd5;
334 const uint32_t pl13 = r9 ^ abcd6;
335 const uint32_t pl03 = r10 ^ abcd6;
336 const uint32_t pr3 = vr3 ^ r11;
337 const uint32_t qr3 = wr3 ^ r11;
338 const uint32_t p4 = ph13 ^ pr3;
339 const uint32_t S7 = ph03 ^ qr3;
340 const uint32_t p6 = pl13 ^ pr3;
341 const uint32_t p7 = pl03 ^ qr3;
 // XOR-only output layer producing the remaining S* planes.
342 const uint32_t S3 = p1 ^ p6;
343 const uint32_t S6 = p2 ^ p6;
344 const uint32_t S0 = p3 ^ p6;
345 const uint32_t X11 = p0 ^ p2;
346 const uint32_t S5 = S0 ^ X11;
347 const uint32_t X13 = p4 ^ p7;
348 const uint32_t X14 = X11 ^ X13;
349 const uint32_t S1 = S3 ^ X14;
350 const uint32_t X16 = p1 ^ S7;
351 const uint32_t S2 = X14 ^ X16;
352 const uint32_t X18 = p0 ^ p4;
353 const uint32_t X19 = S5 ^ X16;
354 const uint32_t S4 = X18 ^ X19;
355
 // Write the results back over the inputs.
356 V[0] = S0;
357 V[1] = S1;
358 V[2] = S2;
359 V[3] = S3;
360 V[4] = S4;
361 V[5] = S5;
362 V[6] = S6;
363 V[7] = S7;
364}
365
366inline void bit_transpose(uint32_t B[8]) {
367 swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1);
368 swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1);
369 swap_bits<uint32_t>(B[5], B[4], 0x55555555, 1);
370 swap_bits<uint32_t>(B[7], B[6], 0x55555555, 1);
371
372 swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2);
373 swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2);
374 swap_bits<uint32_t>(B[6], B[4], 0x33333333, 2);
375 swap_bits<uint32_t>(B[7], B[5], 0x33333333, 2);
376
377 swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4);
378 swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4);
379 swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4);
380 swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4);
381}
382
383inline void ks_expand(uint32_t B[8], const uint32_t K[], size_t r) {
384 /*
385 This is bit_transpose of K[r..r+4] || K[r..r+4], we can save some computation
386 due to knowing the first and second halves are the same data.
387 */
388 for(size_t i = 0; i != 4; ++i) {
389 B[i] = K[r + i];
390 }
391
392 swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1);
393 swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1);
394
395 swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2);
396 swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2);
397
398 B[4] = B[0];
399 B[5] = B[1];
400 B[6] = B[2];
401 B[7] = B[3];
402
403 swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4);
404 swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4);
405 swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4);
406 swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4);
407}
408
409inline void shift_rows(uint32_t B[8]) {
410 // 3 0 1 2 7 4 5 6 10 11 8 9 14 15 12 13 17 18 19 16 21 22 23 20 24 25 26 27 28 29 30 31
411#if defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT)
412 for(size_t i = 0; i != 8; i += 2) {
413 uint64_t x = (static_cast<uint64_t>(B[i]) << 32) | B[i + 1];
414 x = bit_permute_step<uint64_t>(x, 0x0022331100223311, 2);
415 x = bit_permute_step<uint64_t>(x, 0x0055005500550055, 1);
416 B[i] = static_cast<uint32_t>(x >> 32);
417 B[i + 1] = static_cast<uint32_t>(x);
418 }
419#else
420 for(size_t i = 0; i != 8; ++i) {
421 uint32_t x = B[i];
422 x = bit_permute_step<uint32_t>(x, 0x00223311, 2);
423 x = bit_permute_step<uint32_t>(x, 0x00550055, 1);
424 B[i] = x;
425 }
426#endif
427}
428
429inline void inv_shift_rows(uint32_t B[8]) {
430 // Inverse of shift_rows, just inverting the steps
431
432#if defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT)
433 for(size_t i = 0; i != 8; i += 2) {
434 uint64_t x = (static_cast<uint64_t>(B[i]) << 32) | B[i + 1];
435 x = bit_permute_step<uint64_t>(x, 0x0055005500550055, 1);
436 x = bit_permute_step<uint64_t>(x, 0x0022331100223311, 2);
437 B[i] = static_cast<uint32_t>(x >> 32);
438 B[i + 1] = static_cast<uint32_t>(x);
439 }
440#else
441 for(size_t i = 0; i != 8; ++i) {
442 uint32_t x = B[i];
443 x = bit_permute_step<uint32_t>(x, 0x00550055, 1);
444 x = bit_permute_step<uint32_t>(x, 0x00223311, 2);
445 B[i] = x;
446 }
447#endif
448}
449
450inline void mix_columns(uint32_t B[8]) {
451 // carry high bits in B[0] to positions in 0x1b == 0b11011
452 const uint32_t X2[8] = {
453 B[1],
454 B[2],
455 B[3],
456 B[4] ^ B[0],
457 B[5] ^ B[0],
458 B[6],
459 B[7] ^ B[0],
460 B[0],
461 };
462
463 for(size_t i = 0; i != 8; i++) {
464 const uint32_t X3 = B[i] ^ X2[i];
465 B[i] = X2[i] ^ rotr<8>(B[i]) ^ rotr<16>(B[i]) ^ rotr<24>(X3);
466 }
467}
468
469void inv_mix_columns(uint32_t B[8]) {
470 /*
471 OpenSSL's bsaes implementation credits Jussi Kivilinna with the lovely
472 matrix decomposition
473
474 | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
475 | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
476 | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
477 | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
478
479 Notice the first component is simply the MixColumns matrix. So we can
480 multiply first by (05,00,04,00) then perform MixColumns to get the equivalent
481 of InvMixColumn.
482 */
483 const uint32_t X4[8] = {
484 B[2],
485 B[3],
486 B[4] ^ B[0],
487 B[5] ^ B[0] ^ B[1],
488 B[6] ^ B[1],
489 B[7] ^ B[0],
490 B[0] ^ B[1],
491 B[1],
492 };
493
494 for(size_t i = 0; i != 8; i++) {
495 const uint32_t X5 = X4[i] ^ B[i];
496 B[i] = X5 ^ rotr<16>(X4[i]);
497 }
498
499 mix_columns(B);
500}
501
502/*
503* AES Encryption
504*/
505void aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, const secure_vector<uint32_t>& EK) {
506 BOTAN_ASSERT(EK.size() == 44 || EK.size() == 52 || EK.size() == 60, "Key was set");
507
508 const size_t rounds = (EK.size() - 4) / 4;
509
510 uint32_t KS[13 * 8] = {0}; // actual maximum is (rounds - 1) * 8
511 for(size_t i = 0; i < rounds - 1; i += 1) {
512 ks_expand(&KS[8 * i], EK.data(), 4 * i + 4);
513 }
514
515 const size_t BLOCK_SIZE = 16;
516 const size_t BITSLICED_BLOCKS = 8 * sizeof(uint32_t) / BLOCK_SIZE;
517
518 while(blocks > 0) {
519 const size_t this_loop = std::min(blocks, BITSLICED_BLOCKS);
520
521 uint32_t B[8] = {0};
522
523 load_be(B, in, this_loop * 4);
524
525 CT::poison(B, 8);
526
527 for(size_t i = 0; i != 8; ++i) {
528 B[i] ^= EK[i % 4];
529 }
530
531 bit_transpose(B);
532
533 for(size_t r = 0; r != rounds - 1; ++r) {
534 AES_SBOX(B);
535 shift_rows(B);
536 mix_columns(B);
537
538 for(size_t i = 0; i != 8; ++i) {
539 B[i] ^= KS[8 * r + i];
540 }
541 }
542
543 // Final round:
544 AES_SBOX(B);
545 shift_rows(B);
546 bit_transpose(B);
547
548 for(size_t i = 0; i != 8; ++i) {
549 B[i] ^= EK[4 * rounds + i % 4];
550 }
551
552 CT::unpoison(B, 8);
553
554 copy_out_be(std::span(out, this_loop * 4 * sizeof(uint32_t)), B);
555
556 in += this_loop * BLOCK_SIZE;
557 out += this_loop * BLOCK_SIZE;
558 blocks -= this_loop;
559 }
560}
561
562/*
563* AES Decryption
564*/
565void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, const secure_vector<uint32_t>& DK) {
566 BOTAN_ASSERT(DK.size() == 44 || DK.size() == 52 || DK.size() == 60, "Key was set");
567
568 const size_t rounds = (DK.size() - 4) / 4;
569
570 uint32_t KS[13 * 8] = {0}; // actual maximum is (rounds - 1) * 8
571 for(size_t i = 0; i < rounds - 1; i += 1) {
572 ks_expand(&KS[8 * i], DK.data(), 4 * i + 4);
573 }
574
575 const size_t BLOCK_SIZE = 16;
576 const size_t BITSLICED_BLOCKS = 8 * sizeof(uint32_t) / BLOCK_SIZE;
577
578 while(blocks > 0) {
579 const size_t this_loop = std::min(blocks, BITSLICED_BLOCKS);
580
581 uint32_t B[8] = {0};
582
583 CT::poison(B, 8);
584
585 load_be(B, in, this_loop * 4);
586
587 for(size_t i = 0; i != 8; ++i) {
588 B[i] ^= DK[i % 4];
589 }
590
591 bit_transpose(B);
592
593 for(size_t r = 0; r != rounds - 1; ++r) {
594 AES_INV_SBOX(B);
595 inv_shift_rows(B);
596 inv_mix_columns(B);
597
598 for(size_t i = 0; i != 8; ++i) {
599 B[i] ^= KS[8 * r + i];
600 }
601 }
602
603 // Final round:
604 AES_INV_SBOX(B);
605 inv_shift_rows(B);
606 bit_transpose(B);
607
608 for(size_t i = 0; i != 8; ++i) {
609 B[i] ^= DK[4 * rounds + i % 4];
610 }
611
612 CT::unpoison(B, 8);
613
614 copy_out_be(std::span(out, this_loop * 4 * sizeof(uint32_t)), B);
615
616 in += this_loop * BLOCK_SIZE;
617 out += this_loop * BLOCK_SIZE;
618 blocks -= this_loop;
619 }
620}
621
/*
* Multiply each of the four bytes packed in s by two in GF(2^8), reducing
* with 0x1B: every byte is shifted left by one, and 0x1B is XORed into each
* byte whose top bit was set.
*/
inline uint32_t xtime32(uint32_t s) {
   // Low seven bits of every byte, moved up one position
   const uint32_t doubled = (s & 0x7F7F7F7F) << 1;

   // 0x01 in every byte whose top bit was set; the multiply turns it into 0x1B
   const uint32_t overflow = (s >> 7) & 0x01010101;

   return doubled ^ (overflow * 0x1B);
}
629
630inline uint32_t InvMixColumn(uint32_t s1) {
631 const uint32_t s2 = xtime32(s1);
632 const uint32_t s4 = xtime32(s2);
633 const uint32_t s8 = xtime32(s4);
634 const uint32_t s9 = s8 ^ s1;
635 const uint32_t s11 = s9 ^ s2;
636 const uint32_t s13 = s9 ^ s4;
637 const uint32_t s14 = s8 ^ s4 ^ s2;
638
639 return s14 ^ rotr<8>(s9) ^ rotr<16>(s13) ^ rotr<24>(s11);
640}
641
642void InvMixColumn_x4(uint32_t x[4]) {
643 x[0] = InvMixColumn(x[0]);
644 x[1] = InvMixColumn(x[1]);
645 x[2] = InvMixColumn(x[2]);
646 x[3] = InvMixColumn(x[3]);
647}
648
649uint32_t SE_word(uint32_t x) {
650 uint32_t I[8] = {0};
651
652 for(size_t i = 0; i != 8; ++i) {
653 I[i] = (x >> (7 - i)) & 0x01010101;
654 }
655
656 AES_SBOX(I);
657
658 x = 0;
659
660 for(size_t i = 0; i != 8; ++i) {
661 x |= ((I[i] & 0x01010101) << (7 - i));
662 }
663
664 return x;
665}
666
667void aes_key_schedule(const uint8_t key[],
668 size_t length,
671 bool bswap_keys = false) {
672 static const uint32_t RC[10] = {0x01000000,
673 0x02000000,
674 0x04000000,
675 0x08000000,
676 0x10000000,
677 0x20000000,
678 0x40000000,
679 0x80000000,
680 0x1B000000,
681 0x36000000};
682
683 const size_t X = length / 4;
684
685 // Can't happen, but make static analyzers happy
686 BOTAN_ASSERT_NOMSG(X == 4 || X == 6 || X == 8);
687
688 const size_t rounds = (length / 4) + 6;
689
690 // Help the optimizer
691 BOTAN_ASSERT_NOMSG(rounds == 10 || rounds == 12 || rounds == 14);
692
693 CT::poison(key, length);
694
695 EK.resize(length + 28);
696 DK.resize(length + 28);
697
698 for(size_t i = 0; i != X; ++i) {
699 EK[i] = load_be<uint32_t>(key, i);
700 }
701
702 for(size_t i = X; i < 4 * (rounds + 1); i += X) {
703 EK[i] = EK[i - X] ^ RC[(i - X) / X] ^ rotl<8>(SE_word(EK[i - 1]));
704
705 for(size_t j = 1; j != X && (i + j) < EK.size(); ++j) {
706 EK[i + j] = EK[i + j - X];
707
708 if(X == 8 && j == 4) {
709 EK[i + j] ^= SE_word(EK[i + j - 1]);
710 } else {
711 EK[i + j] ^= EK[i + j - 1];
712 }
713 }
714 }
715
716 for(size_t i = 0; i != 4 * (rounds + 1); i += 4) {
717 DK[i] = EK[4 * rounds - i];
718 DK[i + 1] = EK[4 * rounds - i + 1];
719 DK[i + 2] = EK[4 * rounds - i + 2];
720 DK[i + 3] = EK[4 * rounds - i + 3];
721 }
722
723 for(size_t i = 4; i != 4 * rounds; i += 4) {
724 InvMixColumn_x4(&DK[i]);
725 }
726
727 if(bswap_keys) {
728 // HW AES on little endian needs the subkeys to be byte reversed
729 for(size_t i = 0; i != EK.size(); ++i) {
730 EK[i] = reverse_bytes(EK[i]);
731 }
732 for(size_t i = 0; i != DK.size(); ++i) {
733 DK[i] = reverse_bytes(DK[i]);
734 }
735 }
736
737 CT::unpoison(EK.data(), EK.size());
738 CT::unpoison(DK.data(), DK.size());
739 CT::unpoison(key, length);
740}
741
/*
* Parallelism reported by the AES classes: 4 when hardware AES is compiled
* in and detected at runtime, otherwise 2 (both for vperm and for the
* bitsliced fallback).
*/
size_t aes_parallelism() {
   bool have_hw = false;
#if defined(BOTAN_HAS_HW_AES_SUPPORT)
   have_hw = CPUID::has_hw_aes();
#endif
   if(have_hw) {
      return 4;  // pipelined
   }

   bool have_vperm = false;
#if defined(BOTAN_HAS_AES_VPERM)
   have_vperm = CPUID::has_vperm();
#endif
   if(have_vperm) {
      return 2;  // pipelined
   }

   // bitsliced:
   return 2;
}
758
/*
* Name of the AES implementation in use: "cpu" when hardware AES is
* compiled in and detected at runtime, else "vperm" when the vperm
* implementation is compiled in and usable, else "base".
*/
const char* aes_provider() {
   bool have_hw = false;
#if defined(BOTAN_HAS_HW_AES_SUPPORT)
   have_hw = CPUID::has_hw_aes();
#endif
   if(have_hw) {
      return "cpu";
   }

   bool have_vperm = false;
#if defined(BOTAN_HAS_AES_VPERM)
   have_vperm = CPUID::has_vperm();
#endif
   if(have_vperm) {
      return "vperm";
   }

   return "base";
}
774
775} // namespace
776
777std::string AES_128::provider() const {
778 return aes_provider();
779}
780
781std::string AES_192::provider() const {
782 return aes_provider();
783}
784
785std::string AES_256::provider() const {
786 return aes_provider();
787}
788
789size_t AES_128::parallelism() const {
790 return aes_parallelism();
791}
792
793size_t AES_192::parallelism() const {
794 return aes_parallelism();
795}
796
797size_t AES_256::parallelism() const {
798 return aes_parallelism();
799}
800
802 return !m_EK.empty();
803}
804
806 return !m_EK.empty();
807}
808
810 return !m_EK.empty();
811}
812
813void AES_128::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
815
816#if defined(BOTAN_HAS_HW_AES_SUPPORT)
817 if(CPUID::has_hw_aes()) {
818 return hw_aes_encrypt_n(in, out, blocks);
819 }
820#endif
821
822#if defined(BOTAN_HAS_AES_VPERM)
823 if(CPUID::has_vperm()) {
824 return vperm_encrypt_n(in, out, blocks);
825 }
826#endif
827
828 aes_encrypt_n(in, out, blocks, m_EK);
829}
830
831void AES_128::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
833
834#if defined(BOTAN_HAS_HW_AES_SUPPORT)
835 if(CPUID::has_hw_aes()) {
836 return hw_aes_decrypt_n(in, out, blocks);
837 }
838#endif
839
840#if defined(BOTAN_HAS_AES_VPERM)
841 if(CPUID::has_vperm()) {
842 return vperm_decrypt_n(in, out, blocks);
843 }
844#endif
845
846 aes_decrypt_n(in, out, blocks, m_DK);
847}
848
849void AES_128::key_schedule(std::span<const uint8_t> key) {
850#if defined(BOTAN_HAS_AES_NI)
851 if(CPUID::has_aes_ni()) {
852 return aesni_key_schedule(key.data(), key.size());
853 }
854#endif
855
856#if defined(BOTAN_HAS_HW_AES_SUPPORT)
857 if(CPUID::has_hw_aes()) {
858 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, CPUID::is_little_endian());
859 }
860#endif
861
862#if defined(BOTAN_HAS_AES_VPERM)
863 if(CPUID::has_vperm()) {
864 return vperm_key_schedule(key.data(), key.size());
865 }
866#endif
867
868 aes_key_schedule(key.data(), key.size(), m_EK, m_DK);
869}
870
872 zap(m_EK);
873 zap(m_DK);
874}
875
876void AES_192::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
878
879#if defined(BOTAN_HAS_HW_AES_SUPPORT)
880 if(CPUID::has_hw_aes()) {
881 return hw_aes_encrypt_n(in, out, blocks);
882 }
883#endif
884
885#if defined(BOTAN_HAS_AES_VPERM)
886 if(CPUID::has_vperm()) {
887 return vperm_encrypt_n(in, out, blocks);
888 }
889#endif
890
891 aes_encrypt_n(in, out, blocks, m_EK);
892}
893
894void AES_192::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
896
897#if defined(BOTAN_HAS_HW_AES_SUPPORT)
898 if(CPUID::has_hw_aes()) {
899 return hw_aes_decrypt_n(in, out, blocks);
900 }
901#endif
902
903#if defined(BOTAN_HAS_AES_VPERM)
904 if(CPUID::has_vperm()) {
905 return vperm_decrypt_n(in, out, blocks);
906 }
907#endif
908
909 aes_decrypt_n(in, out, blocks, m_DK);
910}
911
912void AES_192::key_schedule(std::span<const uint8_t> key) {
913#if defined(BOTAN_HAS_AES_NI)
914 if(CPUID::has_aes_ni()) {
915 return aesni_key_schedule(key.data(), key.size());
916 }
917#endif
918
919#if defined(BOTAN_HAS_HW_AES_SUPPORT)
920 if(CPUID::has_hw_aes()) {
921 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, CPUID::is_little_endian());
922 }
923#endif
924
925#if defined(BOTAN_HAS_AES_VPERM)
926 if(CPUID::has_vperm()) {
927 return vperm_key_schedule(key.data(), key.size());
928 }
929#endif
930
931 aes_key_schedule(key.data(), key.size(), m_EK, m_DK);
932}
933
935 zap(m_EK);
936 zap(m_DK);
937}
938
939void AES_256::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
941
942#if defined(BOTAN_HAS_HW_AES_SUPPORT)
943 if(CPUID::has_hw_aes()) {
944 return hw_aes_encrypt_n(in, out, blocks);
945 }
946#endif
947
948#if defined(BOTAN_HAS_AES_VPERM)
949 if(CPUID::has_vperm()) {
950 return vperm_encrypt_n(in, out, blocks);
951 }
952#endif
953
954 aes_encrypt_n(in, out, blocks, m_EK);
955}
956
957void AES_256::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
959
960#if defined(BOTAN_HAS_HW_AES_SUPPORT)
961 if(CPUID::has_hw_aes()) {
962 return hw_aes_decrypt_n(in, out, blocks);
963 }
964#endif
965
966#if defined(BOTAN_HAS_AES_VPERM)
967 if(CPUID::has_vperm()) {
968 return vperm_decrypt_n(in, out, blocks);
969 }
970#endif
971
972 aes_decrypt_n(in, out, blocks, m_DK);
973}
974
975void AES_256::key_schedule(std::span<const uint8_t> key) {
976#if defined(BOTAN_HAS_AES_NI)
977 if(CPUID::has_aes_ni()) {
978 return aesni_key_schedule(key.data(), key.size());
979 }
980#endif
981
982#if defined(BOTAN_HAS_HW_AES_SUPPORT)
983 if(CPUID::has_hw_aes()) {
984 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, CPUID::is_little_endian());
985 }
986#endif
987
988#if defined(BOTAN_HAS_AES_VPERM)
989 if(CPUID::has_vperm()) {
990 return vperm_key_schedule(key.data(), key.size());
991 }
992#endif
993
994 aes_key_schedule(key.data(), key.size(), m_EK, m_DK);
995}
996
998 zap(m_EK);
999 zap(m_DK);
1000}
1001
1002} // namespace Botan
#define BOTAN_ASSERT_NOMSG(expr)
Definition assert.h:59
#define BOTAN_ASSERT(expr, assertion_made)
Definition assert.h:50
void decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition aes.cpp:831
std::string provider() const override
Definition aes.cpp:777
size_t parallelism() const override
Definition aes.cpp:789
bool has_keying_material() const override
Definition aes.cpp:801
void clear() override
Definition aes.cpp:871
void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition aes.cpp:813
bool has_keying_material() const override
Definition aes.cpp:805
void decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition aes.cpp:894
std::string provider() const override
Definition aes.cpp:781
size_t parallelism() const override
Definition aes.cpp:793
void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition aes.cpp:876
void clear() override
Definition aes.cpp:934
bool has_keying_material() const override
Definition aes.cpp:809
void clear() override
Definition aes.cpp:997
void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition aes.cpp:939
void decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition aes.cpp:957
std::string provider() const override
Definition aes.cpp:785
size_t parallelism() const override
Definition aes.cpp:797
static bool has_vperm()
Definition cpuid.h:277
static bool is_little_endian()
Definition cpuid.h:59
static bool has_hw_aes()
Definition cpuid.h:292
void assert_key_material_set() const
Definition sym_algo.h:139
FE_25519 X
Definition ge.cpp:25
constexpr void unpoison(const T *p, size_t n)
Definition ct_utils.h:57
constexpr void poison(const T *p, size_t n)
Definition ct_utils.h:46
void copy_out_be(std::span< uint8_t > out, InR &&in)
Definition loadstor.h:735
void zap(std::vector< T, Alloc > &vec)
Definition secmem.h:117
constexpr T rotl(T input)
Definition rotate.h:21
constexpr T rotr(T input)
Definition rotate.h:33
constexpr void swap_bits(T &x, T &y, T mask, size_t shift)
Definition bit_ops.h:186
constexpr T reverse_bytes(T x)
Definition bswap.h:24
std::vector< T, secure_allocator< T > > secure_vector
Definition secmem.h:61
constexpr auto load_be(ParamTs &&... params)
Definition loadstor.h:467
constexpr T bit_permute_step(T x, T mask, size_t shift)
Definition bit_ops.h:176