Botan 3.6.1
Crypto and TLS for C++
aes.cpp
1/*
2* (C) 1999-2010,2015,2017,2018,2020 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/aes.h>
8
9#include <botan/internal/bit_ops.h>
10#include <botan/internal/bswap.h>
11#include <botan/internal/cpuid.h>
12#include <botan/internal/ct_utils.h>
13#include <botan/internal/loadstor.h>
14#include <botan/internal/rotate.h>
15
16namespace Botan {
17
18#if defined(BOTAN_HAS_AES_POWER8) || defined(BOTAN_HAS_AES_ARMV8) || defined(BOTAN_HAS_AES_NI)
19 #define BOTAN_HAS_HW_AES_SUPPORT
20#endif
21
22/*
23* One of three AES implementation strategies is used to get a constant time
24* implementation which is immune to common cache/timing based side channels:
25*
26* - If AES hardware support is available (AES-NI, POWER8, Aarch64) use that
27*
28* - If 128-bit SIMD with byte shuffles is available (SSSE3, NEON, or Altivec),
29* use the vperm technique published by Mike Hamburg at CHES 2009.
30*
31* - If no hardware or SIMD support, fall back to a constant time bitsliced
32* implementation. This uses 32-bit words resulting in 2 blocks being processed
33* in parallel. Moving to 4 blocks (with 64-bit words) would approximately
34* double performance on 64-bit CPUs. Likewise moving to 128 bit SIMD would
35* again approximately double performance vs 64-bit. However the assumption is
36* that most 64-bit CPUs either have hardware AES or SIMD shuffle support and
37* that the majority of users falling back to this code will be running on 32-bit cores.
38* If this assumption proves to be unsound, the bitsliced code can easily be
39* extended to operate on either 32 or 64 bit words depending on the native
40* wordsize of the target processor.
41*
42* Useful references
43*
44* - "Accelerating AES with Vector Permute Instructions" Mike Hamburg
45* https://www.shiftleft.org/papers/vector_aes/vector_aes.pdf
46*
47* - "Faster and Timing-Attack Resistant AES-GCM" Käsper and Schwabe
48* https://eprint.iacr.org/2009/129.pdf
49*
50* - "A new combinational logic minimization technique with applications to cryptology."
51* Boyar and Peralta https://eprint.iacr.org/2009/191.pdf
52*
53* - "A depth-16 circuit for the AES S-box" Boyar and Peralta
54* https://eprint.iacr.org/2011/332.pdf
55*
56* - "A Very Compact S-box for AES" Canright
57* https://www.iacr.org/archive/ches2005/032.pdf
58* https://core.ac.uk/download/pdf/36694529.pdf (extended)
59*/
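
/*
Illustrative arithmetic for the parallelism claims above (a sketch, not part of
the listed source): the bitsliced state is 8 words, one per bit position, so a
pass over W-bit words covers 8*W bits, while an AES block is 128 bits. This
matches BITSLICED_BLOCKS computed further down.
*/
static_assert(8 * 32 / 128 == 2, "32-bit words give 2 blocks per bitsliced pass");
static_assert(8 * 64 / 128 == 4, "64-bit words would give 4 blocks per pass");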
60
61namespace {
62
63/*
64This is an AES sbox circuit which can execute in bitsliced mode up to 32x in
65parallel.
66
67The circuit is from the "Circuit Minimization Team" group
68http://www.cs.yale.edu/homes/peralta/CircuitStuff/CMT.html
69http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt
70
71This circuit has size 113 and depth 27. In software it is much faster than
72circuits which are considered faster for hardware purposes (where circuit depth
73is the critical constraint), because unlike in hardware, on common CPUs we can
74only execute - at best - 3 or 4 logic operations per cycle. So a smaller circuit
75is superior. On an x86-64 machine this circuit is about 15% faster than the
76circuit of size 128 and depth 16 given in "A depth-16 circuit for the AES S-box".
77
78Another circuit for AES Sbox of size 102 and depth 24 is described in "New
79Circuit Minimization Techniques for Smaller and Faster AES SBoxes"
80[https://eprint.iacr.org/2019/802]. However, it relies on "non-standard" gates
81like MUX, NOR, NAND, etc., and so in practice, in bitsliced software, its size is
82actually a bit larger than this circuit's, as few CPUs have such instructions and
83they must otherwise be emulated using a sequence of available bit operations.
84*/
85void AES_SBOX(uint32_t V[8]) {
86 const uint32_t U0 = V[0];
87 const uint32_t U1 = V[1];
88 const uint32_t U2 = V[2];
89 const uint32_t U3 = V[3];
90 const uint32_t U4 = V[4];
91 const uint32_t U5 = V[5];
92 const uint32_t U6 = V[6];
93 const uint32_t U7 = V[7];
94
95 const uint32_t y14 = U3 ^ U5;
96 const uint32_t y13 = U0 ^ U6;
97 const uint32_t y9 = U0 ^ U3;
98 const uint32_t y8 = U0 ^ U5;
99 const uint32_t t0 = U1 ^ U2;
100 const uint32_t y1 = t0 ^ U7;
101 const uint32_t y4 = y1 ^ U3;
102 const uint32_t y12 = y13 ^ y14;
103 const uint32_t y2 = y1 ^ U0;
104 const uint32_t y5 = y1 ^ U6;
105 const uint32_t y3 = y5 ^ y8;
106 const uint32_t t1 = U4 ^ y12;
107 const uint32_t y15 = t1 ^ U5;
108 const uint32_t y20 = t1 ^ U1;
109 const uint32_t y6 = y15 ^ U7;
110 const uint32_t y10 = y15 ^ t0;
111 const uint32_t y11 = y20 ^ y9;
112 const uint32_t y7 = U7 ^ y11;
113 const uint32_t y17 = y10 ^ y11;
114 const uint32_t y19 = y10 ^ y8;
115 const uint32_t y16 = t0 ^ y11;
116 const uint32_t y21 = y13 ^ y16;
117 const uint32_t y18 = U0 ^ y16;
118 const uint32_t t2 = y12 & y15;
119 const uint32_t t3 = y3 & y6;
120 const uint32_t t4 = t3 ^ t2;
121 const uint32_t t5 = y4 & U7;
122 const uint32_t t6 = t5 ^ t2;
123 const uint32_t t7 = y13 & y16;
124 const uint32_t t8 = y5 & y1;
125 const uint32_t t9 = t8 ^ t7;
126 const uint32_t t10 = y2 & y7;
127 const uint32_t t11 = t10 ^ t7;
128 const uint32_t t12 = y9 & y11;
129 const uint32_t t13 = y14 & y17;
130 const uint32_t t14 = t13 ^ t12;
131 const uint32_t t15 = y8 & y10;
132 const uint32_t t16 = t15 ^ t12;
133 const uint32_t t17 = t4 ^ y20;
134 const uint32_t t18 = t6 ^ t16;
135 const uint32_t t19 = t9 ^ t14;
136 const uint32_t t20 = t11 ^ t16;
137 const uint32_t t21 = t17 ^ t14;
138 const uint32_t t22 = t18 ^ y19;
139 const uint32_t t23 = t19 ^ y21;
140 const uint32_t t24 = t20 ^ y18;
141 const uint32_t t25 = t21 ^ t22;
142 const uint32_t t26 = t21 & t23;
143 const uint32_t t27 = t24 ^ t26;
144 const uint32_t t28 = t25 & t27;
145 const uint32_t t29 = t28 ^ t22;
146 const uint32_t t30 = t23 ^ t24;
147 const uint32_t t31 = t22 ^ t26;
148 const uint32_t t32 = t31 & t30;
149 const uint32_t t33 = t32 ^ t24;
150 const uint32_t t34 = t23 ^ t33;
151 const uint32_t t35 = t27 ^ t33;
152 const uint32_t t36 = t24 & t35;
153 const uint32_t t37 = t36 ^ t34;
154 const uint32_t t38 = t27 ^ t36;
155 const uint32_t t39 = t29 & t38;
156 const uint32_t t40 = t25 ^ t39;
157 const uint32_t t41 = t40 ^ t37;
158 const uint32_t t42 = t29 ^ t33;
159 const uint32_t t43 = t29 ^ t40;
160 const uint32_t t44 = t33 ^ t37;
161 const uint32_t t45 = t42 ^ t41;
162 const uint32_t z0 = t44 & y15;
163 const uint32_t z1 = t37 & y6;
164 const uint32_t z2 = t33 & U7;
165 const uint32_t z3 = t43 & y16;
166 const uint32_t z4 = t40 & y1;
167 const uint32_t z5 = t29 & y7;
168 const uint32_t z6 = t42 & y11;
169 const uint32_t z7 = t45 & y17;
170 const uint32_t z8 = t41 & y10;
171 const uint32_t z9 = t44 & y12;
172 const uint32_t z10 = t37 & y3;
173 const uint32_t z11 = t33 & y4;
174 const uint32_t z12 = t43 & y13;
175 const uint32_t z13 = t40 & y5;
176 const uint32_t z14 = t29 & y2;
177 const uint32_t z15 = t42 & y9;
178 const uint32_t z16 = t45 & y14;
179 const uint32_t z17 = t41 & y8;
180 const uint32_t tc1 = z15 ^ z16;
181 const uint32_t tc2 = z10 ^ tc1;
182 const uint32_t tc3 = z9 ^ tc2;
183 const uint32_t tc4 = z0 ^ z2;
184 const uint32_t tc5 = z1 ^ z0;
185 const uint32_t tc6 = z3 ^ z4;
186 const uint32_t tc7 = z12 ^ tc4;
187 const uint32_t tc8 = z7 ^ tc6;
188 const uint32_t tc9 = z8 ^ tc7;
189 const uint32_t tc10 = tc8 ^ tc9;
190 const uint32_t tc11 = tc6 ^ tc5;
191 const uint32_t tc12 = z3 ^ z5;
192 const uint32_t tc13 = z13 ^ tc1;
193 const uint32_t tc14 = tc4 ^ tc12;
194 const uint32_t S3 = tc3 ^ tc11;
195 const uint32_t tc16 = z6 ^ tc8;
196 const uint32_t tc17 = z14 ^ tc10;
197 const uint32_t tc18 = ~tc13 ^ tc14;
198 const uint32_t S7 = z12 ^ tc18;
199 const uint32_t tc20 = z15 ^ tc16;
200 const uint32_t tc21 = tc2 ^ z11;
201 const uint32_t S0 = tc3 ^ tc16;
202 const uint32_t S6 = tc10 ^ tc18;
203 const uint32_t S4 = tc14 ^ S3;
204 const uint32_t S1 = ~(S3 ^ tc16);
205 const uint32_t tc26 = tc17 ^ tc20;
206 const uint32_t S2 = ~(tc26 ^ z17);
207 const uint32_t S5 = tc21 ^ tc17;
208
209 V[0] = S0;
210 V[1] = S1;
211 V[2] = S2;
212 V[3] = S3;
213 V[4] = S4;
214 V[5] = S5;
215 V[6] = S6;
216 V[7] = S7;
217}
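
/*
Single-byte check (an illustrative sketch, not part of the listed source; the
helper name is hypothetical): bit 7 of the byte goes into V[0] and bit 0 into
V[7], the same packing SE_word() uses further down. Per the FIPS-197 S-box
table, S(0x00) == 0x63 and S(0x53) == 0xED.
*/
inline uint8_t aes_sbox_one_byte(uint8_t x) {
   uint32_t V[8] = {0};
   for(size_t i = 0; i != 8; ++i) {
      V[i] = (x >> (7 - i)) & 1;  // place bit (7-i) of x into lane 0 of V[i]
   }
   AES_SBOX(V);
   uint8_t s = 0;
   for(size_t i = 0; i != 8; ++i) {
      s |= static_cast<uint8_t>((V[i] & 1) << (7 - i));  // recompose the output byte
   }
   return s;  // aes_sbox_one_byte(0x00) == 0x63, aes_sbox_one_byte(0x53) == 0xED
}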
218
219/*
220A circuit for inverse AES Sbox of size 121 and depth 21 from
221http://www.cs.yale.edu/homes/peralta/CircuitStuff/CMT.html
222http://www.cs.yale.edu/homes/peralta/CircuitStuff/Sinv.txt
223*/
224void AES_INV_SBOX(uint32_t V[8]) {
225 const uint32_t U0 = V[0];
226 const uint32_t U1 = V[1];
227 const uint32_t U2 = V[2];
228 const uint32_t U3 = V[3];
229 const uint32_t U4 = V[4];
230 const uint32_t U5 = V[5];
231 const uint32_t U6 = V[6];
232 const uint32_t U7 = V[7];
233
234 const uint32_t Y0 = U0 ^ U3;
235 const uint32_t Y2 = ~(U1 ^ U3);
236 const uint32_t Y4 = U0 ^ Y2;
237 const uint32_t RTL0 = U6 ^ U7;
238 const uint32_t Y1 = Y2 ^ RTL0;
239 const uint32_t Y7 = ~(U2 ^ Y1);
240 const uint32_t RTL1 = U3 ^ U4;
241 const uint32_t Y6 = ~(U7 ^ RTL1);
242 const uint32_t Y3 = Y1 ^ RTL1;
243 const uint32_t RTL2 = ~(U0 ^ U2);
244 const uint32_t Y5 = U5 ^ RTL2;
245 const uint32_t sa1 = Y0 ^ Y2;
246 const uint32_t sa0 = Y1 ^ Y3;
247 const uint32_t sb1 = Y4 ^ Y6;
248 const uint32_t sb0 = Y5 ^ Y7;
249 const uint32_t ah = Y0 ^ Y1;
250 const uint32_t al = Y2 ^ Y3;
251 const uint32_t aa = sa0 ^ sa1;
252 const uint32_t bh = Y4 ^ Y5;
253 const uint32_t bl = Y6 ^ Y7;
254 const uint32_t bb = sb0 ^ sb1;
255 const uint32_t ab20 = sa0 ^ sb0;
256 const uint32_t ab22 = al ^ bl;
257 const uint32_t ab23 = Y3 ^ Y7;
258 const uint32_t ab21 = sa1 ^ sb1;
259 const uint32_t abcd1 = ah & bh;
260 const uint32_t rr1 = Y0 & Y4;
261 const uint32_t ph11 = ab20 ^ abcd1;
262 const uint32_t t01 = Y1 & Y5;
263 const uint32_t ph01 = t01 ^ abcd1;
264 const uint32_t abcd2 = al & bl;
265 const uint32_t r1 = Y2 & Y6;
266 const uint32_t pl11 = ab22 ^ abcd2;
267 const uint32_t r2 = Y3 & Y7;
268 const uint32_t pl01 = r2 ^ abcd2;
269 const uint32_t r3 = sa0 & sb0;
270 const uint32_t vr1 = aa & bb;
271 const uint32_t pr1 = vr1 ^ r3;
272 const uint32_t wr1 = sa1 & sb1;
273 const uint32_t qr1 = wr1 ^ r3;
274 const uint32_t ab0 = ph11 ^ rr1;
275 const uint32_t ab1 = ph01 ^ ab21;
276 const uint32_t ab2 = pl11 ^ r1;
277 const uint32_t ab3 = pl01 ^ qr1;
278 const uint32_t cp1 = ab0 ^ pr1;
279 const uint32_t cp2 = ab1 ^ qr1;
280 const uint32_t cp3 = ab2 ^ pr1;
281 const uint32_t cp4 = ab3 ^ ab23;
282 const uint32_t tinv1 = cp3 ^ cp4;
283 const uint32_t tinv2 = cp3 & cp1;
284 const uint32_t tinv3 = cp2 ^ tinv2;
285 const uint32_t tinv4 = cp1 ^ cp2;
286 const uint32_t tinv5 = cp4 ^ tinv2;
287 const uint32_t tinv6 = tinv5 & tinv4;
288 const uint32_t tinv7 = tinv3 & tinv1;
289 const uint32_t d2 = cp4 ^ tinv7;
290 const uint32_t d0 = cp2 ^ tinv6;
291 const uint32_t tinv8 = cp1 & cp4;
292 const uint32_t tinv9 = tinv4 & tinv8;
293 const uint32_t tinv10 = tinv4 ^ tinv2;
294 const uint32_t d1 = tinv9 ^ tinv10;
295 const uint32_t tinv11 = cp2 & cp3;
296 const uint32_t tinv12 = tinv1 & tinv11;
297 const uint32_t tinv13 = tinv1 ^ tinv2;
298 const uint32_t d3 = tinv12 ^ tinv13;
299 const uint32_t sd1 = d1 ^ d3;
300 const uint32_t sd0 = d0 ^ d2;
301 const uint32_t dl = d0 ^ d1;
302 const uint32_t dh = d2 ^ d3;
303 const uint32_t dd = sd0 ^ sd1;
304 const uint32_t abcd3 = dh & bh;
305 const uint32_t rr2 = d3 & Y4;
306 const uint32_t t02 = d2 & Y5;
307 const uint32_t abcd4 = dl & bl;
308 const uint32_t r4 = d1 & Y6;
309 const uint32_t r5 = d0 & Y7;
310 const uint32_t r6 = sd0 & sb0;
311 const uint32_t vr2 = dd & bb;
312 const uint32_t wr2 = sd1 & sb1;
313 const uint32_t abcd5 = dh & ah;
314 const uint32_t r7 = d3 & Y0;
315 const uint32_t r8 = d2 & Y1;
316 const uint32_t abcd6 = dl & al;
317 const uint32_t r9 = d1 & Y2;
318 const uint32_t r10 = d0 & Y3;
319 const uint32_t r11 = sd0 & sa0;
320 const uint32_t vr3 = dd & aa;
321 const uint32_t wr3 = sd1 & sa1;
322 const uint32_t ph12 = rr2 ^ abcd3;
323 const uint32_t ph02 = t02 ^ abcd3;
324 const uint32_t pl12 = r4 ^ abcd4;
325 const uint32_t pl02 = r5 ^ abcd4;
326 const uint32_t pr2 = vr2 ^ r6;
327 const uint32_t qr2 = wr2 ^ r6;
328 const uint32_t p0 = ph12 ^ pr2;
329 const uint32_t p1 = ph02 ^ qr2;
330 const uint32_t p2 = pl12 ^ pr2;
331 const uint32_t p3 = pl02 ^ qr2;
332 const uint32_t ph13 = r7 ^ abcd5;
333 const uint32_t ph03 = r8 ^ abcd5;
334 const uint32_t pl13 = r9 ^ abcd6;
335 const uint32_t pl03 = r10 ^ abcd6;
336 const uint32_t pr3 = vr3 ^ r11;
337 const uint32_t qr3 = wr3 ^ r11;
338 const uint32_t p4 = ph13 ^ pr3;
339 const uint32_t S7 = ph03 ^ qr3;
340 const uint32_t p6 = pl13 ^ pr3;
341 const uint32_t p7 = pl03 ^ qr3;
342 const uint32_t S3 = p1 ^ p6;
343 const uint32_t S6 = p2 ^ p6;
344 const uint32_t S0 = p3 ^ p6;
345 const uint32_t X11 = p0 ^ p2;
346 const uint32_t S5 = S0 ^ X11;
347 const uint32_t X13 = p4 ^ p7;
348 const uint32_t X14 = X11 ^ X13;
349 const uint32_t S1 = S3 ^ X14;
350 const uint32_t X16 = p1 ^ S7;
351 const uint32_t S2 = X14 ^ X16;
352 const uint32_t X18 = p0 ^ p4;
353 const uint32_t X19 = S5 ^ X16;
354 const uint32_t S4 = X18 ^ X19;
355
356 V[0] = S0;
357 V[1] = S1;
358 V[2] = S2;
359 V[3] = S3;
360 V[4] = S4;
361 V[5] = S5;
362 V[6] = S6;
363 V[7] = S7;
364}
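
/*
Analogous single-byte check for the inverse circuit (illustrative only, not in
the upstream file; the helper name is hypothetical), using the same MSB-first
packing as the forward example above: InvS(0x63) == 0x00 and InvS(0xED) == 0x53.
*/
inline uint8_t aes_inv_sbox_one_byte(uint8_t x) {
   uint32_t V[8] = {0};
   for(size_t i = 0; i != 8; ++i) {
      V[i] = (x >> (7 - i)) & 1;
   }
   AES_INV_SBOX(V);
   uint8_t s = 0;
   for(size_t i = 0; i != 8; ++i) {
      s |= static_cast<uint8_t>((V[i] & 1) << (7 - i));
   }
   return s;  // aes_inv_sbox_one_byte(0x63) == 0x00, aes_inv_sbox_one_byte(0xED) == 0x53
}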
365
366inline void bit_transpose(uint32_t B[8]) {
367 swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1);
368 swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1);
369 swap_bits<uint32_t>(B[5], B[4], 0x55555555, 1);
370 swap_bits<uint32_t>(B[7], B[6], 0x55555555, 1);
371
372 swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2);
373 swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2);
374 swap_bits<uint32_t>(B[6], B[4], 0x33333333, 2);
375 swap_bits<uint32_t>(B[7], B[5], 0x33333333, 2);
376
377 swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4);
378 swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4);
379 swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4);
380 swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4);
381}
382
383inline void ks_expand(uint32_t B[8], const uint32_t K[], size_t r) {
384 /*
385 This is the bit_transpose of K[r..r+4] || K[r..r+4]; we can save some computation
386 since the first and second halves are the same data.
387 */
388 for(size_t i = 0; i != 4; ++i) {
389 B[i] = K[r + i];
390 }
391
392 swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1);
393 swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1);
394
395 swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2);
396 swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2);
397
398 B[4] = B[0];
399 B[5] = B[1];
400 B[6] = B[2];
401 B[7] = B[3];
402
403 swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4);
404 swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4);
405 swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4);
406 swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4);
407}
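
/*
Equivalent formulation (a sketch, not part of the listed source; the helper name
is hypothetical): per the comment above, ks_expand(B, K, r) matches duplicating
the four round-key words and running the full bit_transpose, which makes for an
easy cross-check.
*/
inline void ks_expand_reference(uint32_t B[8], const uint32_t K[], size_t r) {
   for(size_t i = 0; i != 4; ++i) {
      B[i] = B[i + 4] = K[r + i];  // K[r..r+4] || K[r..r+4]
   }
   bit_transpose(B);  // expected to equal ks_expand(B, K, r)
}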
408
409inline void shift_rows(uint32_t B[8]) {
410 // 3 0 1 2 7 4 5 6 10 11 8 9 14 15 12 13 17 18 19 16 21 22 23 20 24 25 26 27 28 29 30 31
411#if defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT)
412 for(size_t i = 0; i != 8; i += 2) {
413 uint64_t x = (static_cast<uint64_t>(B[i]) << 32) | B[i + 1];
414 x = bit_permute_step<uint64_t>(x, 0x0022331100223311, 2);
415 x = bit_permute_step<uint64_t>(x, 0x0055005500550055, 1);
416 B[i] = static_cast<uint32_t>(x >> 32);
417 B[i + 1] = static_cast<uint32_t>(x);
418 }
419#else
420 for(size_t i = 0; i != 8; ++i) {
421 uint32_t x = B[i];
422 x = bit_permute_step<uint32_t>(x, 0x00223311, 2);
423 x = bit_permute_step<uint32_t>(x, 0x00550055, 1);
424 B[i] = x;
425 }
426#endif
427}
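
/*
For reference (an illustrative sketch, not part of the listed source; the helper
name is hypothetical), the byte-level ShiftRows that this bitsliced permutation
corresponds to: with the FIPS-197 column-major state (state[row][col] =
block[4*col + row]), row r is rotated left by r positions.
*/
inline void shift_rows_bytes_reference(uint8_t block[16]) {
   uint8_t out[16];
   for(size_t row = 0; row != 4; ++row) {
      for(size_t col = 0; col != 4; ++col) {
         out[4 * col + row] = block[4 * ((col + row) % 4) + row];
      }
   }
   for(size_t i = 0; i != 16; ++i) {
      block[i] = out[i];
   }
}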
428
429inline void inv_shift_rows(uint32_t B[8]) {
430 // Inverse of shift_rows, just inverting the steps
431
432#if defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT)
433 for(size_t i = 0; i != 8; i += 2) {
434 uint64_t x = (static_cast<uint64_t>(B[i]) << 32) | B[i + 1];
435 x = bit_permute_step<uint64_t>(x, 0x0055005500550055, 1);
436 x = bit_permute_step<uint64_t>(x, 0x0022331100223311, 2);
437 B[i] = static_cast<uint32_t>(x >> 32);
438 B[i + 1] = static_cast<uint32_t>(x);
439 }
440#else
441 for(size_t i = 0; i != 8; ++i) {
442 uint32_t x = B[i];
443 x = bit_permute_step<uint32_t>(x, 0x00550055, 1);
444 x = bit_permute_step<uint32_t>(x, 0x00223311, 2);
445 B[i] = x;
446 }
447#endif
448}
449
450inline void mix_columns(uint32_t B[8]) {
451 // carry high bits in B[0] to positions in 0x1b == 0b11011
452 const uint32_t X2[8] = {
453 B[1],
454 B[2],
455 B[3],
456 B[4] ^ B[0],
457 B[5] ^ B[0],
458 B[6],
459 B[7] ^ B[0],
460 B[0],
461 };
462
463 for(size_t i = 0; i != 8; i++) {
464 const uint32_t X3 = B[i] ^ X2[i];
465 B[i] = X2[i] ^ rotr<8>(B[i]) ^ rotr<16>(B[i]) ^ rotr<24>(X3);
466 }
467}
468
469void inv_mix_columns(uint32_t B[8]) {
470 /*
471 OpenSSL's bsaes implementation credits Jussi Kivilinna with the lovely
472 matrix decomposition
473
474 | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
475 | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
476 | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
477 | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
478
479 Notice the first component is simply the MixColumns matrix. So we can
480 multiply first by (05,00,04,00) then perform MixColumns to get the equivalent
481 of InvMixColumn.
482 */
483 const uint32_t X4[8] = {
484 B[2],
485 B[3],
486 B[4] ^ B[0],
487 B[5] ^ B[0] ^ B[1],
488 B[6] ^ B[1],
489 B[7] ^ B[0],
490 B[0] ^ B[1],
491 B[1],
492 };
493
494 for(size_t i = 0; i != 8; i++) {
495 const uint32_t X5 = X4[i] ^ B[i];
496 B[i] = X5 ^ rotr<16>(X4[i]);
497 }
498
499 mix_columns(B);
500}
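
/*
A small check of the matrix identity above (an illustrative sketch, not part of
the listed source; helper names are hypothetical): multiplying the two circulant
factors over GF(2^8) with the AES polynomial x^8 + x^4 + x^3 + x + 1 reproduces
the InvMixColumns matrix. Since a product of circulants is circulant, checking
the first row suffices.
*/
inline uint8_t gf256_mul(uint8_t a, uint8_t b) {
   uint8_t p = 0;
   for(size_t i = 0; i != 8; ++i) {
      if(b & 1) {
         p ^= a;
      }
      const uint8_t hi = a & 0x80;
      a = static_cast<uint8_t>(a << 1);
      if(hi) {
         a ^= 0x1B;  // reduce modulo the AES polynomial
      }
      b >>= 1;
   }
   return p;
}

inline bool inv_mix_columns_identity_holds() {
   const uint8_t MC[4] = {0x02, 0x03, 0x01, 0x01};   // first row of MixColumns
   const uint8_t D[4] = {0x05, 0x00, 0x04, 0x00};    // first row of the second factor
   const uint8_t IMC[4] = {0x0E, 0x0B, 0x0D, 0x09};  // first row of InvMixColumns
   for(size_t k = 0; k != 4; ++k) {
      uint8_t acc = 0;
      for(size_t j = 0; j != 4; ++j) {
         acc ^= gf256_mul(MC[j], D[(k + 4 - j) % 4]);
      }
      if(acc != IMC[k]) {
         return false;
      }
   }
   return true;
}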
501
502/*
503* AES Encryption
504*/
505void aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, const secure_vector<uint32_t>& EK) {
506 BOTAN_ASSERT(EK.size() == 44 || EK.size() == 52 || EK.size() == 60, "Key was set");
507
508 const size_t rounds = (EK.size() - 4) / 4;
509
510 uint32_t KS[13 * 8] = {0}; // actual maximum is (rounds - 1) * 8
511 for(size_t i = 0; i < rounds - 1; i += 1) {
512 ks_expand(&KS[8 * i], EK.data(), 4 * i + 4);
513 }
514
515 const size_t BLOCK_SIZE = 16;
516 const size_t BITSLICED_BLOCKS = 8 * sizeof(uint32_t) / BLOCK_SIZE;
517
518 while(blocks > 0) {
519 const size_t this_loop = std::min(blocks, BITSLICED_BLOCKS);
520
521 uint32_t B[8] = {0};
522
523 load_be(B, in, this_loop * 4);
524
525 CT::poison(B, 8);
526
527 for(size_t i = 0; i != 8; ++i) {
528 B[i] ^= EK[i % 4];
529 }
530
531 bit_transpose(B);
532
533 for(size_t r = 0; r != rounds - 1; ++r) {
534 AES_SBOX(B);
535 shift_rows(B);
536 mix_columns(B);
537
538 for(size_t i = 0; i != 8; ++i) {
539 B[i] ^= KS[8 * r + i];
540 }
541 }
542
543 // Final round:
544 AES_SBOX(B);
545 shift_rows(B);
546 bit_transpose(B);
547
548 for(size_t i = 0; i != 8; ++i) {
549 B[i] ^= EK[4 * rounds + i % 4];
550 }
551
552 CT::unpoison(B, 8);
553
554 copy_out_be(std::span(out, this_loop * 4 * sizeof(uint32_t)), B);
555
556 in += this_loop * BLOCK_SIZE;
557 out += this_loop * BLOCK_SIZE;
558 blocks -= this_loop;
559 }
560}
561
562/*
563* AES Decryption
564*/
565void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, const secure_vector<uint32_t>& DK) {
566 BOTAN_ASSERT(DK.size() == 44 || DK.size() == 52 || DK.size() == 60, "Key was set");
567
568 const size_t rounds = (DK.size() - 4) / 4;
569
570 uint32_t KS[13 * 8] = {0}; // actual maximum is (rounds - 1) * 8
571 for(size_t i = 0; i < rounds - 1; i += 1) {
572 ks_expand(&KS[8 * i], DK.data(), 4 * i + 4);
573 }
574
575 const size_t BLOCK_SIZE = 16;
576 const size_t BITSLICED_BLOCKS = 8 * sizeof(uint32_t) / BLOCK_SIZE;
577
578 while(blocks > 0) {
579 const size_t this_loop = std::min(blocks, BITSLICED_BLOCKS);
580
581 uint32_t B[8] = {0};
582
583 CT::poison(B, 8);
584
585 load_be(B, in, this_loop * 4);
586
587 for(size_t i = 0; i != 8; ++i) {
588 B[i] ^= DK[i % 4];
589 }
590
591 bit_transpose(B);
592
593 for(size_t r = 0; r != rounds - 1; ++r) {
594 AES_INV_SBOX(B);
595 inv_shift_rows(B);
596 inv_mix_columns(B);
597
598 for(size_t i = 0; i != 8; ++i) {
599 B[i] ^= KS[8 * r + i];
600 }
601 }
602
603 // Final round:
604 AES_INV_SBOX(B);
605 inv_shift_rows(B);
606 bit_transpose(B);
607
608 for(size_t i = 0; i != 8; ++i) {
609 B[i] ^= DK[4 * rounds + i % 4];
610 }
611
612 CT::unpoison(B, 8);
613
614 copy_out_be(std::span(out, this_loop * 4 * sizeof(uint32_t)), B);
615
616 in += this_loop * BLOCK_SIZE;
617 out += this_loop * BLOCK_SIZE;
618 blocks -= this_loop;
619 }
620}
621
622inline uint32_t xtime32(uint32_t s) {
623 const uint32_t lo_bit = 0x01010101;
624 const uint32_t mask = 0x7F7F7F7F;
625 const uint32_t poly = 0x1B;
626
627 return ((s & mask) << 1) ^ (((s >> 7) & lo_bit) * poly);
628}
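
/*
Worked values (a sketch, not part of the listed source): xtime32 doubles each of
the four packed bytes in GF(2^8). Per FIPS-197, {57}*{02} = {ae}, so
xtime32(0x57575757) == 0xAEAEAEAE; the byte 0x80 reduces to 0x1B modulo the AES
polynomial, so xtime32(0x01020480) == 0x0204081B.
*/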
629
630inline uint32_t InvMixColumn(uint32_t s1) {
631 const uint32_t s2 = xtime32(s1);
632 const uint32_t s4 = xtime32(s2);
633 const uint32_t s8 = xtime32(s4);
634 const uint32_t s9 = s8 ^ s1;
635 const uint32_t s11 = s9 ^ s2;
636 const uint32_t s13 = s9 ^ s4;
637 const uint32_t s14 = s8 ^ s4 ^ s2;
638
639 return s14 ^ rotr<8>(s9) ^ rotr<16>(s13) ^ rotr<24>(s11);
640}
641
642void InvMixColumn_x4(uint32_t x[4]) {
643 x[0] = InvMixColumn(x[0]);
644 x[1] = InvMixColumn(x[1]);
645 x[2] = InvMixColumn(x[2]);
646 x[3] = InvMixColumn(x[3]);
647}
648
649uint32_t SE_word(uint32_t x) {
650 uint32_t I[8] = {0};
651
652 for(size_t i = 0; i != 8; ++i) {
653 I[i] = (x >> (7 - i)) & 0x01010101;
654 }
655
656 AES_SBOX(I);
657
658 x = 0;
659
660 for(size_t i = 0; i != 8; ++i) {
661 x |= ((I[i] & 0x01010101) << (7 - i));
662 }
663
664 return x;
665}
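
/*
Usage example (illustrative only, not in the upstream file): SE_word applies the
AES S-box to each byte of a word via the bitsliced circuit. Per the FIPS-197
S-box table, S(0x53) = 0xED and S(0x00) = 0x63, so
SE_word(0x53000053) == 0xED6363ED.
*/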
666
667void aes_key_schedule(const uint8_t key[],
668 size_t length,
669 secure_vector<uint32_t>& EK,
670 secure_vector<uint32_t>& DK,
671 bool bswap_keys = false) {
672 static const uint32_t RC[10] = {0x01000000,
673 0x02000000,
674 0x04000000,
675 0x08000000,
676 0x10000000,
677 0x20000000,
678 0x40000000,
679 0x80000000,
680 0x1B000000,
681 0x36000000};
682
683 const size_t X = length / 4;
684
685 // Can't happen, but make static analyzers happy
686 BOTAN_ASSERT_NOMSG(X == 4 || X == 6 || X == 8);
687
688 const size_t rounds = (length / 4) + 6;
689
690 // Help the optimizer
691 BOTAN_ASSERT_NOMSG(rounds == 10 || rounds == 12 || rounds == 14);
692
693 CT::poison(key, length);
694
695 EK.resize(length + 28);
696 DK.resize(length + 28);
697
698 for(size_t i = 0; i != X; ++i) {
699 EK[i] = load_be<uint32_t>(key, i);
700 }
701
702 for(size_t i = X; i < 4 * (rounds + 1); i += X) {
703 EK[i] = EK[i - X] ^ RC[(i - X) / X] ^ rotl<8>(SE_word(EK[i - 1]));
704
705 for(size_t j = 1; j != X && (i + j) < EK.size(); ++j) {
706 EK[i + j] = EK[i + j - X];
707
708 if(X == 8 && j == 4) {
709 EK[i + j] ^= SE_word(EK[i + j - 1]);
710 } else {
711 EK[i + j] ^= EK[i + j - 1];
712 }
713 }
714 }
715
716 for(size_t i = 0; i != 4 * (rounds + 1); i += 4) {
717 DK[i] = EK[4 * rounds - i];
718 DK[i + 1] = EK[4 * rounds - i + 1];
719 DK[i + 2] = EK[4 * rounds - i + 2];
720 DK[i + 3] = EK[4 * rounds - i + 3];
721 }
722
723 for(size_t i = 4; i != 4 * rounds; i += 4) {
724 InvMixColumn_x4(&DK[i]);
725 }
726
727 if(bswap_keys) {
728 // HW AES on little endian needs the subkeys to be byte reversed
729 for(size_t i = 0; i != EK.size(); ++i) {
730 EK[i] = reverse_bytes(EK[i]);
731 }
732 for(size_t i = 0; i != DK.size(); ++i) {
733 DK[i] = reverse_bytes(DK[i]);
734 }
735 }
736
737 CT::unpoison(EK.data(), EK.size());
738 CT::unpoison(DK.data(), DK.size());
739 CT::unpoison(key, length);
740}
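
/*
A worked value (an illustrative sketch, not part of the listed source), using
the AES-128 test key of FIPS-197 Appendix A.1, 2b7e1516 28aed2a6 abf71588
09cf4f3c, with bswap_keys = false: EK[0..3] hold the key words themselves and
EK[4] == 0xA0FAFE17, i.e. w4 = SubWord(RotWord(w3)) ^ Rcon[1] ^ w0.
*/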
741
742size_t aes_parallelism() {
743#if defined(BOTAN_HAS_AES_VAES)
744 if(CPUID::has_avx2_vaes()) {
745 return 8; // pipelined
746 }
747#endif
748
749#if defined(BOTAN_HAS_HW_AES_SUPPORT)
750 if(CPUID::has_hw_aes()) {
751 return 4; // pipelined
752 }
753#endif
754
755#if defined(BOTAN_HAS_AES_VPERM)
756 if(CPUID::has_vperm()) {
757 return 2; // pipelined
758 }
759#endif
760
761 // bitsliced:
762 return 2;
763}
764
765const char* aes_provider() {
766#if defined(BOTAN_HAS_AES_VAES)
767 if(CPUID::has_avx2_vaes()) {
768 return "vaes";
769 }
770#endif
771
772#if defined(BOTAN_HAS_HW_AES_SUPPORT)
773 if(CPUID::has_hw_aes()) {
774 return "cpu";
775 }
776#endif
777
778#if defined(BOTAN_HAS_AES_VPERM)
779 if(CPUID::has_vperm()) {
780 return "vperm";
781 }
782#endif
783
784 return "base";
785}
786
787} // namespace
788
789std::string AES_128::provider() const {
790 return aes_provider();
791}
792
793std::string AES_192::provider() const {
794 return aes_provider();
795}
796
797std::string AES_256::provider() const {
798 return aes_provider();
799}
800
801size_t AES_128::parallelism() const {
802 return aes_parallelism();
803}
804
805size_t AES_192::parallelism() const {
806 return aes_parallelism();
807}
808
809size_t AES_256::parallelism() const {
810 return aes_parallelism();
811}
812
813bool AES_128::has_keying_material() const {
814 return !m_EK.empty();
815}
816
817bool AES_192::has_keying_material() const {
818 return !m_EK.empty();
819}
820
821bool AES_256::has_keying_material() const {
822 return !m_EK.empty();
823}
824
825void AES_128::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
826 assert_key_material_set();
827
828#if defined(BOTAN_HAS_AES_VAES)
829 if(CPUID::has_avx2_vaes()) {
830 return x86_vaes_encrypt_n(in, out, blocks);
831 }
832#endif
833
834#if defined(BOTAN_HAS_HW_AES_SUPPORT)
835 if(CPUID::has_hw_aes()) {
836 return hw_aes_encrypt_n(in, out, blocks);
837 }
838#endif
839
840#if defined(BOTAN_HAS_AES_VPERM)
841 if(CPUID::has_vperm()) {
842 return vperm_encrypt_n(in, out, blocks);
843 }
844#endif
845
846 aes_encrypt_n(in, out, blocks, m_EK);
847}
848
849void AES_128::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
850 assert_key_material_set();
851
852#if defined(BOTAN_HAS_AES_VAES)
853 if(CPUID::has_avx2_vaes()) {
854 return x86_vaes_decrypt_n(in, out, blocks);
855 }
856#endif
857
858#if defined(BOTAN_HAS_HW_AES_SUPPORT)
859 if(CPUID::has_hw_aes()) {
860 return hw_aes_decrypt_n(in, out, blocks);
861 }
862#endif
863
864#if defined(BOTAN_HAS_AES_VPERM)
865 if(CPUID::has_vperm()) {
866 return vperm_decrypt_n(in, out, blocks);
867 }
868#endif
869
870 aes_decrypt_n(in, out, blocks, m_DK);
871}
872
873void AES_128::key_schedule(std::span<const uint8_t> key) {
874#if defined(BOTAN_HAS_AES_NI)
875 if(CPUID::has_aes_ni()) {
876 return aesni_key_schedule(key.data(), key.size());
877 }
878#endif
879
880#if defined(BOTAN_HAS_AES_VAES)
881 if(CPUID::has_avx2_vaes()) {
882 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, CPUID::is_little_endian());
883 }
884#endif
885
886#if defined(BOTAN_HAS_HW_AES_SUPPORT)
887 if(CPUID::has_hw_aes()) {
888 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, CPUID::is_little_endian());
889 }
890#endif
891
892#if defined(BOTAN_HAS_AES_VPERM)
893 if(CPUID::has_vperm()) {
894 return vperm_key_schedule(key.data(), key.size());
895 }
896#endif
897
898 aes_key_schedule(key.data(), key.size(), m_EK, m_DK);
899}
900
901void AES_128::clear() {
902 zap(m_EK);
903 zap(m_DK);
904}
905
906void AES_192::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
907 assert_key_material_set();
908
909#if defined(BOTAN_HAS_AES_VAES)
910 if(CPUID::has_avx2_vaes()) {
911 return x86_vaes_encrypt_n(in, out, blocks);
912 }
913#endif
914
915#if defined(BOTAN_HAS_HW_AES_SUPPORT)
916 if(CPUID::has_hw_aes()) {
917 return hw_aes_encrypt_n(in, out, blocks);
918 }
919#endif
920
921#if defined(BOTAN_HAS_AES_VPERM)
922 if(CPUID::has_vperm()) {
923 return vperm_encrypt_n(in, out, blocks);
924 }
925#endif
926
927 aes_encrypt_n(in, out, blocks, m_EK);
928}
929
930void AES_192::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
931 assert_key_material_set();
932
933#if defined(BOTAN_HAS_AES_VAES)
934 if(CPUID::has_avx2_vaes()) {
935 return x86_vaes_decrypt_n(in, out, blocks);
936 }
937#endif
938
939#if defined(BOTAN_HAS_HW_AES_SUPPORT)
940 if(CPUID::has_hw_aes()) {
941 return hw_aes_decrypt_n(in, out, blocks);
942 }
943#endif
944
945#if defined(BOTAN_HAS_AES_VPERM)
946 if(CPUID::has_vperm()) {
947 return vperm_decrypt_n(in, out, blocks);
948 }
949#endif
950
951 aes_decrypt_n(in, out, blocks, m_DK);
952}
953
954void AES_192::key_schedule(std::span<const uint8_t> key) {
955#if defined(BOTAN_HAS_AES_NI)
956 if(CPUID::has_aes_ni()) {
957 return aesni_key_schedule(key.data(), key.size());
958 }
959#endif
960
961#if defined(BOTAN_HAS_AES_VAES)
962 if(CPUID::has_avx2_vaes()) {
963 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, CPUID::is_little_endian());
964 }
965#endif
966
967#if defined(BOTAN_HAS_HW_AES_SUPPORT)
968 if(CPUID::has_hw_aes()) {
969 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, CPUID::is_little_endian());
970 }
971#endif
972
973#if defined(BOTAN_HAS_AES_VPERM)
974 if(CPUID::has_vperm()) {
975 return vperm_key_schedule(key.data(), key.size());
976 }
977#endif
978
979 aes_key_schedule(key.data(), key.size(), m_EK, m_DK);
980}
981
982void AES_192::clear() {
983 zap(m_EK);
984 zap(m_DK);
985}
986
987void AES_256::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
988 assert_key_material_set();
989
990#if defined(BOTAN_HAS_AES_VAES)
991 if(CPUID::has_avx2_vaes()) {
992 return x86_vaes_encrypt_n(in, out, blocks);
993 }
994#endif
995
996#if defined(BOTAN_HAS_HW_AES_SUPPORT)
997 if(CPUID::has_hw_aes()) {
998 return hw_aes_encrypt_n(in, out, blocks);
999 }
1000#endif
1001
1002#if defined(BOTAN_HAS_AES_VPERM)
1003 if(CPUID::has_vperm()) {
1004 return vperm_encrypt_n(in, out, blocks);
1005 }
1006#endif
1007
1008 aes_encrypt_n(in, out, blocks, m_EK);
1009}
1010
1011void AES_256::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
1012 assert_key_material_set();
1013
1014#if defined(BOTAN_HAS_AES_VAES)
1015 if(CPUID::has_avx2_vaes()) {
1016 return x86_vaes_decrypt_n(in, out, blocks);
1017 }
1018#endif
1019
1020#if defined(BOTAN_HAS_HW_AES_SUPPORT)
1021 if(CPUID::has_hw_aes()) {
1022 return hw_aes_decrypt_n(in, out, blocks);
1023 }
1024#endif
1025
1026#if defined(BOTAN_HAS_AES_VPERM)
1027 if(CPUID::has_vperm()) {
1028 return vperm_decrypt_n(in, out, blocks);
1029 }
1030#endif
1031
1032 aes_decrypt_n(in, out, blocks, m_DK);
1033}
1034
1035void AES_256::key_schedule(std::span<const uint8_t> key) {
1036#if defined(BOTAN_HAS_AES_NI)
1037 if(CPUID::has_aes_ni()) {
1038 return aesni_key_schedule(key.data(), key.size());
1039 }
1040#endif
1041
1042#if defined(BOTAN_HAS_AES_VAES)
1043 if(CPUID::has_avx2_vaes()) {
1044 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, CPUID::is_little_endian());
1045 }
1046#endif
1047
1048#if defined(BOTAN_HAS_HW_AES_SUPPORT)
1049 if(CPUID::has_hw_aes()) {
1050 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, CPUID::is_little_endian());
1051 }
1052#endif
1053
1054#if defined(BOTAN_HAS_AES_VPERM)
1055 if(CPUID::has_vperm()) {
1056 return vperm_key_schedule(key.data(), key.size());
1057 }
1058#endif
1059
1060 aes_key_schedule(key.data(), key.size(), m_EK, m_DK);
1061}
1062
1063void AES_256::clear() {
1064 zap(m_EK);
1065 zap(m_DK);
1066}
1067
1068} // namespace Botan
#define BOTAN_ASSERT_NOMSG(expr)
Definition assert.h:59
#define BOTAN_ASSERT(expr, assertion_made)
Definition assert.h:50
void decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition aes.cpp:849
std::string provider() const override
Definition aes.cpp:789
size_t parallelism() const override
Definition aes.cpp:801
bool has_keying_material() const override
Definition aes.cpp:813
void clear() override
Definition aes.cpp:901
void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition aes.cpp:825
bool has_keying_material() const override
Definition aes.cpp:817
void decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition aes.cpp:930
std::string provider() const override
Definition aes.cpp:793
size_t parallelism() const override
Definition aes.cpp:805
void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition aes.cpp:906
void clear() override
Definition aes.cpp:982
bool has_keying_material() const override
Definition aes.cpp:821
void clear() override
Definition aes.cpp:1063
void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition aes.cpp:987
void decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition aes.cpp:1011
std::string provider() const override
Definition aes.cpp:797
size_t parallelism() const override
Definition aes.cpp:809
static bool has_vperm()
Definition cpuid.h:335
static bool is_little_endian()
Definition cpuid.h:60
static bool has_hw_aes()
Definition cpuid.h:350
void assert_key_material_set() const
Definition sym_algo.h:139
constexpr void unpoison(const T *p, size_t n)
Definition ct_utils.h:64
constexpr void poison(const T *p, size_t n)
Definition ct_utils.h:53
void copy_out_be(std::span< uint8_t > out, InR &&in)
Definition loadstor.h:801
void zap(std::vector< T, Alloc > &vec)
Definition secmem.h:117
constexpr T rotl(T input)
Definition rotate.h:21
constexpr T rotr(T input)
Definition rotate.h:33
constexpr void swap_bits(T &x, T &y, T mask, size_t shift)
Definition bit_ops.h:186
constexpr T reverse_bytes(T x)
Definition bswap.h:24
std::vector< T, secure_allocator< T > > secure_vector
Definition secmem.h:61
constexpr auto load_be(ParamTs &&... params)
Definition loadstor.h:530
constexpr T bit_permute_step(T x, T mask, size_t shift)
Definition bit_ops.h:176