Botan 3.4.0
Crypto and TLS for C++
aes.cpp
1/*
2* (C) 1999-2010,2015,2017,2018,2020 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/aes.h>
8
9#include <botan/internal/bit_ops.h>
10#include <botan/internal/cpuid.h>
11#include <botan/internal/ct_utils.h>
12#include <botan/internal/loadstor.h>
13#include <botan/internal/rotate.h>
14
15namespace Botan {
16
17#if defined(BOTAN_HAS_AES_POWER8) || defined(BOTAN_HAS_AES_ARMV8) || defined(BOTAN_HAS_AES_NI)
18 #define BOTAN_HAS_HW_AES_SUPPORT
19#endif
20
21/*
22* One of three AES implementation strategies is used to get a constant time
23* implementation which is immune to common cache/timing based side channels:
24*
25* - If AES hardware support is available (AES-NI, POWER8, Aarch64) use that
26*
27* - If 128-bit SIMD with byte shuffles is available (SSSE3, NEON, or Altivec),
28* use the vperm technique published by Mike Hamburg at CHES 2009.
29*
30* - If no hardware or SIMD support, fall back to a constant time bitsliced
31* implementation. This uses 32-bit words resulting in 2 blocks being processed
32* in parallel. Moving to 4 blocks (with 64-bit words) would approximately
33* double performance on 64-bit CPUs. Likewise moving to 128 bit SIMD would
34* again approximately double performance vs 64-bit. However the assumption is
35* that most 64-bit CPUs either have hardware AES or SIMD shuffle support and
36* that the majority of users falling back to this code will be running on 32-bit cores.
37* If this assumption proves to be unsound, the bitsliced code can easily be
38* extended to operate on either 32 or 64 bit words depending on the native
39* wordsize of the target processor.
40*
41* Useful references
42*
43* - "Accelerating AES with Vector Permute Instructions" Mike Hamburg
44* https://www.shiftleft.org/papers/vector_aes/vector_aes.pdf
45*
46* - "Faster and Timing-Attack Resistant AES-GCM" Käsper and Schwabe
47* https://eprint.iacr.org/2009/129.pdf
48*
49* - "A new combinational logic minimization technique with applications to cryptology."
50* Boyar and Peralta https://eprint.iacr.org/2009/191.pdf
51*
52* - "A depth-16 circuit for the AES S-box" Boyar and Peralta
53* https://eprint.iacr.org/2011/332.pdf
54*
55* - "A Very Compact S-box for AES" Canright
56* https://www.iacr.org/archive/ches2005/032.pdf
57* https://core.ac.uk/download/pdf/36694529.pdf (extended)
58*/
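// A quick size check of the bitsliced fallback described above: the state is
// kept in 8 x 32-bit words = 256 bits, i.e. exactly two 16-byte AES blocks per
// pass (this is the BITSLICED_BLOCKS = 8 * sizeof(uint32_t) / 16 = 2 computed
// in aes_encrypt_n below), which is where the "2 blocks in parallel" comes from.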
59
60namespace {
61
62/*
63This is an AES sbox circuit which can execute in bitsliced mode up to 32x in
64parallel.
65
66The circuit is from the "Circuit Minimization Team" group
67http://www.cs.yale.edu/homes/peralta/CircuitStuff/CMT.html
68http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt
69
70This circuit has size 113 and depth 27. In software it is much faster than
71circuits which are considered faster for hardware purposes (where circuit depth
72is the critical constraint), because unlike in hardware, on common CPUs we can
73only execute - at best - 3 or 4 logic operations per cycle. So a smaller circuit
74is superior. On an x86-64 machine this circuit is about 15% faster than the
75circuit of size 128 and depth 16 given in "A depth-16 circuit for the AES S-box".
76
77Another circuit for the AES Sbox, of size 102 and depth 24, is described in "New
78Circuit Minimization Techniques for Smaller and Faster AES SBoxes"
79[https://eprint.iacr.org/2019/802]. However, it relies on "non-standard" gates
80like MUX, NOR, NAND, etc., so in practice in bitsliced software its size is
81actually a bit larger than this circuit, as few CPUs have such instructions and
82they must otherwise be emulated using a sequence of available bit operations.
83*/
84void AES_SBOX(uint32_t V[8]) {
85 const uint32_t U0 = V[0];
86 const uint32_t U1 = V[1];
87 const uint32_t U2 = V[2];
88 const uint32_t U3 = V[3];
89 const uint32_t U4 = V[4];
90 const uint32_t U5 = V[5];
91 const uint32_t U6 = V[6];
92 const uint32_t U7 = V[7];
93
94 const uint32_t y14 = U3 ^ U5;
95 const uint32_t y13 = U0 ^ U6;
96 const uint32_t y9 = U0 ^ U3;
97 const uint32_t y8 = U0 ^ U5;
98 const uint32_t t0 = U1 ^ U2;
99 const uint32_t y1 = t0 ^ U7;
100 const uint32_t y4 = y1 ^ U3;
101 const uint32_t y12 = y13 ^ y14;
102 const uint32_t y2 = y1 ^ U0;
103 const uint32_t y5 = y1 ^ U6;
104 const uint32_t y3 = y5 ^ y8;
105 const uint32_t t1 = U4 ^ y12;
106 const uint32_t y15 = t1 ^ U5;
107 const uint32_t y20 = t1 ^ U1;
108 const uint32_t y6 = y15 ^ U7;
109 const uint32_t y10 = y15 ^ t0;
110 const uint32_t y11 = y20 ^ y9;
111 const uint32_t y7 = U7 ^ y11;
112 const uint32_t y17 = y10 ^ y11;
113 const uint32_t y19 = y10 ^ y8;
114 const uint32_t y16 = t0 ^ y11;
115 const uint32_t y21 = y13 ^ y16;
116 const uint32_t y18 = U0 ^ y16;
117 const uint32_t t2 = y12 & y15;
118 const uint32_t t3 = y3 & y6;
119 const uint32_t t4 = t3 ^ t2;
120 const uint32_t t5 = y4 & U7;
121 const uint32_t t6 = t5 ^ t2;
122 const uint32_t t7 = y13 & y16;
123 const uint32_t t8 = y5 & y1;
124 const uint32_t t9 = t8 ^ t7;
125 const uint32_t t10 = y2 & y7;
126 const uint32_t t11 = t10 ^ t7;
127 const uint32_t t12 = y9 & y11;
128 const uint32_t t13 = y14 & y17;
129 const uint32_t t14 = t13 ^ t12;
130 const uint32_t t15 = y8 & y10;
131 const uint32_t t16 = t15 ^ t12;
132 const uint32_t t17 = t4 ^ y20;
133 const uint32_t t18 = t6 ^ t16;
134 const uint32_t t19 = t9 ^ t14;
135 const uint32_t t20 = t11 ^ t16;
136 const uint32_t t21 = t17 ^ t14;
137 const uint32_t t22 = t18 ^ y19;
138 const uint32_t t23 = t19 ^ y21;
139 const uint32_t t24 = t20 ^ y18;
140 const uint32_t t25 = t21 ^ t22;
141 const uint32_t t26 = t21 & t23;
142 const uint32_t t27 = t24 ^ t26;
143 const uint32_t t28 = t25 & t27;
144 const uint32_t t29 = t28 ^ t22;
145 const uint32_t t30 = t23 ^ t24;
146 const uint32_t t31 = t22 ^ t26;
147 const uint32_t t32 = t31 & t30;
148 const uint32_t t33 = t32 ^ t24;
149 const uint32_t t34 = t23 ^ t33;
150 const uint32_t t35 = t27 ^ t33;
151 const uint32_t t36 = t24 & t35;
152 const uint32_t t37 = t36 ^ t34;
153 const uint32_t t38 = t27 ^ t36;
154 const uint32_t t39 = t29 & t38;
155 const uint32_t t40 = t25 ^ t39;
156 const uint32_t t41 = t40 ^ t37;
157 const uint32_t t42 = t29 ^ t33;
158 const uint32_t t43 = t29 ^ t40;
159 const uint32_t t44 = t33 ^ t37;
160 const uint32_t t45 = t42 ^ t41;
161 const uint32_t z0 = t44 & y15;
162 const uint32_t z1 = t37 & y6;
163 const uint32_t z2 = t33 & U7;
164 const uint32_t z3 = t43 & y16;
165 const uint32_t z4 = t40 & y1;
166 const uint32_t z5 = t29 & y7;
167 const uint32_t z6 = t42 & y11;
168 const uint32_t z7 = t45 & y17;
169 const uint32_t z8 = t41 & y10;
170 const uint32_t z9 = t44 & y12;
171 const uint32_t z10 = t37 & y3;
172 const uint32_t z11 = t33 & y4;
173 const uint32_t z12 = t43 & y13;
174 const uint32_t z13 = t40 & y5;
175 const uint32_t z14 = t29 & y2;
176 const uint32_t z15 = t42 & y9;
177 const uint32_t z16 = t45 & y14;
178 const uint32_t z17 = t41 & y8;
179 const uint32_t tc1 = z15 ^ z16;
180 const uint32_t tc2 = z10 ^ tc1;
181 const uint32_t tc3 = z9 ^ tc2;
182 const uint32_t tc4 = z0 ^ z2;
183 const uint32_t tc5 = z1 ^ z0;
184 const uint32_t tc6 = z3 ^ z4;
185 const uint32_t tc7 = z12 ^ tc4;
186 const uint32_t tc8 = z7 ^ tc6;
187 const uint32_t tc9 = z8 ^ tc7;
188 const uint32_t tc10 = tc8 ^ tc9;
189 const uint32_t tc11 = tc6 ^ tc5;
190 const uint32_t tc12 = z3 ^ z5;
191 const uint32_t tc13 = z13 ^ tc1;
192 const uint32_t tc14 = tc4 ^ tc12;
193 const uint32_t S3 = tc3 ^ tc11;
194 const uint32_t tc16 = z6 ^ tc8;
195 const uint32_t tc17 = z14 ^ tc10;
196 const uint32_t tc18 = ~tc13 ^ tc14;
197 const uint32_t S7 = z12 ^ tc18;
198 const uint32_t tc20 = z15 ^ tc16;
199 const uint32_t tc21 = tc2 ^ z11;
200 const uint32_t S0 = tc3 ^ tc16;
201 const uint32_t S6 = tc10 ^ tc18;
202 const uint32_t S4 = tc14 ^ S3;
203 const uint32_t S1 = ~(S3 ^ tc16);
204 const uint32_t tc26 = tc17 ^ tc20;
205 const uint32_t S2 = ~(tc26 ^ z17);
206 const uint32_t S5 = tc21 ^ tc17;
207
208 V[0] = S0;
209 V[1] = S1;
210 V[2] = S2;
211 V[3] = S3;
212 V[4] = S4;
213 V[5] = S5;
214 V[6] = S6;
215 V[7] = S7;
216}
217
218/*
219A circuit for the inverse AES Sbox, of size 121 and depth 21, from
220http://www.cs.yale.edu/homes/peralta/CircuitStuff/CMT.html
221http://www.cs.yale.edu/homes/peralta/CircuitStuff/Sinv.txt
222*/
223void AES_INV_SBOX(uint32_t V[8]) {
224 const uint32_t U0 = V[0];
225 const uint32_t U1 = V[1];
226 const uint32_t U2 = V[2];
227 const uint32_t U3 = V[3];
228 const uint32_t U4 = V[4];
229 const uint32_t U5 = V[5];
230 const uint32_t U6 = V[6];
231 const uint32_t U7 = V[7];
232
233 const uint32_t Y0 = U0 ^ U3;
234 const uint32_t Y2 = ~(U1 ^ U3);
235 const uint32_t Y4 = U0 ^ Y2;
236 const uint32_t RTL0 = U6 ^ U7;
237 const uint32_t Y1 = Y2 ^ RTL0;
238 const uint32_t Y7 = ~(U2 ^ Y1);
239 const uint32_t RTL1 = U3 ^ U4;
240 const uint32_t Y6 = ~(U7 ^ RTL1);
241 const uint32_t Y3 = Y1 ^ RTL1;
242 const uint32_t RTL2 = ~(U0 ^ U2);
243 const uint32_t Y5 = U5 ^ RTL2;
244 const uint32_t sa1 = Y0 ^ Y2;
245 const uint32_t sa0 = Y1 ^ Y3;
246 const uint32_t sb1 = Y4 ^ Y6;
247 const uint32_t sb0 = Y5 ^ Y7;
248 const uint32_t ah = Y0 ^ Y1;
249 const uint32_t al = Y2 ^ Y3;
250 const uint32_t aa = sa0 ^ sa1;
251 const uint32_t bh = Y4 ^ Y5;
252 const uint32_t bl = Y6 ^ Y7;
253 const uint32_t bb = sb0 ^ sb1;
254 const uint32_t ab20 = sa0 ^ sb0;
255 const uint32_t ab22 = al ^ bl;
256 const uint32_t ab23 = Y3 ^ Y7;
257 const uint32_t ab21 = sa1 ^ sb1;
258 const uint32_t abcd1 = ah & bh;
259 const uint32_t rr1 = Y0 & Y4;
260 const uint32_t ph11 = ab20 ^ abcd1;
261 const uint32_t t01 = Y1 & Y5;
262 const uint32_t ph01 = t01 ^ abcd1;
263 const uint32_t abcd2 = al & bl;
264 const uint32_t r1 = Y2 & Y6;
265 const uint32_t pl11 = ab22 ^ abcd2;
266 const uint32_t r2 = Y3 & Y7;
267 const uint32_t pl01 = r2 ^ abcd2;
268 const uint32_t r3 = sa0 & sb0;
269 const uint32_t vr1 = aa & bb;
270 const uint32_t pr1 = vr1 ^ r3;
271 const uint32_t wr1 = sa1 & sb1;
272 const uint32_t qr1 = wr1 ^ r3;
273 const uint32_t ab0 = ph11 ^ rr1;
274 const uint32_t ab1 = ph01 ^ ab21;
275 const uint32_t ab2 = pl11 ^ r1;
276 const uint32_t ab3 = pl01 ^ qr1;
277 const uint32_t cp1 = ab0 ^ pr1;
278 const uint32_t cp2 = ab1 ^ qr1;
279 const uint32_t cp3 = ab2 ^ pr1;
280 const uint32_t cp4 = ab3 ^ ab23;
281 const uint32_t tinv1 = cp3 ^ cp4;
282 const uint32_t tinv2 = cp3 & cp1;
283 const uint32_t tinv3 = cp2 ^ tinv2;
284 const uint32_t tinv4 = cp1 ^ cp2;
285 const uint32_t tinv5 = cp4 ^ tinv2;
286 const uint32_t tinv6 = tinv5 & tinv4;
287 const uint32_t tinv7 = tinv3 & tinv1;
288 const uint32_t d2 = cp4 ^ tinv7;
289 const uint32_t d0 = cp2 ^ tinv6;
290 const uint32_t tinv8 = cp1 & cp4;
291 const uint32_t tinv9 = tinv4 & tinv8;
292 const uint32_t tinv10 = tinv4 ^ tinv2;
293 const uint32_t d1 = tinv9 ^ tinv10;
294 const uint32_t tinv11 = cp2 & cp3;
295 const uint32_t tinv12 = tinv1 & tinv11;
296 const uint32_t tinv13 = tinv1 ^ tinv2;
297 const uint32_t d3 = tinv12 ^ tinv13;
298 const uint32_t sd1 = d1 ^ d3;
299 const uint32_t sd0 = d0 ^ d2;
300 const uint32_t dl = d0 ^ d1;
301 const uint32_t dh = d2 ^ d3;
302 const uint32_t dd = sd0 ^ sd1;
303 const uint32_t abcd3 = dh & bh;
304 const uint32_t rr2 = d3 & Y4;
305 const uint32_t t02 = d2 & Y5;
306 const uint32_t abcd4 = dl & bl;
307 const uint32_t r4 = d1 & Y6;
308 const uint32_t r5 = d0 & Y7;
309 const uint32_t r6 = sd0 & sb0;
310 const uint32_t vr2 = dd & bb;
311 const uint32_t wr2 = sd1 & sb1;
312 const uint32_t abcd5 = dh & ah;
313 const uint32_t r7 = d3 & Y0;
314 const uint32_t r8 = d2 & Y1;
315 const uint32_t abcd6 = dl & al;
316 const uint32_t r9 = d1 & Y2;
317 const uint32_t r10 = d0 & Y3;
318 const uint32_t r11 = sd0 & sa0;
319 const uint32_t vr3 = dd & aa;
320 const uint32_t wr3 = sd1 & sa1;
321 const uint32_t ph12 = rr2 ^ abcd3;
322 const uint32_t ph02 = t02 ^ abcd3;
323 const uint32_t pl12 = r4 ^ abcd4;
324 const uint32_t pl02 = r5 ^ abcd4;
325 const uint32_t pr2 = vr2 ^ r6;
326 const uint32_t qr2 = wr2 ^ r6;
327 const uint32_t p0 = ph12 ^ pr2;
328 const uint32_t p1 = ph02 ^ qr2;
329 const uint32_t p2 = pl12 ^ pr2;
330 const uint32_t p3 = pl02 ^ qr2;
331 const uint32_t ph13 = r7 ^ abcd5;
332 const uint32_t ph03 = r8 ^ abcd5;
333 const uint32_t pl13 = r9 ^ abcd6;
334 const uint32_t pl03 = r10 ^ abcd6;
335 const uint32_t pr3 = vr3 ^ r11;
336 const uint32_t qr3 = wr3 ^ r11;
337 const uint32_t p4 = ph13 ^ pr3;
338 const uint32_t S7 = ph03 ^ qr3;
339 const uint32_t p6 = pl13 ^ pr3;
340 const uint32_t p7 = pl03 ^ qr3;
341 const uint32_t S3 = p1 ^ p6;
342 const uint32_t S6 = p2 ^ p6;
343 const uint32_t S0 = p3 ^ p6;
344 const uint32_t X11 = p0 ^ p2;
345 const uint32_t S5 = S0 ^ X11;
346 const uint32_t X13 = p4 ^ p7;
347 const uint32_t X14 = X11 ^ X13;
348 const uint32_t S1 = S3 ^ X14;
349 const uint32_t X16 = p1 ^ S7;
350 const uint32_t S2 = X14 ^ X16;
351 const uint32_t X18 = p0 ^ p4;
352 const uint32_t X19 = S5 ^ X16;
353 const uint32_t S4 = X18 ^ X19;
354
355 V[0] = S0;
356 V[1] = S1;
357 V[2] = S2;
358 V[3] = S3;
359 V[4] = S4;
360 V[5] = S5;
361 V[6] = S6;
362 V[7] = S7;
363}
364
365inline void bit_transpose(uint32_t B[8]) {
366 swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1);
367 swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1);
368 swap_bits<uint32_t>(B[5], B[4], 0x55555555, 1);
369 swap_bits<uint32_t>(B[7], B[6], 0x55555555, 1);
370
371 swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2);
372 swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2);
373 swap_bits<uint32_t>(B[6], B[4], 0x33333333, 2);
374 swap_bits<uint32_t>(B[7], B[5], 0x33333333, 2);
375
376 swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4);
377 swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4);
378 swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4);
379 swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4);
380}
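/*
A note on the intent of bit_transpose (a sketch inferred from its callers rather
than a specification): the three passes with masks 0x55555555 / 0x33333333 /
0x0F0F0F0F and shifts 1 / 2 / 4 are the classic in-register 8x8 bit-matrix
transpose. Afterwards each of the eight words collects one bit position from
every byte of the two input blocks, which is what lets AES_SBOX / AES_INV_SBOX
compute all 32 S-box outputs using only bitwise operations.
*/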
381
382inline void ks_expand(uint32_t B[8], const uint32_t K[], size_t r) {
383 /*
 384 This is bit_transpose of K[r..r+4] || K[r..r+4]; we can save some computation
 385 since we know the first and second halves are the same data.
386 */
387 for(size_t i = 0; i != 4; ++i) {
388 B[i] = K[r + i];
389 }
390
391 swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1);
392 swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1);
393
394 swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2);
395 swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2);
396
397 B[4] = B[0];
398 B[5] = B[1];
399 B[6] = B[2];
400 B[7] = B[3];
401
402 swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4);
403 swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4);
404 swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4);
405 swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4);
406}
407
408inline void shift_rows(uint32_t B[8]) {
409 // 3 0 1 2 7 4 5 6 10 11 8 9 14 15 12 13 17 18 19 16 21 22 23 20 24 25 26 27 28 29 30 31
410#if defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT)
411 for(size_t i = 0; i != 8; i += 2) {
412 uint64_t x = (static_cast<uint64_t>(B[i]) << 32) | B[i + 1];
413 x = bit_permute_step<uint64_t>(x, 0x0022331100223311, 2);
414 x = bit_permute_step<uint64_t>(x, 0x0055005500550055, 1);
415 B[i] = static_cast<uint32_t>(x >> 32);
416 B[i + 1] = static_cast<uint32_t>(x);
417 }
418#else
419 for(size_t i = 0; i != 8; ++i) {
420 uint32_t x = B[i];
421 x = bit_permute_step<uint32_t>(x, 0x00223311, 2);
422 x = bit_permute_step<uint32_t>(x, 0x00550055, 1);
423 B[i] = x;
424 }
425#endif
426}
427
428inline void inv_shift_rows(uint32_t B[8]) {
429 // Inverse of shift_rows, just inverting the steps
430
431#if defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT)
432 for(size_t i = 0; i != 8; i += 2) {
433 uint64_t x = (static_cast<uint64_t>(B[i]) << 32) | B[i + 1];
434 x = bit_permute_step<uint64_t>(x, 0x0055005500550055, 1);
435 x = bit_permute_step<uint64_t>(x, 0x0022331100223311, 2);
436 B[i] = static_cast<uint32_t>(x >> 32);
437 B[i + 1] = static_cast<uint32_t>(x);
438 }
439#else
440 for(size_t i = 0; i != 8; ++i) {
441 uint32_t x = B[i];
442 x = bit_permute_step<uint32_t>(x, 0x00550055, 1);
443 x = bit_permute_step<uint32_t>(x, 0x00223311, 2);
444 B[i] = x;
445 }
446#endif
447}
448
449inline void mix_columns(uint32_t B[8]) {
450 // carry high bits in B[0] to positions in 0x1b == 0b11011
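 // (In this representation B[0] holds the most significant bit of each byte, as
 // SE_word below also assumes: doubling shifts each slice down by one and folds
 // the old top slice B[0] into the slices for bits 4, 3, 1 and 0 - the bits set
 // in 0x1b - which is exactly the X2 table that follows.)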
451 const uint32_t X2[8] = {
452 B[1],
453 B[2],
454 B[3],
455 B[4] ^ B[0],
456 B[5] ^ B[0],
457 B[6],
458 B[7] ^ B[0],
459 B[0],
460 };
461
462 for(size_t i = 0; i != 8; i++) {
463 const uint32_t X3 = B[i] ^ X2[i];
464 B[i] = X2[i] ^ rotr<8>(B[i]) ^ rotr<16>(B[i]) ^ rotr<24>(X3);
465 }
466}
467
468void inv_mix_columns(uint32_t B[8]) {
469 /*
470 OpenSSL's bsaes implementation credits Jussi Kivilinna with the lovely
471 matrix decomposition
472
473 | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
474 | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
475 | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
476 | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
477
478 Notice the first component is simply the MixColumns matrix. So we can
479 multiply first by (05,00,04,00) then perform MixColumns to get the equivalent
480 of InvMixColumn.
481 */
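 /*
 As a quick check of one entry of that product in GF(2^8): entry (0,0) is
 02*05 ^ 03*00 ^ 01*04 ^ 01*00 = 0x0a ^ 0x04 = 0x0e, which matches the first
 entry of the InvMixColumns matrix on the left.
 */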
482 const uint32_t X4[8] = {
483 B[2],
484 B[3],
485 B[4] ^ B[0],
486 B[5] ^ B[0] ^ B[1],
487 B[6] ^ B[1],
488 B[7] ^ B[0],
489 B[0] ^ B[1],
490 B[1],
491 };
492
493 for(size_t i = 0; i != 8; i++) {
494 const uint32_t X5 = X4[i] ^ B[i];
495 B[i] = X5 ^ rotr<16>(X4[i]);
496 }
497
498 mix_columns(B);
499}
500
501/*
502* AES Encryption
503*/
504void aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, const secure_vector<uint32_t>& EK) {
505 BOTAN_ASSERT(EK.size() == 44 || EK.size() == 52 || EK.size() == 60, "Key was set");
506
507 const size_t rounds = (EK.size() - 4) / 4;
508
509 uint32_t KS[13 * 8] = {0}; // actual maximum is (rounds - 1) * 8
510 for(size_t i = 0; i < rounds - 1; i += 1) {
511 ks_expand(&KS[8 * i], EK.data(), 4 * i + 4);
512 }
513
514 const size_t BLOCK_SIZE = 16;
515 const size_t BITSLICED_BLOCKS = 8 * sizeof(uint32_t) / BLOCK_SIZE;
516
517 while(blocks > 0) {
518 const size_t this_loop = std::min(blocks, BITSLICED_BLOCKS);
519
520 uint32_t B[8] = {0};
521
522 load_be(B, in, this_loop * 4);
523
524 CT::poison(B, 8);
525
526 for(size_t i = 0; i != 8; ++i) {
527 B[i] ^= EK[i % 4];
528 }
529
530 bit_transpose(B);
531
532 for(size_t r = 0; r != rounds - 1; ++r) {
533 AES_SBOX(B);
534 shift_rows(B);
535 mix_columns(B);
536
537 for(size_t i = 0; i != 8; ++i) {
538 B[i] ^= KS[8 * r + i];
539 }
540 }
541
542 // Final round:
543 AES_SBOX(B);
544 shift_rows(B);
545 bit_transpose(B);
546
547 for(size_t i = 0; i != 8; ++i) {
548 B[i] ^= EK[4 * rounds + i % 4];
549 }
550
551 CT::unpoison(B, 8);
552
553 copy_out_be(std::span(out, this_loop * 4 * sizeof(uint32_t)), B);
554
555 in += this_loop * BLOCK_SIZE;
556 out += this_loop * BLOCK_SIZE;
557 blocks -= this_loop;
558 }
559}
560
561/*
562* AES Decryption
563*/
564void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, const secure_vector<uint32_t>& DK) {
565 BOTAN_ASSERT(DK.size() == 44 || DK.size() == 52 || DK.size() == 60, "Key was set");
566
567 const size_t rounds = (DK.size() - 4) / 4;
568
569 uint32_t KS[13 * 8] = {0}; // actual maximum is (rounds - 1) * 8
570 for(size_t i = 0; i < rounds - 1; i += 1) {
571 ks_expand(&KS[8 * i], DK.data(), 4 * i + 4);
572 }
573
574 const size_t BLOCK_SIZE = 16;
575 const size_t BITSLICED_BLOCKS = 8 * sizeof(uint32_t) / BLOCK_SIZE;
576
577 while(blocks > 0) {
578 const size_t this_loop = std::min(blocks, BITSLICED_BLOCKS);
579
580 uint32_t B[8] = {0};
581
582 CT::poison(B, 8);
583
584 load_be(B, in, this_loop * 4);
585
586 for(size_t i = 0; i != 8; ++i) {
587 B[i] ^= DK[i % 4];
588 }
589
590 bit_transpose(B);
591
592 for(size_t r = 0; r != rounds - 1; ++r) {
593 AES_INV_SBOX(B);
594 inv_shift_rows(B);
595 inv_mix_columns(B);
596
597 for(size_t i = 0; i != 8; ++i) {
598 B[i] ^= KS[8 * r + i];
599 }
600 }
601
602 // Final round:
603 AES_INV_SBOX(B);
604 inv_shift_rows(B);
605 bit_transpose(B);
606
607 for(size_t i = 0; i != 8; ++i) {
608 B[i] ^= DK[4 * rounds + i % 4];
609 }
610
611 CT::unpoison(B, 8);
612
613 copy_out_be(std::span(out, this_loop * 4 * sizeof(uint32_t)), B);
614
615 in += this_loop * BLOCK_SIZE;
616 out += this_loop * BLOCK_SIZE;
617 blocks -= this_loop;
618 }
619}
620
621inline uint32_t xtime32(uint32_t s) {
622 const uint32_t lo_bit = 0x01010101;
623 const uint32_t mask = 0x7F7F7F7F;
624 const uint32_t poly = 0x1B;
625
626 return ((s & mask) << 1) ^ (((s >> 7) & lo_bit) * poly);
627}
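// xtime32 doubles each byte of the word in GF(2^8), i.e. multiplies by 0x02
// modulo the AES polynomial 0x11b. For example xtime32(0x01010101) == 0x02020202
// and xtime32(0x80808080) == 0x1b1b1b1b.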
628
629inline uint32_t InvMixColumn(uint32_t s1) {
630 const uint32_t s2 = xtime32(s1);
631 const uint32_t s4 = xtime32(s2);
632 const uint32_t s8 = xtime32(s4);
633 const uint32_t s9 = s8 ^ s1;
634 const uint32_t s11 = s9 ^ s2;
635 const uint32_t s13 = s9 ^ s4;
636 const uint32_t s14 = s8 ^ s4 ^ s2;
637
638 return s14 ^ rotr<8>(s9) ^ rotr<16>(s13) ^ rotr<24>(s11);
639}
640
641void InvMixColumn_x4(uint32_t x[4]) {
642 x[0] = InvMixColumn(x[0]);
643 x[1] = InvMixColumn(x[1]);
644 x[2] = InvMixColumn(x[2]);
645 x[3] = InvMixColumn(x[3]);
646}
647
648uint32_t SE_word(uint32_t x) {
649 uint32_t I[8] = {0};
650
651 for(size_t i = 0; i != 8; ++i) {
652 I[i] = (x >> (7 - i)) & 0x01010101;
653 }
654
655 AES_SBOX(I);
656
657 x = 0;
658
659 for(size_t i = 0; i != 8; ++i) {
660 x |= ((I[i] & 0x01010101) << (7 - i));
661 }
662
663 return x;
664}
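// SE_word applies the AES S-box to each of the four bytes of x by spreading
// their bits across the bitsliced circuit. For example SE_word(0x00000000) ==
// 0x63636363 (Sbox(0x00) == 0x63) and SE_word(0x52525252) == 0x00000000.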
665
666void aes_key_schedule(const uint8_t key[],
667 size_t length,
668 secure_vector<uint32_t>& EK,
669 secure_vector<uint32_t>& DK,
670 bool bswap_keys = false) {
671 static const uint32_t RC[10] = {0x01000000,
672 0x02000000,
673 0x04000000,
674 0x08000000,
675 0x10000000,
676 0x20000000,
677 0x40000000,
678 0x80000000,
679 0x1B000000,
680 0x36000000};
681
682 const size_t X = length / 4;
683
684 // Can't happen, but make static analyzers happy
685 BOTAN_ASSERT_NOMSG(X == 4 || X == 6 || X == 8);
686
687 const size_t rounds = (length / 4) + 6;
688
689 // Help the optimizer
690 BOTAN_ASSERT_NOMSG(rounds == 10 || rounds == 12 || rounds == 14);
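 // For reference: AES-128 gives X = 4, rounds = 10 and 44 round key words;
 // AES-192 gives X = 6, rounds = 12, 52 words; AES-256 gives X = 8, rounds = 14,
 // 60 words. The length + 28 used below equals 4 * (rounds + 1) in each case.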
691
692 CT::poison(key, length);
693
694 EK.resize(length + 28);
695 DK.resize(length + 28);
696
697 for(size_t i = 0; i != X; ++i) {
698 EK[i] = load_be<uint32_t>(key, i);
699 }
700
701 for(size_t i = X; i < 4 * (rounds + 1); i += X) {
702 EK[i] = EK[i - X] ^ RC[(i - X) / X] ^ rotl<8>(SE_word(EK[i - 1]));
703
704 for(size_t j = 1; j != X && (i + j) < EK.size(); ++j) {
705 EK[i + j] = EK[i + j - X];
706
707 if(X == 8 && j == 4) {
708 EK[i + j] ^= SE_word(EK[i + j - 1]);
709 } else {
710 EK[i + j] ^= EK[i + j - 1];
711 }
712 }
713 }
714
715 for(size_t i = 0; i != 4 * (rounds + 1); i += 4) {
716 DK[i] = EK[4 * rounds - i];
717 DK[i + 1] = EK[4 * rounds - i + 1];
718 DK[i + 2] = EK[4 * rounds - i + 2];
719 DK[i + 3] = EK[4 * rounds - i + 3];
720 }
721
722 for(size_t i = 4; i != 4 * rounds; i += 4) {
723 InvMixColumn_x4(&DK[i]);
724 }
725
726 if(bswap_keys) {
727 // HW AES on little endian needs the subkeys to be byte reversed
728 for(size_t i = 0; i != EK.size(); ++i) {
729 EK[i] = reverse_bytes(EK[i]);
730 }
731 for(size_t i = 0; i != DK.size(); ++i) {
732 DK[i] = reverse_bytes(DK[i]);
733 }
734 }
735
736 CT::unpoison(EK.data(), EK.size());
737 CT::unpoison(DK.data(), DK.size());
738 CT::unpoison(key, length);
739}
740
741size_t aes_parallelism() {
742#if defined(BOTAN_HAS_HW_AES_SUPPORT)
743 if(CPUID::has_hw_aes()) {
744 return 4; // pipelined
745 }
746#endif
747
748#if defined(BOTAN_HAS_AES_VPERM)
749 if(CPUID::has_vperm()) {
750 return 2; // pipelined
751 }
752#endif
753
754 // bitsliced:
755 return 2;
756}
757
758const char* aes_provider() {
759#if defined(BOTAN_HAS_HW_AES_SUPPORT)
760 if(CPUID::has_hw_aes()) {
761 return "cpu";
762 }
763#endif
764
765#if defined(BOTAN_HAS_AES_VPERM)
766 if(CPUID::has_vperm()) {
767 return "vperm";
768 }
769#endif
770
771 return "base";
772}
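/*
A minimal usage sketch (relying on the usual BlockCipher interface, not on
anything specific to this file): the string reported by the provider() methods
below tells you which of the three strategies was picked at runtime.

   Botan::AES_128 aes;
   // one of "cpu" (AES-NI / ARMv8 / POWER8), "vperm" (SSSE3 / NEON / AltiVec)
   // or "base" (the bitsliced fallback in this file)
   const std::string impl = aes.provider();
*/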
773
774} // namespace
775
776std::string AES_128::provider() const {
777 return aes_provider();
778}
779
780std::string AES_192::provider() const {
781 return aes_provider();
782}
783
784std::string AES_256::provider() const {
785 return aes_provider();
786}
787
788size_t AES_128::parallelism() const {
789 return aes_parallelism();
790}
791
792size_t AES_192::parallelism() const {
793 return aes_parallelism();
794}
795
796size_t AES_256::parallelism() const {
797 return aes_parallelism();
798}
799
800bool AES_128::has_keying_material() const {
 801 return !m_EK.empty();
802}
803
804bool AES_192::has_keying_material() const {
 805 return !m_EK.empty();
806}
807
808bool AES_256::has_keying_material() const {
 809 return !m_EK.empty();
810}
811
812void AES_128::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
 813 assert_key_material_set();
 814
815#if defined(BOTAN_HAS_HW_AES_SUPPORT)
816 if(CPUID::has_hw_aes()) {
817 return hw_aes_encrypt_n(in, out, blocks);
818 }
819#endif
820
821#if defined(BOTAN_HAS_AES_VPERM)
822 if(CPUID::has_vperm()) {
823 return vperm_encrypt_n(in, out, blocks);
824 }
825#endif
826
827 aes_encrypt_n(in, out, blocks, m_EK);
828}
829
830void AES_128::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
 831 assert_key_material_set();
 832
833#if defined(BOTAN_HAS_HW_AES_SUPPORT)
834 if(CPUID::has_hw_aes()) {
835 return hw_aes_decrypt_n(in, out, blocks);
836 }
837#endif
838
839#if defined(BOTAN_HAS_AES_VPERM)
840 if(CPUID::has_vperm()) {
841 return vperm_decrypt_n(in, out, blocks);
842 }
843#endif
844
845 aes_decrypt_n(in, out, blocks, m_DK);
846}
847
848void AES_128::key_schedule(std::span<const uint8_t> key) {
849#if defined(BOTAN_HAS_AES_NI)
850 if(CPUID::has_aes_ni()) {
851 return aesni_key_schedule(key.data(), key.size());
852 }
853#endif
854
855#if defined(BOTAN_HAS_HW_AES_SUPPORT)
856 if(CPUID::has_hw_aes()) {
857 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, CPUID::is_little_endian());
858 }
859#endif
860
861#if defined(BOTAN_HAS_AES_VPERM)
862 if(CPUID::has_vperm()) {
863 return vperm_key_schedule(key.data(), key.size());
864 }
865#endif
866
867 aes_key_schedule(key.data(), key.size(), m_EK, m_DK);
868}
869
870void AES_128::clear() {
 871 zap(m_EK);
872 zap(m_DK);
873}
874
875void AES_192::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
 876 assert_key_material_set();
 877
878#if defined(BOTAN_HAS_HW_AES_SUPPORT)
879 if(CPUID::has_hw_aes()) {
880 return hw_aes_encrypt_n(in, out, blocks);
881 }
882#endif
883
884#if defined(BOTAN_HAS_AES_VPERM)
885 if(CPUID::has_vperm()) {
886 return vperm_encrypt_n(in, out, blocks);
887 }
888#endif
889
890 aes_encrypt_n(in, out, blocks, m_EK);
891}
892
893void AES_192::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
 894 assert_key_material_set();
 895
896#if defined(BOTAN_HAS_HW_AES_SUPPORT)
897 if(CPUID::has_hw_aes()) {
898 return hw_aes_decrypt_n(in, out, blocks);
899 }
900#endif
901
902#if defined(BOTAN_HAS_AES_VPERM)
903 if(CPUID::has_vperm()) {
904 return vperm_decrypt_n(in, out, blocks);
905 }
906#endif
907
908 aes_decrypt_n(in, out, blocks, m_DK);
909}
910
911void AES_192::key_schedule(std::span<const uint8_t> key) {
912#if defined(BOTAN_HAS_AES_NI)
913 if(CPUID::has_aes_ni()) {
914 return aesni_key_schedule(key.data(), key.size());
915 }
916#endif
917
918#if defined(BOTAN_HAS_HW_AES_SUPPORT)
919 if(CPUID::has_hw_aes()) {
920 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, CPUID::is_little_endian());
921 }
922#endif
923
924#if defined(BOTAN_HAS_AES_VPERM)
925 if(CPUID::has_vperm()) {
926 return vperm_key_schedule(key.data(), key.size());
927 }
928#endif
929
930 aes_key_schedule(key.data(), key.size(), m_EK, m_DK);
931}
932
933void AES_192::clear() {
 934 zap(m_EK);
935 zap(m_DK);
936}
937
938void AES_256::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
 939 assert_key_material_set();
 940
941#if defined(BOTAN_HAS_HW_AES_SUPPORT)
942 if(CPUID::has_hw_aes()) {
943 return hw_aes_encrypt_n(in, out, blocks);
944 }
945#endif
946
947#if defined(BOTAN_HAS_AES_VPERM)
948 if(CPUID::has_vperm()) {
949 return vperm_encrypt_n(in, out, blocks);
950 }
951#endif
952
953 aes_encrypt_n(in, out, blocks, m_EK);
954}
955
956void AES_256::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
 957 assert_key_material_set();
 958
959#if defined(BOTAN_HAS_HW_AES_SUPPORT)
960 if(CPUID::has_hw_aes()) {
961 return hw_aes_decrypt_n(in, out, blocks);
962 }
963#endif
964
965#if defined(BOTAN_HAS_AES_VPERM)
966 if(CPUID::has_vperm()) {
967 return vperm_decrypt_n(in, out, blocks);
968 }
969#endif
970
971 aes_decrypt_n(in, out, blocks, m_DK);
972}
973
974void AES_256::key_schedule(std::span<const uint8_t> key) {
975#if defined(BOTAN_HAS_AES_NI)
976 if(CPUID::has_aes_ni()) {
977 return aesni_key_schedule(key.data(), key.size());
978 }
979#endif
980
981#if defined(BOTAN_HAS_HW_AES_SUPPORT)
982 if(CPUID::has_hw_aes()) {
983 return aes_key_schedule(key.data(), key.size(), m_EK, m_DK, CPUID::is_little_endian());
984 }
985#endif
986
987#if defined(BOTAN_HAS_AES_VPERM)
988 if(CPUID::has_vperm()) {
989 return vperm_key_schedule(key.data(), key.size());
990 }
991#endif
992
993 aes_key_schedule(key.data(), key.size(), m_EK, m_DK);
994}
995
996void AES_256::clear() {
 997 zap(m_EK);
998 zap(m_DK);
999}
1000
1001} // namespace Botan