Botan  2.18.1
Crypto and TLS for C++11
aes.cpp
Go to the documentation of this file.
1 /*
2 * (C) 1999-2010,2015,2017,2018,2020 Jack Lloyd
3 *
4 * Botan is released under the Simplified BSD License (see license.txt)
5 */
6 
7 #include <botan/aes.h>
8 #include <botan/loadstor.h>
9 #include <botan/cpuid.h>
10 #include <botan/rotate.h>
11 #include <botan/internal/bit_ops.h>
12 #include <botan/internal/ct_utils.h>
13 
14 namespace Botan {
15 
// Umbrella feature macro: defined when at least one of the hardware AES
// backends (POWER8, ARMv8 or AES-NI) is enabled in this build. The dispatch
// code further down in this file tests this single macro.
16 #if defined(BOTAN_HAS_AES_POWER8) || defined(BOTAN_HAS_AES_ARMV8) || defined(BOTAN_HAS_AES_NI)
17  #define BOTAN_HAS_HW_AES_SUPPORT
18 #endif
19 
20 /*
21 * One of three AES implementation strategies is used to get a constant time
22 * implementation which is immune to common cache/timing based side channels:
23 *
24 * - If AES hardware support is available (AES-NI, POWER8, Aarch64) use that
25 *
26 * - If 128-bit SIMD with byte shuffles are available (SSSE3, NEON, or Altivec),
27 * use the vperm technique published by Mike Hamburg at CHES 2009.
28 *
29 * - If no hardware or SIMD support, fall back to a constant time bitsliced
30 * implementation. This uses 32-bit words resulting in 2 blocks being processed
31 * in parallel. Moving to 4 blocks (with 64-bit words) would approximately
32 * double performance on 64-bit CPUs. Likewise moving to 128 bit SIMD would
33 * again approximately double performance vs 64-bit. However the assumption is
34 * that most 64-bit CPUs either have hardware AES or SIMD shuffle support and
35 * that the majority of users falling back to this code will be 32-bit cores.
36 * If this assumption proves to be unsound, the bitsliced code can easily be
37 * extended to operate on either 32 or 64 bit words depending on the native
38 * wordsize of the target processor.
39 *
40 * Useful references
41 *
42 * - "Accelerating AES with Vector Permute Instructions" Mike Hamburg
43 * https://www.shiftleft.org/papers/vector_aes/vector_aes.pdf
44 *
45 * - "Faster and Timing-Attack Resistant AES-GCM" Käsper and Schwabe
46 * https://eprint.iacr.org/2009/129.pdf
47 *
48 * - "A new combinational logic minimization technique with applications to cryptology."
49 * Boyar and Peralta https://eprint.iacr.org/2009/191.pdf
50 *
51 * - "A depth-16 circuit for the AES S-box" Boyar and Peralta
52 * https://eprint.iacr.org/2011/332.pdf
53 *
54 * - "A Very Compact S-box for AES" Canright
55 * https://www.iacr.org/archive/ches2005/032.pdf
56 * https://core.ac.uk/download/pdf/36694529.pdf (extended)
57 */
58 
59 namespace {
60 
61 /*
62 This is an AES sbox circuit which can execute in bitsliced mode up to 32x in
63 parallel.
64 
65 The circuit is from the "Circuit Minimization Team" group
66 http://www.cs.yale.edu/homes/peralta/CircuitStuff/CMT.html
67 http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt
68 
69 This circuit has size 113 and depth 27. In software it is much faster than
70 circuits which are considered faster for hardware purposes (where circuit depth
71 is the critical constraint), because unlike in hardware, on common CPUs we can
72 only execute - at best - 3 or 4 logic operations per cycle. So a smaller circuit
73 is superior. On an x86-64 machine this circuit is about 15% faster than the
74 circuit of size 128 and depth 16 given in "A depth-16 circuit for the AES S-box".
75 
76 Another circuit for AES Sbox of size 102 and depth 24 is described in "New
77 Circuit Minimization Techniques for Smaller and Faster AES SBoxes"
78 [https://eprint.iacr.org/2019/802] however it relies on "non-standard" gates
79 like MUX, NOR, NAND, etc and so in practice in bitsliced software, its size is
80 actually a bit larger than this circuit, as few CPUs have such instructions and
81 otherwise they must be emulated using a sequence of available bit operations.
82 */
// Applies the AES S-box, in place, to eight bit-planes V[0..7].
// The circuit uses only XOR, AND and NOT on whole 32-bit words, so each bit
// position of the words is substituted independently of the others.
// The caller in this file (SE_word) stores bit (7-i) of every byte in V[i],
// i.e. V[0] carries the most significant bit and V[7] the least significant.
83 void AES_SBOX(uint32_t V[8])
84  {
85  const uint32_t U0 = V[0];
86  const uint32_t U1 = V[1];
87  const uint32_t U2 = V[2];
88  const uint32_t U3 = V[3];
89  const uint32_t U4 = V[4];
90  const uint32_t U5 = V[5];
91  const uint32_t U6 = V[6];
92  const uint32_t U7 = V[7];
93 
 // Input layer: y* and t0/t1 are pure XOR combinations of the input planes
94  const uint32_t y14 = U3 ^ U5;
95  const uint32_t y13 = U0 ^ U6;
96  const uint32_t y9 = U0 ^ U3;
97  const uint32_t y8 = U0 ^ U5;
98  const uint32_t t0 = U1 ^ U2;
99  const uint32_t y1 = t0 ^ U7;
100  const uint32_t y4 = y1 ^ U3;
101  const uint32_t y12 = y13 ^ y14;
102  const uint32_t y2 = y1 ^ U0;
103  const uint32_t y5 = y1 ^ U6;
104  const uint32_t y3 = y5 ^ y8;
105  const uint32_t t1 = U4 ^ y12;
106  const uint32_t y15 = t1 ^ U5;
107  const uint32_t y20 = t1 ^ U1;
108  const uint32_t y6 = y15 ^ U7;
109  const uint32_t y10 = y15 ^ t0;
110  const uint32_t y11 = y20 ^ y9;
111  const uint32_t y7 = U7 ^ y11;
112  const uint32_t y17 = y10 ^ y11;
113  const uint32_t y19 = y10 ^ y8;
114  const uint32_t y16 = t0 ^ y11;
115  const uint32_t y21 = y13 ^ y16;
116  const uint32_t y18 = U0 ^ y16;
 // Middle section: t2..t45 mix AND gates with XORs
117  const uint32_t t2 = y12 & y15;
118  const uint32_t t3 = y3 & y6;
119  const uint32_t t4 = t3 ^ t2;
120  const uint32_t t5 = y4 & U7;
121  const uint32_t t6 = t5 ^ t2;
122  const uint32_t t7 = y13 & y16;
123  const uint32_t t8 = y5 & y1;
124  const uint32_t t9 = t8 ^ t7;
125  const uint32_t t10 = y2 & y7;
126  const uint32_t t11 = t10 ^ t7;
127  const uint32_t t12 = y9 & y11;
128  const uint32_t t13 = y14 & y17;
129  const uint32_t t14 = t13 ^ t12;
130  const uint32_t t15 = y8 & y10;
131  const uint32_t t16 = t15 ^ t12;
132  const uint32_t t17 = t4 ^ y20;
133  const uint32_t t18 = t6 ^ t16;
134  const uint32_t t19 = t9 ^ t14;
135  const uint32_t t20 = t11 ^ t16;
136  const uint32_t t21 = t17 ^ t14;
137  const uint32_t t22 = t18 ^ y19;
138  const uint32_t t23 = t19 ^ y21;
139  const uint32_t t24 = t20 ^ y18;
140  const uint32_t t25 = t21 ^ t22;
141  const uint32_t t26 = t21 & t23;
142  const uint32_t t27 = t24 ^ t26;
143  const uint32_t t28 = t25 & t27;
144  const uint32_t t29 = t28 ^ t22;
145  const uint32_t t30 = t23 ^ t24;
146  const uint32_t t31 = t22 ^ t26;
147  const uint32_t t32 = t31 & t30;
148  const uint32_t t33 = t32 ^ t24;
149  const uint32_t t34 = t23 ^ t33;
150  const uint32_t t35 = t27 ^ t33;
151  const uint32_t t36 = t24 & t35;
152  const uint32_t t37 = t36 ^ t34;
153  const uint32_t t38 = t27 ^ t36;
154  const uint32_t t39 = t29 & t38;
155  const uint32_t t40 = t25 ^ t39;
156  const uint32_t t41 = t40 ^ t37;
157  const uint32_t t42 = t29 ^ t33;
158  const uint32_t t43 = t29 ^ t40;
159  const uint32_t t44 = t33 ^ t37;
160  const uint32_t t45 = t42 ^ t41;
 // z0..z17: eighteen AND gates pairing middle-section values with input-layer values
161  const uint32_t z0 = t44 & y15;
162  const uint32_t z1 = t37 & y6;
163  const uint32_t z2 = t33 & U7;
164  const uint32_t z3 = t43 & y16;
165  const uint32_t z4 = t40 & y1;
166  const uint32_t z5 = t29 & y7;
167  const uint32_t z6 = t42 & y11;
168  const uint32_t z7 = t45 & y17;
169  const uint32_t z8 = t41 & y10;
170  const uint32_t z9 = t44 & y12;
171  const uint32_t z10 = t37 & y3;
172  const uint32_t z11 = t33 & y4;
173  const uint32_t z12 = t43 & y13;
174  const uint32_t z13 = t40 & y5;
175  const uint32_t z14 = t29 & y2;
176  const uint32_t z15 = t42 & y9;
177  const uint32_t z16 = t45 & y14;
178  const uint32_t z17 = t41 & y8;
 // Output layer: XOR combinations of the z values; the ~ operators invert
 // tc13 (feeding S6 and S7) and the S1 and S2 results
179  const uint32_t tc1 = z15 ^ z16;
180  const uint32_t tc2 = z10 ^ tc1;
181  const uint32_t tc3 = z9 ^ tc2;
182  const uint32_t tc4 = z0 ^ z2;
183  const uint32_t tc5 = z1 ^ z0;
184  const uint32_t tc6 = z3 ^ z4;
185  const uint32_t tc7 = z12 ^ tc4;
186  const uint32_t tc8 = z7 ^ tc6;
187  const uint32_t tc9 = z8 ^ tc7;
188  const uint32_t tc10 = tc8 ^ tc9;
189  const uint32_t tc11 = tc6 ^ tc5;
190  const uint32_t tc12 = z3 ^ z5;
191  const uint32_t tc13 = z13 ^ tc1;
192  const uint32_t tc14 = tc4 ^ tc12;
193  const uint32_t S3 = tc3 ^ tc11;
194  const uint32_t tc16 = z6 ^ tc8;
195  const uint32_t tc17 = z14 ^ tc10;
196  const uint32_t tc18 = ~tc13 ^ tc14;
197  const uint32_t S7 = z12 ^ tc18;
198  const uint32_t tc20 = z15 ^ tc16;
199  const uint32_t tc21 = tc2 ^ z11;
200  const uint32_t S0 = tc3 ^ tc16;
201  const uint32_t S6 = tc10 ^ tc18;
202  const uint32_t S4 = tc14 ^ S3;
203  const uint32_t S1 = ~(S3 ^ tc16);
204  const uint32_t tc26 = tc17 ^ tc20;
205  const uint32_t S2 = ~(tc26 ^ z17);
206  const uint32_t S5 = tc21 ^ tc17;
207 
 // Overwrite the input planes with the substituted planes
208  V[0] = S0;
209  V[1] = S1;
210  V[2] = S2;
211  V[3] = S3;
212  V[4] = S4;
213  V[5] = S5;
214  V[6] = S6;
215  V[7] = S7;
216  }
217 
218 /*
219 A circuit for inverse AES Sbox of size 121 and depth 21 from
220 http://www.cs.yale.edu/homes/peralta/CircuitStuff/CMT.html
221 http://www.cs.yale.edu/homes/peralta/CircuitStuff/Sinv.txt
222 */
// Applies the inverse AES S-box, in place, to eight bit-planes V[0..7].
// Like AES_SBOX it consists solely of XOR, AND and NOT on whole 32-bit words,
// so each bit position of the words is processed independently.
223 void AES_INV_SBOX(uint32_t V[8])
224  {
225  const uint32_t U0 = V[0];
226  const uint32_t U1 = V[1];
227  const uint32_t U2 = V[2];
228  const uint32_t U3 = V[3];
229  const uint32_t U4 = V[4];
230  const uint32_t U5 = V[5];
231  const uint32_t U6 = V[6];
232  const uint32_t U7 = V[7];
233 
 // Input layer: XOR/NOT combinations of the input planes only
234  const uint32_t Y0 = U0 ^ U3;
235  const uint32_t Y2 = ~(U1 ^ U3);
236  const uint32_t Y4 = U0 ^ Y2;
237  const uint32_t RTL0 = U6 ^ U7;
238  const uint32_t Y1 = Y2 ^ RTL0;
239  const uint32_t Y7 = ~(U2 ^ Y1);
240  const uint32_t RTL1 = U3 ^ U4;
241  const uint32_t Y6 = ~(U7 ^ RTL1);
242  const uint32_t Y3 = Y1 ^ RTL1;
243  const uint32_t RTL2 = ~(U0 ^ U2);
244  const uint32_t Y5 = U5 ^ RTL2;
245  const uint32_t sa1 = Y0 ^ Y2;
246  const uint32_t sa0 = Y1 ^ Y3;
247  const uint32_t sb1 = Y4 ^ Y6;
248  const uint32_t sb0 = Y5 ^ Y7;
249  const uint32_t ah = Y0 ^ Y1;
250  const uint32_t al = Y2 ^ Y3;
251  const uint32_t aa = sa0 ^ sa1;
252  const uint32_t bh = Y4 ^ Y5;
253  const uint32_t bl = Y6 ^ Y7;
254  const uint32_t bb = sb0 ^ sb1;
255  const uint32_t ab20 = sa0 ^ sb0;
256  const uint32_t ab22 = al ^ bl;
257  const uint32_t ab23 = Y3 ^ Y7;
258  const uint32_t ab21 = sa1 ^ sb1;
 // First nonlinear stage: AND products of the Y-derived values, folded into cp1..cp4
259  const uint32_t abcd1 = ah & bh;
260  const uint32_t rr1 = Y0 & Y4;
261  const uint32_t ph11 = ab20 ^ abcd1;
262  const uint32_t t01 = Y1 & Y5;
263  const uint32_t ph01 = t01 ^ abcd1;
264  const uint32_t abcd2 = al & bl;
265  const uint32_t r1 = Y2 & Y6;
266  const uint32_t pl11 = ab22 ^ abcd2;
267  const uint32_t r2 = Y3 & Y7;
268  const uint32_t pl01 = r2 ^ abcd2;
269  const uint32_t r3 = sa0 & sb0;
270  const uint32_t vr1 = aa & bb;
271  const uint32_t pr1 = vr1 ^ r3;
272  const uint32_t wr1 = sa1 & sb1;
273  const uint32_t qr1 = wr1 ^ r3;
274  const uint32_t ab0 = ph11 ^ rr1;
275  const uint32_t ab1 = ph01 ^ ab21;
276  const uint32_t ab2 = pl11 ^ r1;
277  const uint32_t ab3 = pl01 ^ qr1;
278  const uint32_t cp1 = ab0 ^ pr1;
279  const uint32_t cp2 = ab1 ^ qr1;
280  const uint32_t cp3 = ab2 ^ pr1;
281  const uint32_t cp4 = ab3 ^ ab23;
 // Second nonlinear stage: derives d0..d3 from cp1..cp4
282  const uint32_t tinv1 = cp3 ^ cp4;
283  const uint32_t tinv2 = cp3 & cp1;
284  const uint32_t tinv3 = cp2 ^ tinv2;
285  const uint32_t tinv4 = cp1 ^ cp2;
286  const uint32_t tinv5 = cp4 ^ tinv2;
287  const uint32_t tinv6 = tinv5 & tinv4;
288  const uint32_t tinv7 = tinv3 & tinv1;
289  const uint32_t d2 = cp4 ^ tinv7;
290  const uint32_t d0 = cp2 ^ tinv6;
291  const uint32_t tinv8 = cp1 & cp4;
292  const uint32_t tinv9 = tinv4 & tinv8;
293  const uint32_t tinv10 = tinv4 ^ tinv2;
294  const uint32_t d1 = tinv9 ^ tinv10;
295  const uint32_t tinv11 = cp2 & cp3;
296  const uint32_t tinv12 = tinv1 & tinv11;
297  const uint32_t tinv13 = tinv1 ^ tinv2;
298  const uint32_t d3 = tinv12 ^ tinv13;
299  const uint32_t sd1 = d1 ^ d3;
300  const uint32_t sd0 = d0 ^ d2;
301  const uint32_t dl = d0 ^ d1;
302  const uint32_t dh = d2 ^ d3;
303  const uint32_t dd = sd0 ^ sd1;
 // Eighteen AND gates multiplying the d-values with the input-layer values
304  const uint32_t abcd3 = dh & bh;
305  const uint32_t rr2 = d3 & Y4;
306  const uint32_t t02 = d2 & Y5;
307  const uint32_t abcd4 = dl & bl;
308  const uint32_t r4 = d1 & Y6;
309  const uint32_t r5 = d0 & Y7;
310  const uint32_t r6 = sd0 & sb0;
311  const uint32_t vr2 = dd & bb;
312  const uint32_t wr2 = sd1 & sb1;
313  const uint32_t abcd5 = dh & ah;
314  const uint32_t r7 = d3 & Y0;
315  const uint32_t r8 = d2 & Y1;
316  const uint32_t abcd6 = dl & al;
317  const uint32_t r9 = d1 & Y2;
318  const uint32_t r10 = d0 & Y3;
319  const uint32_t r11 = sd0 & sa0;
320  const uint32_t vr3 = dd & aa;
321  const uint32_t wr3 = sd1 & sa1;
 // Output layer: everything from here to the stores is XOR only
322  const uint32_t ph12 = rr2 ^ abcd3;
323  const uint32_t ph02 = t02 ^ abcd3;
324  const uint32_t pl12 = r4 ^ abcd4;
325  const uint32_t pl02 = r5 ^ abcd4;
326  const uint32_t pr2 = vr2 ^ r6;
327  const uint32_t qr2 = wr2 ^ r6;
328  const uint32_t p0 = ph12 ^ pr2;
329  const uint32_t p1 = ph02 ^ qr2;
330  const uint32_t p2 = pl12 ^ pr2;
331  const uint32_t p3 = pl02 ^ qr2;
332  const uint32_t ph13 = r7 ^ abcd5;
333  const uint32_t ph03 = r8 ^ abcd5;
334  const uint32_t pl13 = r9 ^ abcd6;
335  const uint32_t pl03 = r10 ^ abcd6;
336  const uint32_t pr3 = vr3 ^ r11;
337  const uint32_t qr3 = wr3 ^ r11;
338  const uint32_t p4 = ph13 ^ pr3;
339  const uint32_t S7 = ph03 ^ qr3;
340  const uint32_t p6 = pl13 ^ pr3;
341  const uint32_t p7 = pl03 ^ qr3;
342  const uint32_t S3 = p1 ^ p6;
343  const uint32_t S6 = p2 ^ p6;
344  const uint32_t S0 = p3 ^ p6;
345  const uint32_t X11 = p0 ^ p2;
346  const uint32_t S5 = S0 ^ X11;
347  const uint32_t X13 = p4 ^ p7;
348  const uint32_t X14 = X11 ^ X13;
349  const uint32_t S1 = S3 ^ X14;
350  const uint32_t X16 = p1 ^ S7;
351  const uint32_t S2 = X14 ^ X16;
352  const uint32_t X18 = p0 ^ p4;
353  const uint32_t X19 = S5 ^ X16;
354  const uint32_t S4 = X18 ^ X19;
355 
 // Overwrite the input planes with the substituted planes
356  V[0] = S0;
357  V[1] = S1;
358  V[2] = S2;
359  V[3] = S3;
360  V[4] = S4;
361  V[5] = S5;
362  V[6] = S6;
363  V[7] = S7;
364  }
365 
366 inline void bit_transpose(uint32_t B[8])
367  {
368  swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1);
369  swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1);
370  swap_bits<uint32_t>(B[5], B[4], 0x55555555, 1);
371  swap_bits<uint32_t>(B[7], B[6], 0x55555555, 1);
372 
373  swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2);
374  swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2);
375  swap_bits<uint32_t>(B[6], B[4], 0x33333333, 2);
376  swap_bits<uint32_t>(B[7], B[5], 0x33333333, 2);
377 
378  swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4);
379  swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4);
380  swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4);
381  swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4);
382  }
383 
384 inline void ks_expand(uint32_t B[8], const uint32_t K[], size_t r)
385  {
386  /*
387  This is bit_transpose of K[r..r+4] || K[r..r+4], we can save some computation
388  due to knowing the first and second halves are the same data.
389  */
390  for(size_t i = 0; i != 4; ++i)
391  B[i] = K[r + i];
392 
393  swap_bits<uint32_t>(B[1], B[0], 0x55555555, 1);
394  swap_bits<uint32_t>(B[3], B[2], 0x55555555, 1);
395 
396  swap_bits<uint32_t>(B[2], B[0], 0x33333333, 2);
397  swap_bits<uint32_t>(B[3], B[1], 0x33333333, 2);
398 
399  B[4] = B[0];
400  B[5] = B[1];
401  B[6] = B[2];
402  B[7] = B[3];
403 
404  swap_bits<uint32_t>(B[4], B[0], 0x0F0F0F0F, 4);
405  swap_bits<uint32_t>(B[5], B[1], 0x0F0F0F0F, 4);
406  swap_bits<uint32_t>(B[6], B[2], 0x0F0F0F0F, 4);
407  swap_bits<uint32_t>(B[7], B[3], 0x0F0F0F0F, 4);
408  }
409 
410 inline void shift_rows(uint32_t B[8])
411  {
412  // 3 0 1 2 7 4 5 6 10 11 8 9 14 15 12 13 17 18 19 16 21 22 23 20 24 25 26 27 28 29 30 31
413 #if defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT)
414  for(size_t i = 0; i != 8; i += 2)
415  {
416  uint64_t x = (static_cast<uint64_t>(B[i]) << 32) | B[i+1];
417  x = bit_permute_step<uint64_t>(x, 0x0022331100223311, 2);
418  x = bit_permute_step<uint64_t>(x, 0x0055005500550055, 1);
419  B[i] = static_cast<uint32_t>(x >> 32);
420  B[i+1] = static_cast<uint32_t>(x);
421  }
422 #else
423  for(size_t i = 0; i != 8; ++i)
424  {
425  uint32_t x = B[i];
426  x = bit_permute_step<uint32_t>(x, 0x00223311, 2);
427  x = bit_permute_step<uint32_t>(x, 0x00550055, 1);
428  B[i] = x;
429  }
430 #endif
431  }
432 
433 inline void inv_shift_rows(uint32_t B[8])
434  {
435  // Inverse of shift_rows, just inverting the steps
436 
437 #if defined(BOTAN_TARGET_CPU_HAS_NATIVE_64BIT)
438  for(size_t i = 0; i != 8; i += 2)
439  {
440  uint64_t x = (static_cast<uint64_t>(B[i]) << 32) | B[i+1];
441  x = bit_permute_step<uint64_t>(x, 0x0055005500550055, 1);
442  x = bit_permute_step<uint64_t>(x, 0x0022331100223311, 2);
443  B[i] = static_cast<uint32_t>(x >> 32);
444  B[i+1] = static_cast<uint32_t>(x);
445  }
446 #else
447  for(size_t i = 0; i != 8; ++i)
448  {
449  uint32_t x = B[i];
450  x = bit_permute_step<uint32_t>(x, 0x00550055, 1);
451  x = bit_permute_step<uint32_t>(x, 0x00223311, 2);
452  B[i] = x;
453  }
454 #endif
455  }
456 
457 inline void mix_columns(uint32_t B[8])
458  {
459  // carry high bits in B[0] to positions in 0x1b == 0b11011
460  const uint32_t X2[8] = {
461  B[1],
462  B[2],
463  B[3],
464  B[4] ^ B[0],
465  B[5] ^ B[0],
466  B[6],
467  B[7] ^ B[0],
468  B[0],
469  };
470 
471  for(size_t i = 0; i != 8; i++)
472  {
473  const uint32_t X3 = B[i] ^ X2[i];
474  B[i] = X2[i] ^ rotr<8>(B[i]) ^ rotr<16>(B[i]) ^ rotr<24>(X3);
475  }
476  }
477 
478 void inv_mix_columns(uint32_t B[8])
479  {
480  /*
481  OpenSSL's bsaes implementation credits Jussi Kivilinna with the lovely
482  matrix decomposition
483 
484  | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
485  | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
486  | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
487  | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
488 
489  Notice the first component is simply the MixColumns matrix. So we can
490  multiply first by (05,00,04,00) then perform MixColumns to get the equivalent
491  of InvMixColumn.
492  */
493  const uint32_t X4[8] = {
494  B[2],
495  B[3],
496  B[4] ^ B[0],
497  B[5] ^ B[0] ^ B[1],
498  B[6] ^ B[1],
499  B[7] ^ B[0],
500  B[0] ^ B[1],
501  B[1],
502  };
503 
504  for(size_t i = 0; i != 8; i++)
505  {
506  const uint32_t X5 = X4[i] ^ B[i];
507  B[i] = X5 ^ rotr<16>(X4[i]);
508  }
509 
510  mix_columns(B);
511  }
512 
513 /*
514 * AES Encryption
515 */
516 void aes_encrypt_n(const uint8_t in[], uint8_t out[],
517  size_t blocks,
518  const secure_vector<uint32_t>& EK)
519  {
520  BOTAN_ASSERT(EK.size() == 44 || EK.size() == 52 || EK.size() == 60, "Key was set");
521 
522  const size_t rounds = (EK.size() - 4) / 4;
523 
524  uint32_t KS[13*8] = { 0 }; // actual maximum is (rounds - 1) * 8
525  for(size_t i = 0; i < rounds - 1; i += 1)
526  {
527  ks_expand(&KS[8*i], EK.data(), 4*i + 4);
528  }
529 
530  const size_t BLOCK_SIZE = 16;
531  const size_t BITSLICED_BLOCKS = 8*sizeof(uint32_t) / BLOCK_SIZE;
532 
533  while(blocks > 0)
534  {
535  const size_t this_loop = std::min(blocks, BITSLICED_BLOCKS);
536 
537  uint32_t B[8] = { 0 };
538 
539  load_be(B, in, this_loop*4);
540 
541  for(size_t i = 0; i != 8; ++i)
542  B[i] ^= EK[i % 4];
543 
544  bit_transpose(B);
545 
546  for(size_t r = 0; r != rounds - 1; ++r)
547  {
548  AES_SBOX(B);
549  shift_rows(B);
550  mix_columns(B);
551 
552  for(size_t i = 0; i != 8; ++i)
553  B[i] ^= KS[8*r + i];
554  }
555 
556  // Final round:
557  AES_SBOX(B);
558  shift_rows(B);
559  bit_transpose(B);
560 
561  for(size_t i = 0; i != 8; ++i)
562  B[i] ^= EK[4*rounds + i % 4];
563 
564  copy_out_be(out, this_loop*4*sizeof(uint32_t), B);
565 
566  in += this_loop * BLOCK_SIZE;
567  out += this_loop * BLOCK_SIZE;
568  blocks -= this_loop;
569  }
570  }
571 
572 /*
573 * AES Decryption
574 */
575 void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks,
576  const secure_vector<uint32_t>& DK)
577  {
578  BOTAN_ASSERT(DK.size() == 44 || DK.size() == 52 || DK.size() == 60, "Key was set");
579 
580  const size_t rounds = (DK.size() - 4) / 4;
581 
582  uint32_t KS[13*8] = { 0 }; // actual maximum is (rounds - 1) * 8
583  for(size_t i = 0; i < rounds - 1; i += 1)
584  {
585  ks_expand(&KS[8*i], DK.data(), 4*i + 4);
586  }
587 
588  const size_t BLOCK_SIZE = 16;
589  const size_t BITSLICED_BLOCKS = 8*sizeof(uint32_t) / BLOCK_SIZE;
590 
591  while(blocks > 0)
592  {
593  const size_t this_loop = std::min(blocks, BITSLICED_BLOCKS);
594 
595  uint32_t B[8] = { 0 };
596 
597  load_be(B, in, this_loop*4);
598 
599  for(size_t i = 0; i != 8; ++i)
600  B[i] ^= DK[i % 4];
601 
602  bit_transpose(B);
603 
604  for(size_t r = 0; r != rounds - 1; ++r)
605  {
606  AES_INV_SBOX(B);
607  inv_shift_rows(B);
608  inv_mix_columns(B);
609 
610  for(size_t i = 0; i != 8; ++i)
611  B[i] ^= KS[8*r + i];
612  }
613 
614  // Final round:
615  AES_INV_SBOX(B);
616  inv_shift_rows(B);
617  bit_transpose(B);
618 
619  for(size_t i = 0; i != 8; ++i)
620  B[i] ^= DK[4*rounds + i % 4];
621 
622  copy_out_be(out, this_loop*4*sizeof(uint32_t), B);
623 
624  in += this_loop * BLOCK_SIZE;
625  out += this_loop * BLOCK_SIZE;
626  blocks -= this_loop;
627  }
628  }
629 
/*
* Doubles each of the four bytes packed in s independently: every byte is
* shifted left by one, and 0x1B is XORed into any byte whose top bit was set.
*/
inline uint32_t xtime32(uint32_t s)
   {
   // Per-byte left shift; the mask stops bits crossing into the next byte
   const uint32_t shifted = (s << 1) & 0xFEFEFEFE;

   // 0x01 in every byte whose top bit was set
   const uint32_t overflow = (s >> 7) & 0x01010101;

   return shifted ^ (overflow * 0x1B);
   }
638 
639 inline uint32_t InvMixColumn(uint32_t s1)
640  {
641  const uint32_t s2 = xtime32(s1);
642  const uint32_t s4 = xtime32(s2);
643  const uint32_t s8 = xtime32(s4);
644  const uint32_t s9 = s8 ^ s1;
645  const uint32_t s11 = s9 ^ s2;
646  const uint32_t s13 = s9 ^ s4;
647  const uint32_t s14 = s8 ^ s4 ^ s2;
648 
649  return s14 ^ rotr<8>(s9) ^ rotr<16>(s13) ^ rotr<24>(s11);
650  }
651 
652 void InvMixColumn_x4(uint32_t x[4])
653  {
654  x[0] = InvMixColumn(x[0]);
655  x[1] = InvMixColumn(x[1]);
656  x[2] = InvMixColumn(x[2]);
657  x[3] = InvMixColumn(x[3]);
658  }
659 
660 uint32_t SE_word(uint32_t x)
661  {
662  uint32_t I[8] = { 0 };
663 
664  for(size_t i = 0; i != 8; ++i)
665  I[i] = (x >> (7-i)) & 0x01010101;
666 
667  AES_SBOX(I);
668 
669  x = 0;
670 
671  for(size_t i = 0; i != 8; ++i)
672  x |= ((I[i] & 0x01010101) << (7-i));
673 
674  return x;
675  }
676 
677 void aes_key_schedule(const uint8_t key[], size_t length,
678  secure_vector<uint32_t>& EK,
679  secure_vector<uint32_t>& DK,
680  bool bswap_keys = false)
681  {
682  static const uint32_t RC[10] = {
683  0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000,
684  0x20000000, 0x40000000, 0x80000000, 0x1B000000, 0x36000000 };
685 
686  const size_t X = length / 4;
687 
688  // Can't happen, but make static analyzers happy
689  BOTAN_ASSERT_NOMSG(X == 4 || X == 6 || X == 8);
690 
691  const size_t rounds = (length / 4) + 6;
692 
693  // Help the optimizer
694  BOTAN_ASSERT_NOMSG(rounds == 10 || rounds == 12 || rounds == 14);
695 
696  CT::poison(key, length);
697 
698  EK.resize(length + 28);
699  DK.resize(length + 28);
700 
701  for(size_t i = 0; i != X; ++i)
702  EK[i] = load_be<uint32_t>(key, i);
703 
704  for(size_t i = X; i < 4*(rounds+1); i += X)
705  {
706  EK[i] = EK[i-X] ^ RC[(i-X)/X] ^ rotl<8>(SE_word(EK[i-1]));
707 
708  for(size_t j = 1; j != X && (i+j) < EK.size(); ++j)
709  {
710  EK[i+j] = EK[i+j-X];
711 
712  if(X == 8 && j == 4)
713  EK[i+j] ^= SE_word(EK[i+j-1]);
714  else
715  EK[i+j] ^= EK[i+j-1];
716  }
717  }
718 
719  for(size_t i = 0; i != 4*(rounds+1); i += 4)
720  {
721  DK[i ] = EK[4*rounds - i ];
722  DK[i+1] = EK[4*rounds - i+1];
723  DK[i+2] = EK[4*rounds - i+2];
724  DK[i+3] = EK[4*rounds - i+3];
725  }
726 
727  for(size_t i = 4; i != 4*rounds; i += 4)
728  {
729  InvMixColumn_x4(&DK[i]);
730  }
731 
732  if(bswap_keys)
733  {
734  // HW AES on little endian needs the subkeys to be byte reversed
735  for(size_t i = 0; i != EK.size(); ++i)
736  EK[i] = reverse_bytes(EK[i]);
737  for(size_t i = 0; i != DK.size(); ++i)
738  DK[i] = reverse_bytes(DK[i]);
739  }
740 
741  CT::unpoison(EK.data(), EK.size());
742  CT::unpoison(DK.data(), DK.size());
743  CT::unpoison(key, length);
744  }
745 
/*
* Number of blocks the selected backend prefers to process together:
* 4 for hardware AES, 2 for vperm, 2 for the bitsliced fallback.
*/
size_t aes_parallelism()
   {
#if defined(BOTAN_HAS_HW_AES_SUPPORT)
   if(CPUID::has_hw_aes())
      return 4; // pipelined
#endif

#if defined(BOTAN_HAS_AES_VPERM)
   if(CPUID::has_vperm())
      return 2; // pipelined
#endif

   const size_t bitsliced_width = 2;
   return bitsliced_width;
   }
765 
/*
* Name of the backend that will be used: "cpu" for hardware AES,
* "vperm" for the vector permute code, otherwise "base".
*/
const char* aes_provider()
   {
#if defined(BOTAN_HAS_HW_AES_SUPPORT)
   if(CPUID::has_hw_aes())
      return "cpu";
#endif

#if defined(BOTAN_HAS_AES_VPERM)
   if(CPUID::has_vperm())
      return "vperm";
#endif

   const char* fallback = "base";
   return fallback;
   }
784 
785 }
786 
787 std::string AES_128::provider() const { return aes_provider(); }
788 std::string AES_192::provider() const { return aes_provider(); }
789 std::string AES_256::provider() const { return aes_provider(); }
790 
791 size_t AES_128::parallelism() const { return aes_parallelism(); }
792 size_t AES_192::parallelism() const { return aes_parallelism(); }
793 size_t AES_256::parallelism() const { return aes_parallelism(); }
794 
795 void AES_128::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
796  {
797  verify_key_set(m_EK.empty() == false);
798 
799 #if defined(BOTAN_HAS_HW_AES_SUPPORT)
800  if(CPUID::has_hw_aes())
801  {
802  return hw_aes_encrypt_n(in, out, blocks);
803  }
804 #endif
805 
806 #if defined(BOTAN_HAS_AES_VPERM)
807  if(CPUID::has_vperm())
808  {
809  return vperm_encrypt_n(in, out, blocks);
810  }
811 #endif
812 
813  aes_encrypt_n(in, out, blocks, m_EK);
814  }
815 
816 void AES_128::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
817  {
818  verify_key_set(m_DK.empty() == false);
819 
820 #if defined(BOTAN_HAS_HW_AES_SUPPORT)
821  if(CPUID::has_hw_aes())
822  {
823  return hw_aes_decrypt_n(in, out, blocks);
824  }
825 #endif
826 
827 #if defined(BOTAN_HAS_AES_VPERM)
828  if(CPUID::has_vperm())
829  {
830  return vperm_decrypt_n(in, out, blocks);
831  }
832 #endif
833 
834  aes_decrypt_n(in, out, blocks, m_DK);
835  }
836 
837 void AES_128::key_schedule(const uint8_t key[], size_t length)
838  {
839 #if defined(BOTAN_HAS_AES_NI)
840  if(CPUID::has_aes_ni())
841  {
842  return aesni_key_schedule(key, length);
843  }
844 #endif
845 
846 #if defined(BOTAN_HAS_HW_AES_SUPPORT)
847  if(CPUID::has_hw_aes())
848  {
849  return aes_key_schedule(key, length, m_EK, m_DK, CPUID::is_little_endian());
850  }
851 #endif
852 
853 #if defined(BOTAN_HAS_AES_VPERM)
854  if(CPUID::has_vperm())
855  {
856  return vperm_key_schedule(key, length);
857  }
858 #endif
859 
860  aes_key_schedule(key, length, m_EK, m_DK);
861  }
862 
864  {
865  zap(m_EK);
866  zap(m_DK);
867  }
868 
869 void AES_192::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
870  {
871  verify_key_set(m_EK.empty() == false);
872 
873 #if defined(BOTAN_HAS_HW_AES_SUPPORT)
874  if(CPUID::has_hw_aes())
875  {
876  return hw_aes_encrypt_n(in, out, blocks);
877  }
878 #endif
879 
880 #if defined(BOTAN_HAS_AES_VPERM)
881  if(CPUID::has_vperm())
882  {
883  return vperm_encrypt_n(in, out, blocks);
884  }
885 #endif
886 
887  aes_encrypt_n(in, out, blocks, m_EK);
888  }
889 
890 void AES_192::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
891  {
892  verify_key_set(m_DK.empty() == false);
893 
894 #if defined(BOTAN_HAS_HW_AES_SUPPORT)
895  if(CPUID::has_hw_aes())
896  {
897  return hw_aes_decrypt_n(in, out, blocks);
898  }
899 #endif
900 
901 #if defined(BOTAN_HAS_AES_VPERM)
902  if(CPUID::has_vperm())
903  {
904  return vperm_decrypt_n(in, out, blocks);
905  }
906 #endif
907 
908  aes_decrypt_n(in, out, blocks, m_DK);
909  }
910 
911 void AES_192::key_schedule(const uint8_t key[], size_t length)
912  {
913 #if defined(BOTAN_HAS_AES_NI)
914  if(CPUID::has_aes_ni())
915  {
916  return aesni_key_schedule(key, length);
917  }
918 #endif
919 
920 #if defined(BOTAN_HAS_HW_AES_SUPPORT)
921  if(CPUID::has_hw_aes())
922  {
923  return aes_key_schedule(key, length, m_EK, m_DK, CPUID::is_little_endian());
924  }
925 #endif
926 
927 #if defined(BOTAN_HAS_AES_VPERM)
928  if(CPUID::has_vperm())
929  {
930  return vperm_key_schedule(key, length);
931  }
932 #endif
933 
934  aes_key_schedule(key, length, m_EK, m_DK);
935  }
936 
938  {
939  zap(m_EK);
940  zap(m_DK);
941  }
942 
943 void AES_256::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
944  {
945  verify_key_set(m_EK.empty() == false);
946 
947 #if defined(BOTAN_HAS_HW_AES_SUPPORT)
948  if(CPUID::has_hw_aes())
949  {
950  return hw_aes_encrypt_n(in, out, blocks);
951  }
952 #endif
953 
954 #if defined(BOTAN_HAS_AES_VPERM)
955  if(CPUID::has_vperm())
956  {
957  return vperm_encrypt_n(in, out, blocks);
958  }
959 #endif
960 
961  aes_encrypt_n(in, out, blocks, m_EK);
962  }
963 
964 void AES_256::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
965  {
966  verify_key_set(m_DK.empty() == false);
967 
968 #if defined(BOTAN_HAS_HW_AES_SUPPORT)
969  if(CPUID::has_hw_aes())
970  {
971  return hw_aes_decrypt_n(in, out, blocks);
972  }
973 #endif
974 
975 #if defined(BOTAN_HAS_AES_VPERM)
976  if(CPUID::has_vperm())
977  {
978  return vperm_decrypt_n(in, out, blocks);
979  }
980 #endif
981 
982  aes_decrypt_n(in, out, blocks, m_DK);
983  }
984 
985 void AES_256::key_schedule(const uint8_t key[], size_t length)
986  {
987 #if defined(BOTAN_HAS_AES_NI)
988  if(CPUID::has_aes_ni())
989  {
990  return aesni_key_schedule(key, length);
991  }
992 #endif
993 
994 #if defined(BOTAN_HAS_HW_AES_SUPPORT)
995  if(CPUID::has_hw_aes())
996  {
997  return aes_key_schedule(key, length, m_EK, m_DK, CPUID::is_little_endian());
998  }
999 #endif
1000 
1001 #if defined(BOTAN_HAS_AES_VPERM)
1002  if(CPUID::has_vperm())
1003  {
1004  return vperm_key_schedule(key, length);
1005  }
1006 #endif
1007 
1008  aes_key_schedule(key, length, m_EK, m_DK);
1009  }
1010 
1012  {
1013  zap(m_EK);
1014  zap(m_DK);
1015  }
1016 
1017 }
fe X
Definition: ge.cpp:27
void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition: aes.cpp:795
void verify_key_set(bool cond) const
Definition: sym_algo.h:171
void zap(std::vector< T, Alloc > &vec)
Definition: secmem.h:127
void decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition: aes.cpp:890
uint32_t load_be< uint32_t >(const uint8_t in[], size_t off)
Definition: loadstor.h:179
void clear() override
Definition: aes.cpp:937
void clear() override
Definition: aes.cpp:863
#define BOTAN_ASSERT_NOMSG(expr)
Definition: assert.h:68
void poison(const T *p, size_t n)
Definition: ct_utils.h:48
void clear() override
Definition: aes.cpp:1011
size_t parallelism() const override
Definition: aes.cpp:792
#define BOTAN_ASSERT(expr, assertion_made)
Definition: assert.h:55
std::string provider() const override
Definition: aes.cpp:788
T load_be(const uint8_t in[], size_t off)
Definition: loadstor.h:107
std::string provider() const override
Definition: aes.cpp:787
void decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition: aes.cpp:816
static bool is_little_endian()
Definition: cpuid.h:73
Definition: alg_id.cpp:13
uint16_t reverse_bytes(uint16_t val)
Definition: bswap.h:25
void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition: aes.cpp:869
size_t parallelism() const override
Definition: aes.cpp:793
static bool has_vperm()
Definition: cpuid.h:362
std::string provider() const override
Definition: aes.cpp:789
static bool has_hw_aes()
Definition: cpuid.h:378
void decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition: aes.cpp:964
newhope_poly poly
Definition: newhope.cpp:25
void unpoison(const T *p, size_t n)
Definition: ct_utils.h:59
void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override
Definition: aes.cpp:943
void copy_out_be(uint8_t out[], size_t out_bytes, const T in[])
Definition: loadstor.h:658
size_t parallelism() const override
Definition: aes.cpp:791