Botan  2.7.0
Crypto and TLS for C++11
aes_ni.cpp
Go to the documentation of this file.
1 /*
2 * AES using AES-NI instructions
3 * (C) 2009,2012 Jack Lloyd
4 *
5 * Botan is released under the Simplified BSD License (see license.txt)
6 */
7 
8 #include <botan/aes.h>
9 #include <botan/loadstor.h>
10 #include <wmmintrin.h>
11 
12 namespace Botan {
13 
14 namespace {
15 
16 BOTAN_FUNC_ISA("ssse3")
17 __m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
18  {
19  key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3));
20  key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
21  key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
22  key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
23  return _mm_xor_si128(key, key_with_rcon);
24  }
25 
26 BOTAN_FUNC_ISA("ssse3")
27 void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
28  uint32_t out[], bool last)
29  {
30  __m128i key1 = *K1;
31  __m128i key2 = *K2;
32 
33  key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));
34  key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
35  key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
36  key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
37  key1 = _mm_xor_si128(key1, key2_with_rcon);
38 
39  *K1 = key1;
40  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), key1);
41 
42  if(last)
43  return;
44 
45  key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
46  key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3)));
47 
48  *K2 = key2;
49  out[4] = _mm_cvtsi128_si32(key2);
50  out[5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
51  }
52 
53 /*
54 * The second half of the AES-256 key expansion (other half same as AES-128)
55 */
56 BOTAN_FUNC_ISA("ssse3,aes")
57 __m128i aes_256_key_expansion(__m128i key, __m128i key2)
58  {
59  __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
60  key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2,2,2,2));
61 
62  key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
63  key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
64  key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
65  return _mm_xor_si128(key, key_with_rcon);
66  }
67 
68 }
69 
/*
* Apply one AESENC round with round key RK to each of the four blocks
* B0..B3, which must be declared at the point of expansion.
*/
#define AES_ENC_4_ROUNDS(RK)              \
   do                                     \
      {                                   \
      B0 = _mm_aesenc_si128(B0, RK);      \
      B1 = _mm_aesenc_si128(B1, RK);      \
      B2 = _mm_aesenc_si128(B2, RK);      \
      B3 = _mm_aesenc_si128(B3, RK);      \
      } while(0)
78 
/*
* Apply the final encryption round (AESENCLAST) with round key RK to each
* of the four blocks B0..B3, which must be declared at the point of expansion.
*/
#define AES_ENC_4_LAST_ROUNDS(RK)             \
   do                                         \
      {                                       \
      B0 = _mm_aesenclast_si128(B0, RK);      \
      B1 = _mm_aesenclast_si128(B1, RK);      \
      B2 = _mm_aesenclast_si128(B2, RK);      \
      B3 = _mm_aesenclast_si128(B3, RK);      \
      } while(0)
87 
/*
* Apply one AESDEC round with round key RK to each of the four blocks
* B0..B3, which must be declared at the point of expansion.
*/
#define AES_DEC_4_ROUNDS(RK)              \
   do                                     \
      {                                   \
      B0 = _mm_aesdec_si128(B0, RK);      \
      B1 = _mm_aesdec_si128(B1, RK);      \
      B2 = _mm_aesdec_si128(B2, RK);      \
      B3 = _mm_aesdec_si128(B3, RK);      \
      } while(0)
96 
/*
* Apply the final decryption round (AESDECLAST) with round key RK to each
* of the four blocks B0..B3, which must be declared at the point of expansion.
*/
#define AES_DEC_4_LAST_ROUNDS(RK)             \
   do                                         \
      {                                       \
      B0 = _mm_aesdeclast_si128(B0, RK);      \
      B1 = _mm_aesdeclast_si128(B1, RK);      \
      B2 = _mm_aesdeclast_si128(B2, RK);      \
      B3 = _mm_aesdeclast_si128(B3, RK);      \
      } while(0)
105 
106 /*
107 * AES-128 Encryption
108 */
109 BOTAN_FUNC_ISA("ssse3,aes")
110 void AES_128::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
111  {
112  BOTAN_ASSERT(m_EK.empty() == false, "Key was set");
113 
114  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
115  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
116 
117  const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
118 
119  const __m128i K0 = _mm_loadu_si128(key_mm);
120  const __m128i K1 = _mm_loadu_si128(key_mm + 1);
121  const __m128i K2 = _mm_loadu_si128(key_mm + 2);
122  const __m128i K3 = _mm_loadu_si128(key_mm + 3);
123  const __m128i K4 = _mm_loadu_si128(key_mm + 4);
124  const __m128i K5 = _mm_loadu_si128(key_mm + 5);
125  const __m128i K6 = _mm_loadu_si128(key_mm + 6);
126  const __m128i K7 = _mm_loadu_si128(key_mm + 7);
127  const __m128i K8 = _mm_loadu_si128(key_mm + 8);
128  const __m128i K9 = _mm_loadu_si128(key_mm + 9);
129  const __m128i K10 = _mm_loadu_si128(key_mm + 10);
130 
131  while(blocks >= 4)
132  {
133  __m128i B0 = _mm_loadu_si128(in_mm + 0);
134  __m128i B1 = _mm_loadu_si128(in_mm + 1);
135  __m128i B2 = _mm_loadu_si128(in_mm + 2);
136  __m128i B3 = _mm_loadu_si128(in_mm + 3);
137 
138  B0 = _mm_xor_si128(B0, K0);
139  B1 = _mm_xor_si128(B1, K0);
140  B2 = _mm_xor_si128(B2, K0);
141  B3 = _mm_xor_si128(B3, K0);
142 
143  AES_ENC_4_ROUNDS(K1);
144  AES_ENC_4_ROUNDS(K2);
145  AES_ENC_4_ROUNDS(K3);
146  AES_ENC_4_ROUNDS(K4);
147  AES_ENC_4_ROUNDS(K5);
148  AES_ENC_4_ROUNDS(K6);
149  AES_ENC_4_ROUNDS(K7);
150  AES_ENC_4_ROUNDS(K8);
151  AES_ENC_4_ROUNDS(K9);
153 
154  _mm_storeu_si128(out_mm + 0, B0);
155  _mm_storeu_si128(out_mm + 1, B1);
156  _mm_storeu_si128(out_mm + 2, B2);
157  _mm_storeu_si128(out_mm + 3, B3);
158 
159  blocks -= 4;
160  in_mm += 4;
161  out_mm += 4;
162  }
163 
164  for(size_t i = 0; i != blocks; ++i)
165  {
166  __m128i B = _mm_loadu_si128(in_mm + i);
167 
168  B = _mm_xor_si128(B, K0);
169 
170  B = _mm_aesenc_si128(B, K1);
171  B = _mm_aesenc_si128(B, K2);
172  B = _mm_aesenc_si128(B, K3);
173  B = _mm_aesenc_si128(B, K4);
174  B = _mm_aesenc_si128(B, K5);
175  B = _mm_aesenc_si128(B, K6);
176  B = _mm_aesenc_si128(B, K7);
177  B = _mm_aesenc_si128(B, K8);
178  B = _mm_aesenc_si128(B, K9);
179  B = _mm_aesenclast_si128(B, K10);
180 
181  _mm_storeu_si128(out_mm + i, B);
182  }
183  }
184 
185 /*
186 * AES-128 Decryption
187 */
188 BOTAN_FUNC_ISA("ssse3,aes")
189 void AES_128::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
190  {
191  BOTAN_ASSERT(m_DK.empty() == false, "Key was set");
192 
193  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
194  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
195 
196  const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
197 
198  const __m128i K0 = _mm_loadu_si128(key_mm);
199  const __m128i K1 = _mm_loadu_si128(key_mm + 1);
200  const __m128i K2 = _mm_loadu_si128(key_mm + 2);
201  const __m128i K3 = _mm_loadu_si128(key_mm + 3);
202  const __m128i K4 = _mm_loadu_si128(key_mm + 4);
203  const __m128i K5 = _mm_loadu_si128(key_mm + 5);
204  const __m128i K6 = _mm_loadu_si128(key_mm + 6);
205  const __m128i K7 = _mm_loadu_si128(key_mm + 7);
206  const __m128i K8 = _mm_loadu_si128(key_mm + 8);
207  const __m128i K9 = _mm_loadu_si128(key_mm + 9);
208  const __m128i K10 = _mm_loadu_si128(key_mm + 10);
209 
210  while(blocks >= 4)
211  {
212  __m128i B0 = _mm_loadu_si128(in_mm + 0);
213  __m128i B1 = _mm_loadu_si128(in_mm + 1);
214  __m128i B2 = _mm_loadu_si128(in_mm + 2);
215  __m128i B3 = _mm_loadu_si128(in_mm + 3);
216 
217  B0 = _mm_xor_si128(B0, K0);
218  B1 = _mm_xor_si128(B1, K0);
219  B2 = _mm_xor_si128(B2, K0);
220  B3 = _mm_xor_si128(B3, K0);
221 
222  AES_DEC_4_ROUNDS(K1);
223  AES_DEC_4_ROUNDS(K2);
224  AES_DEC_4_ROUNDS(K3);
225  AES_DEC_4_ROUNDS(K4);
226  AES_DEC_4_ROUNDS(K5);
227  AES_DEC_4_ROUNDS(K6);
228  AES_DEC_4_ROUNDS(K7);
229  AES_DEC_4_ROUNDS(K8);
230  AES_DEC_4_ROUNDS(K9);
232 
233  _mm_storeu_si128(out_mm + 0, B0);
234  _mm_storeu_si128(out_mm + 1, B1);
235  _mm_storeu_si128(out_mm + 2, B2);
236  _mm_storeu_si128(out_mm + 3, B3);
237 
238  blocks -= 4;
239  in_mm += 4;
240  out_mm += 4;
241  }
242 
243  for(size_t i = 0; i != blocks; ++i)
244  {
245  __m128i B = _mm_loadu_si128(in_mm + i);
246 
247  B = _mm_xor_si128(B, K0);
248 
249  B = _mm_aesdec_si128(B, K1);
250  B = _mm_aesdec_si128(B, K2);
251  B = _mm_aesdec_si128(B, K3);
252  B = _mm_aesdec_si128(B, K4);
253  B = _mm_aesdec_si128(B, K5);
254  B = _mm_aesdec_si128(B, K6);
255  B = _mm_aesdec_si128(B, K7);
256  B = _mm_aesdec_si128(B, K8);
257  B = _mm_aesdec_si128(B, K9);
258  B = _mm_aesdeclast_si128(B, K10);
259 
260  _mm_storeu_si128(out_mm + i, B);
261  }
262  }
263 
264 /*
265 * AES-128 Key Schedule
266 */
267 BOTAN_FUNC_ISA("ssse3,aes")
268 void AES_128::aesni_key_schedule(const uint8_t key[], size_t)
269  {
270  m_EK.resize(44);
271  m_DK.resize(44);
272 
273  #define AES_128_key_exp(K, RCON) \
274  aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON))
275 
276  const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
277  const __m128i K1 = AES_128_key_exp(K0, 0x01);
278  const __m128i K2 = AES_128_key_exp(K1, 0x02);
279  const __m128i K3 = AES_128_key_exp(K2, 0x04);
280  const __m128i K4 = AES_128_key_exp(K3, 0x08);
281  const __m128i K5 = AES_128_key_exp(K4, 0x10);
282  const __m128i K6 = AES_128_key_exp(K5, 0x20);
283  const __m128i K7 = AES_128_key_exp(K6, 0x40);
284  const __m128i K8 = AES_128_key_exp(K7, 0x80);
285  const __m128i K9 = AES_128_key_exp(K8, 0x1B);
286  const __m128i K10 = AES_128_key_exp(K9, 0x36);
287 
288  __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
289  _mm_storeu_si128(EK_mm , K0);
290  _mm_storeu_si128(EK_mm + 1, K1);
291  _mm_storeu_si128(EK_mm + 2, K2);
292  _mm_storeu_si128(EK_mm + 3, K3);
293  _mm_storeu_si128(EK_mm + 4, K4);
294  _mm_storeu_si128(EK_mm + 5, K5);
295  _mm_storeu_si128(EK_mm + 6, K6);
296  _mm_storeu_si128(EK_mm + 7, K7);
297  _mm_storeu_si128(EK_mm + 8, K8);
298  _mm_storeu_si128(EK_mm + 9, K9);
299  _mm_storeu_si128(EK_mm + 10, K10);
300 
301  // Now generate decryption keys
302 
303  __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
304  _mm_storeu_si128(DK_mm , K10);
305  _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9));
306  _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8));
307  _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7));
308  _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6));
309  _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5));
310  _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4));
311  _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3));
312  _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2));
313  _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1));
314  _mm_storeu_si128(DK_mm + 10, K0);
315  }
316 
317 /*
318 * AES-192 Encryption
319 */
320 BOTAN_FUNC_ISA("ssse3,aes")
321 void AES_192::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
322  {
323  BOTAN_ASSERT(m_EK.empty() == false, "Key was set");
324 
325  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
326  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
327 
328  const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
329 
330  const __m128i K0 = _mm_loadu_si128(key_mm);
331  const __m128i K1 = _mm_loadu_si128(key_mm + 1);
332  const __m128i K2 = _mm_loadu_si128(key_mm + 2);
333  const __m128i K3 = _mm_loadu_si128(key_mm + 3);
334  const __m128i K4 = _mm_loadu_si128(key_mm + 4);
335  const __m128i K5 = _mm_loadu_si128(key_mm + 5);
336  const __m128i K6 = _mm_loadu_si128(key_mm + 6);
337  const __m128i K7 = _mm_loadu_si128(key_mm + 7);
338  const __m128i K8 = _mm_loadu_si128(key_mm + 8);
339  const __m128i K9 = _mm_loadu_si128(key_mm + 9);
340  const __m128i K10 = _mm_loadu_si128(key_mm + 10);
341  const __m128i K11 = _mm_loadu_si128(key_mm + 11);
342  const __m128i K12 = _mm_loadu_si128(key_mm + 12);
343 
344  while(blocks >= 4)
345  {
346  __m128i B0 = _mm_loadu_si128(in_mm + 0);
347  __m128i B1 = _mm_loadu_si128(in_mm + 1);
348  __m128i B2 = _mm_loadu_si128(in_mm + 2);
349  __m128i B3 = _mm_loadu_si128(in_mm + 3);
350 
351  B0 = _mm_xor_si128(B0, K0);
352  B1 = _mm_xor_si128(B1, K0);
353  B2 = _mm_xor_si128(B2, K0);
354  B3 = _mm_xor_si128(B3, K0);
355 
356  AES_ENC_4_ROUNDS(K1);
357  AES_ENC_4_ROUNDS(K2);
358  AES_ENC_4_ROUNDS(K3);
359  AES_ENC_4_ROUNDS(K4);
360  AES_ENC_4_ROUNDS(K5);
361  AES_ENC_4_ROUNDS(K6);
362  AES_ENC_4_ROUNDS(K7);
363  AES_ENC_4_ROUNDS(K8);
364  AES_ENC_4_ROUNDS(K9);
365  AES_ENC_4_ROUNDS(K10);
366  AES_ENC_4_ROUNDS(K11);
368 
369  _mm_storeu_si128(out_mm + 0, B0);
370  _mm_storeu_si128(out_mm + 1, B1);
371  _mm_storeu_si128(out_mm + 2, B2);
372  _mm_storeu_si128(out_mm + 3, B3);
373 
374  blocks -= 4;
375  in_mm += 4;
376  out_mm += 4;
377  }
378 
379  for(size_t i = 0; i != blocks; ++i)
380  {
381  __m128i B = _mm_loadu_si128(in_mm + i);
382 
383  B = _mm_xor_si128(B, K0);
384 
385  B = _mm_aesenc_si128(B, K1);
386  B = _mm_aesenc_si128(B, K2);
387  B = _mm_aesenc_si128(B, K3);
388  B = _mm_aesenc_si128(B, K4);
389  B = _mm_aesenc_si128(B, K5);
390  B = _mm_aesenc_si128(B, K6);
391  B = _mm_aesenc_si128(B, K7);
392  B = _mm_aesenc_si128(B, K8);
393  B = _mm_aesenc_si128(B, K9);
394  B = _mm_aesenc_si128(B, K10);
395  B = _mm_aesenc_si128(B, K11);
396  B = _mm_aesenclast_si128(B, K12);
397 
398  _mm_storeu_si128(out_mm + i, B);
399  }
400  }
401 
402 /*
403 * AES-192 Decryption
404 */
405 BOTAN_FUNC_ISA("ssse3,aes")
406 void AES_192::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
407  {
408  BOTAN_ASSERT(m_DK.empty() == false, "Key was set");
409 
410  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
411  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
412 
413  const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
414 
415  const __m128i K0 = _mm_loadu_si128(key_mm);
416  const __m128i K1 = _mm_loadu_si128(key_mm + 1);
417  const __m128i K2 = _mm_loadu_si128(key_mm + 2);
418  const __m128i K3 = _mm_loadu_si128(key_mm + 3);
419  const __m128i K4 = _mm_loadu_si128(key_mm + 4);
420  const __m128i K5 = _mm_loadu_si128(key_mm + 5);
421  const __m128i K6 = _mm_loadu_si128(key_mm + 6);
422  const __m128i K7 = _mm_loadu_si128(key_mm + 7);
423  const __m128i K8 = _mm_loadu_si128(key_mm + 8);
424  const __m128i K9 = _mm_loadu_si128(key_mm + 9);
425  const __m128i K10 = _mm_loadu_si128(key_mm + 10);
426  const __m128i K11 = _mm_loadu_si128(key_mm + 11);
427  const __m128i K12 = _mm_loadu_si128(key_mm + 12);
428 
429  while(blocks >= 4)
430  {
431  __m128i B0 = _mm_loadu_si128(in_mm + 0);
432  __m128i B1 = _mm_loadu_si128(in_mm + 1);
433  __m128i B2 = _mm_loadu_si128(in_mm + 2);
434  __m128i B3 = _mm_loadu_si128(in_mm + 3);
435 
436  B0 = _mm_xor_si128(B0, K0);
437  B1 = _mm_xor_si128(B1, K0);
438  B2 = _mm_xor_si128(B2, K0);
439  B3 = _mm_xor_si128(B3, K0);
440 
441  AES_DEC_4_ROUNDS(K1);
442  AES_DEC_4_ROUNDS(K2);
443  AES_DEC_4_ROUNDS(K3);
444  AES_DEC_4_ROUNDS(K4);
445  AES_DEC_4_ROUNDS(K5);
446  AES_DEC_4_ROUNDS(K6);
447  AES_DEC_4_ROUNDS(K7);
448  AES_DEC_4_ROUNDS(K8);
449  AES_DEC_4_ROUNDS(K9);
450  AES_DEC_4_ROUNDS(K10);
451  AES_DEC_4_ROUNDS(K11);
453 
454  _mm_storeu_si128(out_mm + 0, B0);
455  _mm_storeu_si128(out_mm + 1, B1);
456  _mm_storeu_si128(out_mm + 2, B2);
457  _mm_storeu_si128(out_mm + 3, B3);
458 
459  blocks -= 4;
460  in_mm += 4;
461  out_mm += 4;
462  }
463 
464  for(size_t i = 0; i != blocks; ++i)
465  {
466  __m128i B = _mm_loadu_si128(in_mm + i);
467 
468  B = _mm_xor_si128(B, K0);
469 
470  B = _mm_aesdec_si128(B, K1);
471  B = _mm_aesdec_si128(B, K2);
472  B = _mm_aesdec_si128(B, K3);
473  B = _mm_aesdec_si128(B, K4);
474  B = _mm_aesdec_si128(B, K5);
475  B = _mm_aesdec_si128(B, K6);
476  B = _mm_aesdec_si128(B, K7);
477  B = _mm_aesdec_si128(B, K8);
478  B = _mm_aesdec_si128(B, K9);
479  B = _mm_aesdec_si128(B, K10);
480  B = _mm_aesdec_si128(B, K11);
481  B = _mm_aesdeclast_si128(B, K12);
482 
483  _mm_storeu_si128(out_mm + i, B);
484  }
485  }
486 
487 /*
488 * AES-192 Key Schedule
489 */
490 BOTAN_FUNC_ISA("ssse3,aes")
491 void AES_192::aesni_key_schedule(const uint8_t key[], size_t)
492  {
493  m_EK.resize(52);
494  m_DK.resize(52);
495 
496  __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
497  __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8));
498  K1 = _mm_srli_si128(K1, 8);
499 
500  load_le(m_EK.data(), key, 6);
501 
502  #define AES_192_key_exp(RCON, EK_OFF) \
503  aes_192_key_expansion(&K0, &K1, \
504  _mm_aeskeygenassist_si128(K1, RCON), \
505  &m_EK[EK_OFF], EK_OFF == 48)
506 
507  AES_192_key_exp(0x01, 6);
508  AES_192_key_exp(0x02, 12);
509  AES_192_key_exp(0x04, 18);
510  AES_192_key_exp(0x08, 24);
511  AES_192_key_exp(0x10, 30);
512  AES_192_key_exp(0x20, 36);
513  AES_192_key_exp(0x40, 42);
514  AES_192_key_exp(0x80, 48);
515 
516  #undef AES_192_key_exp
517 
518  // Now generate decryption keys
519  const __m128i* EK_mm = reinterpret_cast<const __m128i*>(m_EK.data());
520 
521  __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
522  _mm_storeu_si128(DK_mm , _mm_loadu_si128(EK_mm + 12));
523  _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
524  _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
525  _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
526  _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
527  _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
528  _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
529  _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
530  _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
531  _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
532  _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
533  _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
534  _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
535  }
536 
537 /*
538 * AES-256 Encryption
539 */
540 BOTAN_FUNC_ISA("ssse3,aes")
541 void AES_256::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
542  {
543  BOTAN_ASSERT(m_EK.empty() == false, "Key was set");
544 
545  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
546  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
547 
548  const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
549 
550  const __m128i K0 = _mm_loadu_si128(key_mm);
551  const __m128i K1 = _mm_loadu_si128(key_mm + 1);
552  const __m128i K2 = _mm_loadu_si128(key_mm + 2);
553  const __m128i K3 = _mm_loadu_si128(key_mm + 3);
554  const __m128i K4 = _mm_loadu_si128(key_mm + 4);
555  const __m128i K5 = _mm_loadu_si128(key_mm + 5);
556  const __m128i K6 = _mm_loadu_si128(key_mm + 6);
557  const __m128i K7 = _mm_loadu_si128(key_mm + 7);
558  const __m128i K8 = _mm_loadu_si128(key_mm + 8);
559  const __m128i K9 = _mm_loadu_si128(key_mm + 9);
560  const __m128i K10 = _mm_loadu_si128(key_mm + 10);
561  const __m128i K11 = _mm_loadu_si128(key_mm + 11);
562  const __m128i K12 = _mm_loadu_si128(key_mm + 12);
563  const __m128i K13 = _mm_loadu_si128(key_mm + 13);
564  const __m128i K14 = _mm_loadu_si128(key_mm + 14);
565 
566  while(blocks >= 4)
567  {
568  __m128i B0 = _mm_loadu_si128(in_mm + 0);
569  __m128i B1 = _mm_loadu_si128(in_mm + 1);
570  __m128i B2 = _mm_loadu_si128(in_mm + 2);
571  __m128i B3 = _mm_loadu_si128(in_mm + 3);
572 
573  B0 = _mm_xor_si128(B0, K0);
574  B1 = _mm_xor_si128(B1, K0);
575  B2 = _mm_xor_si128(B2, K0);
576  B3 = _mm_xor_si128(B3, K0);
577 
578  AES_ENC_4_ROUNDS(K1);
579  AES_ENC_4_ROUNDS(K2);
580  AES_ENC_4_ROUNDS(K3);
581  AES_ENC_4_ROUNDS(K4);
582  AES_ENC_4_ROUNDS(K5);
583  AES_ENC_4_ROUNDS(K6);
584  AES_ENC_4_ROUNDS(K7);
585  AES_ENC_4_ROUNDS(K8);
586  AES_ENC_4_ROUNDS(K9);
587  AES_ENC_4_ROUNDS(K10);
588  AES_ENC_4_ROUNDS(K11);
589  AES_ENC_4_ROUNDS(K12);
590  AES_ENC_4_ROUNDS(K13);
592 
593  _mm_storeu_si128(out_mm + 0, B0);
594  _mm_storeu_si128(out_mm + 1, B1);
595  _mm_storeu_si128(out_mm + 2, B2);
596  _mm_storeu_si128(out_mm + 3, B3);
597 
598  blocks -= 4;
599  in_mm += 4;
600  out_mm += 4;
601  }
602 
603  for(size_t i = 0; i != blocks; ++i)
604  {
605  __m128i B = _mm_loadu_si128(in_mm + i);
606 
607  B = _mm_xor_si128(B, K0);
608 
609  B = _mm_aesenc_si128(B, K1);
610  B = _mm_aesenc_si128(B, K2);
611  B = _mm_aesenc_si128(B, K3);
612  B = _mm_aesenc_si128(B, K4);
613  B = _mm_aesenc_si128(B, K5);
614  B = _mm_aesenc_si128(B, K6);
615  B = _mm_aesenc_si128(B, K7);
616  B = _mm_aesenc_si128(B, K8);
617  B = _mm_aesenc_si128(B, K9);
618  B = _mm_aesenc_si128(B, K10);
619  B = _mm_aesenc_si128(B, K11);
620  B = _mm_aesenc_si128(B, K12);
621  B = _mm_aesenc_si128(B, K13);
622  B = _mm_aesenclast_si128(B, K14);
623 
624  _mm_storeu_si128(out_mm + i, B);
625  }
626  }
627 
628 /*
629 * AES-256 Decryption
630 */
631 BOTAN_FUNC_ISA("ssse3,aes")
632 void AES_256::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
633  {
634  BOTAN_ASSERT(m_DK.empty() == false, "Key was set");
635 
636  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
637  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
638 
639  const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
640 
641  const __m128i K0 = _mm_loadu_si128(key_mm);
642  const __m128i K1 = _mm_loadu_si128(key_mm + 1);
643  const __m128i K2 = _mm_loadu_si128(key_mm + 2);
644  const __m128i K3 = _mm_loadu_si128(key_mm + 3);
645  const __m128i K4 = _mm_loadu_si128(key_mm + 4);
646  const __m128i K5 = _mm_loadu_si128(key_mm + 5);
647  const __m128i K6 = _mm_loadu_si128(key_mm + 6);
648  const __m128i K7 = _mm_loadu_si128(key_mm + 7);
649  const __m128i K8 = _mm_loadu_si128(key_mm + 8);
650  const __m128i K9 = _mm_loadu_si128(key_mm + 9);
651  const __m128i K10 = _mm_loadu_si128(key_mm + 10);
652  const __m128i K11 = _mm_loadu_si128(key_mm + 11);
653  const __m128i K12 = _mm_loadu_si128(key_mm + 12);
654  const __m128i K13 = _mm_loadu_si128(key_mm + 13);
655  const __m128i K14 = _mm_loadu_si128(key_mm + 14);
656 
657  while(blocks >= 4)
658  {
659  __m128i B0 = _mm_loadu_si128(in_mm + 0);
660  __m128i B1 = _mm_loadu_si128(in_mm + 1);
661  __m128i B2 = _mm_loadu_si128(in_mm + 2);
662  __m128i B3 = _mm_loadu_si128(in_mm + 3);
663 
664  B0 = _mm_xor_si128(B0, K0);
665  B1 = _mm_xor_si128(B1, K0);
666  B2 = _mm_xor_si128(B2, K0);
667  B3 = _mm_xor_si128(B3, K0);
668 
669  AES_DEC_4_ROUNDS(K1);
670  AES_DEC_4_ROUNDS(K2);
671  AES_DEC_4_ROUNDS(K3);
672  AES_DEC_4_ROUNDS(K4);
673  AES_DEC_4_ROUNDS(K5);
674  AES_DEC_4_ROUNDS(K6);
675  AES_DEC_4_ROUNDS(K7);
676  AES_DEC_4_ROUNDS(K8);
677  AES_DEC_4_ROUNDS(K9);
678  AES_DEC_4_ROUNDS(K10);
679  AES_DEC_4_ROUNDS(K11);
680  AES_DEC_4_ROUNDS(K12);
681  AES_DEC_4_ROUNDS(K13);
683 
684  _mm_storeu_si128(out_mm + 0, B0);
685  _mm_storeu_si128(out_mm + 1, B1);
686  _mm_storeu_si128(out_mm + 2, B2);
687  _mm_storeu_si128(out_mm + 3, B3);
688 
689  blocks -= 4;
690  in_mm += 4;
691  out_mm += 4;
692  }
693 
694  for(size_t i = 0; i != blocks; ++i)
695  {
696  __m128i B = _mm_loadu_si128(in_mm + i);
697 
698  B = _mm_xor_si128(B, K0);
699 
700  B = _mm_aesdec_si128(B, K1);
701  B = _mm_aesdec_si128(B, K2);
702  B = _mm_aesdec_si128(B, K3);
703  B = _mm_aesdec_si128(B, K4);
704  B = _mm_aesdec_si128(B, K5);
705  B = _mm_aesdec_si128(B, K6);
706  B = _mm_aesdec_si128(B, K7);
707  B = _mm_aesdec_si128(B, K8);
708  B = _mm_aesdec_si128(B, K9);
709  B = _mm_aesdec_si128(B, K10);
710  B = _mm_aesdec_si128(B, K11);
711  B = _mm_aesdec_si128(B, K12);
712  B = _mm_aesdec_si128(B, K13);
713  B = _mm_aesdeclast_si128(B, K14);
714 
715  _mm_storeu_si128(out_mm + i, B);
716  }
717  }
718 
719 /*
720 * AES-256 Key Schedule
721 */
722 BOTAN_FUNC_ISA("ssse3,aes")
723 void AES_256::aesni_key_schedule(const uint8_t key[], size_t)
724  {
725  m_EK.resize(60);
726  m_DK.resize(60);
727 
728  const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
729  const __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16));
730 
731  const __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01));
732  const __m128i K3 = aes_256_key_expansion(K1, K2);
733 
734  const __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02));
735  const __m128i K5 = aes_256_key_expansion(K3, K4);
736 
737  const __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04));
738  const __m128i K7 = aes_256_key_expansion(K5, K6);
739 
740  const __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08));
741  const __m128i K9 = aes_256_key_expansion(K7, K8);
742 
743  const __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10));
744  const __m128i K11 = aes_256_key_expansion(K9, K10);
745 
746  const __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20));
747  const __m128i K13 = aes_256_key_expansion(K11, K12);
748 
749  const __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40));
750 
751  __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
752  _mm_storeu_si128(EK_mm , K0);
753  _mm_storeu_si128(EK_mm + 1, K1);
754  _mm_storeu_si128(EK_mm + 2, K2);
755  _mm_storeu_si128(EK_mm + 3, K3);
756  _mm_storeu_si128(EK_mm + 4, K4);
757  _mm_storeu_si128(EK_mm + 5, K5);
758  _mm_storeu_si128(EK_mm + 6, K6);
759  _mm_storeu_si128(EK_mm + 7, K7);
760  _mm_storeu_si128(EK_mm + 8, K8);
761  _mm_storeu_si128(EK_mm + 9, K9);
762  _mm_storeu_si128(EK_mm + 10, K10);
763  _mm_storeu_si128(EK_mm + 11, K11);
764  _mm_storeu_si128(EK_mm + 12, K12);
765  _mm_storeu_si128(EK_mm + 13, K13);
766  _mm_storeu_si128(EK_mm + 14, K14);
767 
768  // Now generate decryption keys
769  __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
770  _mm_storeu_si128(DK_mm , K14);
771  _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13));
772  _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12));
773  _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11));
774  _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10));
775  _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9));
776  _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8));
777  _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7));
778  _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6));
779  _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5));
780  _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
781  _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
782  _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
783  _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
784  _mm_storeu_si128(DK_mm + 14, K0);
785  }
786 
787 #undef AES_ENC_4_ROUNDS
788 #undef AES_ENC_4_LAST_ROUNDS
789 #undef AES_DEC_4_ROUNDS
790 #undef AES_DEC_4_LAST_ROUNDS
791 
792 }
#define AES_DEC_4_LAST_ROUNDS(K)
Definition: aes_ni.cpp:97
#define BOTAN_ASSERT(expr, assertion_made)
Definition: assert.h:43
#define AES_192_key_exp(RCON, EK_OFF)
#define AES_ENC_4_ROUNDS(K)
Definition: aes_ni.cpp:70
#define AES_DEC_4_ROUNDS(K)
Definition: aes_ni.cpp:88
#define BOTAN_FUNC_ISA(isa)
Definition: compiler.h:75
T load_le(const uint8_t in[], size_t off)
Definition: loadstor.h:121
Definition: alg_id.cpp:13
#define AES_ENC_4_LAST_ROUNDS(K)
Definition: aes_ni.cpp:79
#define AES_128_key_exp(K, RCON)