Botan  2.10.0
Crypto and TLS for C++11
aes_ni.cpp
Go to the documentation of this file.
1 /*
2 * AES using AES-NI instructions
3 * (C) 2009,2012 Jack Lloyd
4 *
5 * Botan is released under the Simplified BSD License (see license.txt)
6 */
7 
8 #include <botan/aes.h>
9 #include <botan/loadstor.h>
10 #include <wmmintrin.h>
11 
12 namespace Botan {
13 
14 namespace {
15 
16 BOTAN_FUNC_ISA("ssse3")
17 __m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
18  {
19  key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3));
20  key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
21  key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
22  key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
23  return _mm_xor_si128(key, key_with_rcon);
24  }
25 
26 BOTAN_FUNC_ISA("ssse3")
27 void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
28  uint32_t out[], bool last)
29  {
30  __m128i key1 = *K1;
31  __m128i key2 = *K2;
32 
33  key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));
34  key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
35  key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
36  key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
37  key1 = _mm_xor_si128(key1, key2_with_rcon);
38 
39  *K1 = key1;
40  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), key1);
41 
42  if(last)
43  return;
44 
45  key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
46  key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3)));
47 
48  *K2 = key2;
49  out[4] = _mm_cvtsi128_si32(key2);
50  out[5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
51  }
52 
53 /*
54 * The second half of the AES-256 key expansion (other half same as AES-128)
55 */
56 BOTAN_FUNC_ISA("ssse3,aes")
57 __m128i aes_256_key_expansion(__m128i key, __m128i key2)
58  {
59  __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
60  key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2,2,2,2));
61 
62  key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
63  key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
64  key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
65  return _mm_xor_si128(key, key_with_rcon);
66  }
67 
68 }
69 
/*
* Helpers for the four-blocks-at-a-time code paths. Each macro applies one
* AES round instruction with round key RK to the variables B0, B1, B2 and
* B3, which must be declared at the point of expansion.
*/

/* One regular encryption round on B0..B3 */
#define AES_ENC_4_ROUNDS(RK)                    \
   do                                           \
      {                                         \
      B0 = _mm_aesenc_si128(B0, RK);            \
      B1 = _mm_aesenc_si128(B1, RK);            \
      B2 = _mm_aesenc_si128(B2, RK);            \
      B3 = _mm_aesenc_si128(B3, RK);            \
      } while(0)

/* Final encryption round on B0..B3 */
#define AES_ENC_4_LAST_ROUNDS(RK)               \
   do                                           \
      {                                         \
      B0 = _mm_aesenclast_si128(B0, RK);        \
      B1 = _mm_aesenclast_si128(B1, RK);        \
      B2 = _mm_aesenclast_si128(B2, RK);        \
      B3 = _mm_aesenclast_si128(B3, RK);        \
      } while(0)

/* One regular decryption round on B0..B3 */
#define AES_DEC_4_ROUNDS(RK)                    \
   do                                           \
      {                                         \
      B0 = _mm_aesdec_si128(B0, RK);            \
      B1 = _mm_aesdec_si128(B1, RK);            \
      B2 = _mm_aesdec_si128(B2, RK);            \
      B3 = _mm_aesdec_si128(B3, RK);            \
      } while(0)

/* Final decryption round on B0..B3 */
#define AES_DEC_4_LAST_ROUNDS(RK)               \
   do                                           \
      {                                         \
      B0 = _mm_aesdeclast_si128(B0, RK);        \
      B1 = _mm_aesdeclast_si128(B1, RK);        \
      B2 = _mm_aesdeclast_si128(B2, RK);        \
      B3 = _mm_aesdeclast_si128(B3, RK);        \
      } while(0)
105 
106 /*
107 * AES-128 Encryption
108 */
109 BOTAN_FUNC_ISA("ssse3,aes")
110 void AES_128::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
111  {
112  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
113  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
114 
115  const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
116 
117  const __m128i K0 = _mm_loadu_si128(key_mm);
118  const __m128i K1 = _mm_loadu_si128(key_mm + 1);
119  const __m128i K2 = _mm_loadu_si128(key_mm + 2);
120  const __m128i K3 = _mm_loadu_si128(key_mm + 3);
121  const __m128i K4 = _mm_loadu_si128(key_mm + 4);
122  const __m128i K5 = _mm_loadu_si128(key_mm + 5);
123  const __m128i K6 = _mm_loadu_si128(key_mm + 6);
124  const __m128i K7 = _mm_loadu_si128(key_mm + 7);
125  const __m128i K8 = _mm_loadu_si128(key_mm + 8);
126  const __m128i K9 = _mm_loadu_si128(key_mm + 9);
127  const __m128i K10 = _mm_loadu_si128(key_mm + 10);
128 
129  while(blocks >= 4)
130  {
131  __m128i B0 = _mm_loadu_si128(in_mm + 0);
132  __m128i B1 = _mm_loadu_si128(in_mm + 1);
133  __m128i B2 = _mm_loadu_si128(in_mm + 2);
134  __m128i B3 = _mm_loadu_si128(in_mm + 3);
135 
136  B0 = _mm_xor_si128(B0, K0);
137  B1 = _mm_xor_si128(B1, K0);
138  B2 = _mm_xor_si128(B2, K0);
139  B3 = _mm_xor_si128(B3, K0);
140 
141  AES_ENC_4_ROUNDS(K1);
142  AES_ENC_4_ROUNDS(K2);
143  AES_ENC_4_ROUNDS(K3);
144  AES_ENC_4_ROUNDS(K4);
145  AES_ENC_4_ROUNDS(K5);
146  AES_ENC_4_ROUNDS(K6);
147  AES_ENC_4_ROUNDS(K7);
148  AES_ENC_4_ROUNDS(K8);
149  AES_ENC_4_ROUNDS(K9);
151 
152  _mm_storeu_si128(out_mm + 0, B0);
153  _mm_storeu_si128(out_mm + 1, B1);
154  _mm_storeu_si128(out_mm + 2, B2);
155  _mm_storeu_si128(out_mm + 3, B3);
156 
157  blocks -= 4;
158  in_mm += 4;
159  out_mm += 4;
160  }
161 
162  for(size_t i = 0; i != blocks; ++i)
163  {
164  __m128i B = _mm_loadu_si128(in_mm + i);
165 
166  B = _mm_xor_si128(B, K0);
167 
168  B = _mm_aesenc_si128(B, K1);
169  B = _mm_aesenc_si128(B, K2);
170  B = _mm_aesenc_si128(B, K3);
171  B = _mm_aesenc_si128(B, K4);
172  B = _mm_aesenc_si128(B, K5);
173  B = _mm_aesenc_si128(B, K6);
174  B = _mm_aesenc_si128(B, K7);
175  B = _mm_aesenc_si128(B, K8);
176  B = _mm_aesenc_si128(B, K9);
177  B = _mm_aesenclast_si128(B, K10);
178 
179  _mm_storeu_si128(out_mm + i, B);
180  }
181  }
182 
183 /*
184 * AES-128 Decryption
185 */
186 BOTAN_FUNC_ISA("ssse3,aes")
187 void AES_128::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
188  {
189  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
190  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
191 
192  const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
193 
194  const __m128i K0 = _mm_loadu_si128(key_mm);
195  const __m128i K1 = _mm_loadu_si128(key_mm + 1);
196  const __m128i K2 = _mm_loadu_si128(key_mm + 2);
197  const __m128i K3 = _mm_loadu_si128(key_mm + 3);
198  const __m128i K4 = _mm_loadu_si128(key_mm + 4);
199  const __m128i K5 = _mm_loadu_si128(key_mm + 5);
200  const __m128i K6 = _mm_loadu_si128(key_mm + 6);
201  const __m128i K7 = _mm_loadu_si128(key_mm + 7);
202  const __m128i K8 = _mm_loadu_si128(key_mm + 8);
203  const __m128i K9 = _mm_loadu_si128(key_mm + 9);
204  const __m128i K10 = _mm_loadu_si128(key_mm + 10);
205 
206  while(blocks >= 4)
207  {
208  __m128i B0 = _mm_loadu_si128(in_mm + 0);
209  __m128i B1 = _mm_loadu_si128(in_mm + 1);
210  __m128i B2 = _mm_loadu_si128(in_mm + 2);
211  __m128i B3 = _mm_loadu_si128(in_mm + 3);
212 
213  B0 = _mm_xor_si128(B0, K0);
214  B1 = _mm_xor_si128(B1, K0);
215  B2 = _mm_xor_si128(B2, K0);
216  B3 = _mm_xor_si128(B3, K0);
217 
218  AES_DEC_4_ROUNDS(K1);
219  AES_DEC_4_ROUNDS(K2);
220  AES_DEC_4_ROUNDS(K3);
221  AES_DEC_4_ROUNDS(K4);
222  AES_DEC_4_ROUNDS(K5);
223  AES_DEC_4_ROUNDS(K6);
224  AES_DEC_4_ROUNDS(K7);
225  AES_DEC_4_ROUNDS(K8);
226  AES_DEC_4_ROUNDS(K9);
228 
229  _mm_storeu_si128(out_mm + 0, B0);
230  _mm_storeu_si128(out_mm + 1, B1);
231  _mm_storeu_si128(out_mm + 2, B2);
232  _mm_storeu_si128(out_mm + 3, B3);
233 
234  blocks -= 4;
235  in_mm += 4;
236  out_mm += 4;
237  }
238 
239  for(size_t i = 0; i != blocks; ++i)
240  {
241  __m128i B = _mm_loadu_si128(in_mm + i);
242 
243  B = _mm_xor_si128(B, K0);
244 
245  B = _mm_aesdec_si128(B, K1);
246  B = _mm_aesdec_si128(B, K2);
247  B = _mm_aesdec_si128(B, K3);
248  B = _mm_aesdec_si128(B, K4);
249  B = _mm_aesdec_si128(B, K5);
250  B = _mm_aesdec_si128(B, K6);
251  B = _mm_aesdec_si128(B, K7);
252  B = _mm_aesdec_si128(B, K8);
253  B = _mm_aesdec_si128(B, K9);
254  B = _mm_aesdeclast_si128(B, K10);
255 
256  _mm_storeu_si128(out_mm + i, B);
257  }
258  }
259 
260 /*
261 * AES-128 Key Schedule
262 */
263 BOTAN_FUNC_ISA("ssse3,aes")
264 void AES_128::aesni_key_schedule(const uint8_t key[], size_t)
265  {
266  m_EK.resize(44);
267  m_DK.resize(44);
268 
269  #define AES_128_key_exp(K, RCON) \
270  aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON))
271 
272  const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
273  const __m128i K1 = AES_128_key_exp(K0, 0x01);
274  const __m128i K2 = AES_128_key_exp(K1, 0x02);
275  const __m128i K3 = AES_128_key_exp(K2, 0x04);
276  const __m128i K4 = AES_128_key_exp(K3, 0x08);
277  const __m128i K5 = AES_128_key_exp(K4, 0x10);
278  const __m128i K6 = AES_128_key_exp(K5, 0x20);
279  const __m128i K7 = AES_128_key_exp(K6, 0x40);
280  const __m128i K8 = AES_128_key_exp(K7, 0x80);
281  const __m128i K9 = AES_128_key_exp(K8, 0x1B);
282  const __m128i K10 = AES_128_key_exp(K9, 0x36);
283 
284  __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
285  _mm_storeu_si128(EK_mm , K0);
286  _mm_storeu_si128(EK_mm + 1, K1);
287  _mm_storeu_si128(EK_mm + 2, K2);
288  _mm_storeu_si128(EK_mm + 3, K3);
289  _mm_storeu_si128(EK_mm + 4, K4);
290  _mm_storeu_si128(EK_mm + 5, K5);
291  _mm_storeu_si128(EK_mm + 6, K6);
292  _mm_storeu_si128(EK_mm + 7, K7);
293  _mm_storeu_si128(EK_mm + 8, K8);
294  _mm_storeu_si128(EK_mm + 9, K9);
295  _mm_storeu_si128(EK_mm + 10, K10);
296 
297  // Now generate decryption keys
298 
299  __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
300  _mm_storeu_si128(DK_mm , K10);
301  _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9));
302  _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8));
303  _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7));
304  _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6));
305  _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5));
306  _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4));
307  _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3));
308  _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2));
309  _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1));
310  _mm_storeu_si128(DK_mm + 10, K0);
311  }
312 
313 /*
314 * AES-192 Encryption
315 */
316 BOTAN_FUNC_ISA("ssse3,aes")
317 void AES_192::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
318  {
319  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
320  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
321 
322  const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
323 
324  const __m128i K0 = _mm_loadu_si128(key_mm);
325  const __m128i K1 = _mm_loadu_si128(key_mm + 1);
326  const __m128i K2 = _mm_loadu_si128(key_mm + 2);
327  const __m128i K3 = _mm_loadu_si128(key_mm + 3);
328  const __m128i K4 = _mm_loadu_si128(key_mm + 4);
329  const __m128i K5 = _mm_loadu_si128(key_mm + 5);
330  const __m128i K6 = _mm_loadu_si128(key_mm + 6);
331  const __m128i K7 = _mm_loadu_si128(key_mm + 7);
332  const __m128i K8 = _mm_loadu_si128(key_mm + 8);
333  const __m128i K9 = _mm_loadu_si128(key_mm + 9);
334  const __m128i K10 = _mm_loadu_si128(key_mm + 10);
335  const __m128i K11 = _mm_loadu_si128(key_mm + 11);
336  const __m128i K12 = _mm_loadu_si128(key_mm + 12);
337 
338  while(blocks >= 4)
339  {
340  __m128i B0 = _mm_loadu_si128(in_mm + 0);
341  __m128i B1 = _mm_loadu_si128(in_mm + 1);
342  __m128i B2 = _mm_loadu_si128(in_mm + 2);
343  __m128i B3 = _mm_loadu_si128(in_mm + 3);
344 
345  B0 = _mm_xor_si128(B0, K0);
346  B1 = _mm_xor_si128(B1, K0);
347  B2 = _mm_xor_si128(B2, K0);
348  B3 = _mm_xor_si128(B3, K0);
349 
350  AES_ENC_4_ROUNDS(K1);
351  AES_ENC_4_ROUNDS(K2);
352  AES_ENC_4_ROUNDS(K3);
353  AES_ENC_4_ROUNDS(K4);
354  AES_ENC_4_ROUNDS(K5);
355  AES_ENC_4_ROUNDS(K6);
356  AES_ENC_4_ROUNDS(K7);
357  AES_ENC_4_ROUNDS(K8);
358  AES_ENC_4_ROUNDS(K9);
359  AES_ENC_4_ROUNDS(K10);
360  AES_ENC_4_ROUNDS(K11);
362 
363  _mm_storeu_si128(out_mm + 0, B0);
364  _mm_storeu_si128(out_mm + 1, B1);
365  _mm_storeu_si128(out_mm + 2, B2);
366  _mm_storeu_si128(out_mm + 3, B3);
367 
368  blocks -= 4;
369  in_mm += 4;
370  out_mm += 4;
371  }
372 
373  for(size_t i = 0; i != blocks; ++i)
374  {
375  __m128i B = _mm_loadu_si128(in_mm + i);
376 
377  B = _mm_xor_si128(B, K0);
378 
379  B = _mm_aesenc_si128(B, K1);
380  B = _mm_aesenc_si128(B, K2);
381  B = _mm_aesenc_si128(B, K3);
382  B = _mm_aesenc_si128(B, K4);
383  B = _mm_aesenc_si128(B, K5);
384  B = _mm_aesenc_si128(B, K6);
385  B = _mm_aesenc_si128(B, K7);
386  B = _mm_aesenc_si128(B, K8);
387  B = _mm_aesenc_si128(B, K9);
388  B = _mm_aesenc_si128(B, K10);
389  B = _mm_aesenc_si128(B, K11);
390  B = _mm_aesenclast_si128(B, K12);
391 
392  _mm_storeu_si128(out_mm + i, B);
393  }
394  }
395 
396 /*
397 * AES-192 Decryption
398 */
399 BOTAN_FUNC_ISA("ssse3,aes")
400 void AES_192::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
401  {
402  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
403  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
404 
405  const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
406 
407  const __m128i K0 = _mm_loadu_si128(key_mm);
408  const __m128i K1 = _mm_loadu_si128(key_mm + 1);
409  const __m128i K2 = _mm_loadu_si128(key_mm + 2);
410  const __m128i K3 = _mm_loadu_si128(key_mm + 3);
411  const __m128i K4 = _mm_loadu_si128(key_mm + 4);
412  const __m128i K5 = _mm_loadu_si128(key_mm + 5);
413  const __m128i K6 = _mm_loadu_si128(key_mm + 6);
414  const __m128i K7 = _mm_loadu_si128(key_mm + 7);
415  const __m128i K8 = _mm_loadu_si128(key_mm + 8);
416  const __m128i K9 = _mm_loadu_si128(key_mm + 9);
417  const __m128i K10 = _mm_loadu_si128(key_mm + 10);
418  const __m128i K11 = _mm_loadu_si128(key_mm + 11);
419  const __m128i K12 = _mm_loadu_si128(key_mm + 12);
420 
421  while(blocks >= 4)
422  {
423  __m128i B0 = _mm_loadu_si128(in_mm + 0);
424  __m128i B1 = _mm_loadu_si128(in_mm + 1);
425  __m128i B2 = _mm_loadu_si128(in_mm + 2);
426  __m128i B3 = _mm_loadu_si128(in_mm + 3);
427 
428  B0 = _mm_xor_si128(B0, K0);
429  B1 = _mm_xor_si128(B1, K0);
430  B2 = _mm_xor_si128(B2, K0);
431  B3 = _mm_xor_si128(B3, K0);
432 
433  AES_DEC_4_ROUNDS(K1);
434  AES_DEC_4_ROUNDS(K2);
435  AES_DEC_4_ROUNDS(K3);
436  AES_DEC_4_ROUNDS(K4);
437  AES_DEC_4_ROUNDS(K5);
438  AES_DEC_4_ROUNDS(K6);
439  AES_DEC_4_ROUNDS(K7);
440  AES_DEC_4_ROUNDS(K8);
441  AES_DEC_4_ROUNDS(K9);
442  AES_DEC_4_ROUNDS(K10);
443  AES_DEC_4_ROUNDS(K11);
445 
446  _mm_storeu_si128(out_mm + 0, B0);
447  _mm_storeu_si128(out_mm + 1, B1);
448  _mm_storeu_si128(out_mm + 2, B2);
449  _mm_storeu_si128(out_mm + 3, B3);
450 
451  blocks -= 4;
452  in_mm += 4;
453  out_mm += 4;
454  }
455 
456  for(size_t i = 0; i != blocks; ++i)
457  {
458  __m128i B = _mm_loadu_si128(in_mm + i);
459 
460  B = _mm_xor_si128(B, K0);
461 
462  B = _mm_aesdec_si128(B, K1);
463  B = _mm_aesdec_si128(B, K2);
464  B = _mm_aesdec_si128(B, K3);
465  B = _mm_aesdec_si128(B, K4);
466  B = _mm_aesdec_si128(B, K5);
467  B = _mm_aesdec_si128(B, K6);
468  B = _mm_aesdec_si128(B, K7);
469  B = _mm_aesdec_si128(B, K8);
470  B = _mm_aesdec_si128(B, K9);
471  B = _mm_aesdec_si128(B, K10);
472  B = _mm_aesdec_si128(B, K11);
473  B = _mm_aesdeclast_si128(B, K12);
474 
475  _mm_storeu_si128(out_mm + i, B);
476  }
477  }
478 
479 /*
480 * AES-192 Key Schedule
481 */
482 BOTAN_FUNC_ISA("ssse3,aes")
483 void AES_192::aesni_key_schedule(const uint8_t key[], size_t)
484  {
485  m_EK.resize(52);
486  m_DK.resize(52);
487 
488  __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
489  __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8));
490  K1 = _mm_srli_si128(K1, 8);
491 
492  load_le(m_EK.data(), key, 6);
493 
494  #define AES_192_key_exp(RCON, EK_OFF) \
495  aes_192_key_expansion(&K0, &K1, \
496  _mm_aeskeygenassist_si128(K1, RCON), \
497  &m_EK[EK_OFF], EK_OFF == 48)
498 
499  AES_192_key_exp(0x01, 6);
500  AES_192_key_exp(0x02, 12);
501  AES_192_key_exp(0x04, 18);
502  AES_192_key_exp(0x08, 24);
503  AES_192_key_exp(0x10, 30);
504  AES_192_key_exp(0x20, 36);
505  AES_192_key_exp(0x40, 42);
506  AES_192_key_exp(0x80, 48);
507 
508  #undef AES_192_key_exp
509 
510  // Now generate decryption keys
511  const __m128i* EK_mm = reinterpret_cast<const __m128i*>(m_EK.data());
512 
513  __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
514  _mm_storeu_si128(DK_mm , _mm_loadu_si128(EK_mm + 12));
515  _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
516  _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
517  _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
518  _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
519  _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
520  _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
521  _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
522  _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
523  _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
524  _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
525  _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
526  _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
527  }
528 
529 /*
530 * AES-256 Encryption
531 */
532 BOTAN_FUNC_ISA("ssse3,aes")
533 void AES_256::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
534  {
535  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
536  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
537 
538  const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
539 
540  const __m128i K0 = _mm_loadu_si128(key_mm);
541  const __m128i K1 = _mm_loadu_si128(key_mm + 1);
542  const __m128i K2 = _mm_loadu_si128(key_mm + 2);
543  const __m128i K3 = _mm_loadu_si128(key_mm + 3);
544  const __m128i K4 = _mm_loadu_si128(key_mm + 4);
545  const __m128i K5 = _mm_loadu_si128(key_mm + 5);
546  const __m128i K6 = _mm_loadu_si128(key_mm + 6);
547  const __m128i K7 = _mm_loadu_si128(key_mm + 7);
548  const __m128i K8 = _mm_loadu_si128(key_mm + 8);
549  const __m128i K9 = _mm_loadu_si128(key_mm + 9);
550  const __m128i K10 = _mm_loadu_si128(key_mm + 10);
551  const __m128i K11 = _mm_loadu_si128(key_mm + 11);
552  const __m128i K12 = _mm_loadu_si128(key_mm + 12);
553  const __m128i K13 = _mm_loadu_si128(key_mm + 13);
554  const __m128i K14 = _mm_loadu_si128(key_mm + 14);
555 
556  while(blocks >= 4)
557  {
558  __m128i B0 = _mm_loadu_si128(in_mm + 0);
559  __m128i B1 = _mm_loadu_si128(in_mm + 1);
560  __m128i B2 = _mm_loadu_si128(in_mm + 2);
561  __m128i B3 = _mm_loadu_si128(in_mm + 3);
562 
563  B0 = _mm_xor_si128(B0, K0);
564  B1 = _mm_xor_si128(B1, K0);
565  B2 = _mm_xor_si128(B2, K0);
566  B3 = _mm_xor_si128(B3, K0);
567 
568  AES_ENC_4_ROUNDS(K1);
569  AES_ENC_4_ROUNDS(K2);
570  AES_ENC_4_ROUNDS(K3);
571  AES_ENC_4_ROUNDS(K4);
572  AES_ENC_4_ROUNDS(K5);
573  AES_ENC_4_ROUNDS(K6);
574  AES_ENC_4_ROUNDS(K7);
575  AES_ENC_4_ROUNDS(K8);
576  AES_ENC_4_ROUNDS(K9);
577  AES_ENC_4_ROUNDS(K10);
578  AES_ENC_4_ROUNDS(K11);
579  AES_ENC_4_ROUNDS(K12);
580  AES_ENC_4_ROUNDS(K13);
582 
583  _mm_storeu_si128(out_mm + 0, B0);
584  _mm_storeu_si128(out_mm + 1, B1);
585  _mm_storeu_si128(out_mm + 2, B2);
586  _mm_storeu_si128(out_mm + 3, B3);
587 
588  blocks -= 4;
589  in_mm += 4;
590  out_mm += 4;
591  }
592 
593  for(size_t i = 0; i != blocks; ++i)
594  {
595  __m128i B = _mm_loadu_si128(in_mm + i);
596 
597  B = _mm_xor_si128(B, K0);
598 
599  B = _mm_aesenc_si128(B, K1);
600  B = _mm_aesenc_si128(B, K2);
601  B = _mm_aesenc_si128(B, K3);
602  B = _mm_aesenc_si128(B, K4);
603  B = _mm_aesenc_si128(B, K5);
604  B = _mm_aesenc_si128(B, K6);
605  B = _mm_aesenc_si128(B, K7);
606  B = _mm_aesenc_si128(B, K8);
607  B = _mm_aesenc_si128(B, K9);
608  B = _mm_aesenc_si128(B, K10);
609  B = _mm_aesenc_si128(B, K11);
610  B = _mm_aesenc_si128(B, K12);
611  B = _mm_aesenc_si128(B, K13);
612  B = _mm_aesenclast_si128(B, K14);
613 
614  _mm_storeu_si128(out_mm + i, B);
615  }
616  }
617 
618 /*
619 * AES-256 Decryption
620 */
621 BOTAN_FUNC_ISA("ssse3,aes")
622 void AES_256::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
623  {
624  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
625  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
626 
627  const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
628 
629  const __m128i K0 = _mm_loadu_si128(key_mm);
630  const __m128i K1 = _mm_loadu_si128(key_mm + 1);
631  const __m128i K2 = _mm_loadu_si128(key_mm + 2);
632  const __m128i K3 = _mm_loadu_si128(key_mm + 3);
633  const __m128i K4 = _mm_loadu_si128(key_mm + 4);
634  const __m128i K5 = _mm_loadu_si128(key_mm + 5);
635  const __m128i K6 = _mm_loadu_si128(key_mm + 6);
636  const __m128i K7 = _mm_loadu_si128(key_mm + 7);
637  const __m128i K8 = _mm_loadu_si128(key_mm + 8);
638  const __m128i K9 = _mm_loadu_si128(key_mm + 9);
639  const __m128i K10 = _mm_loadu_si128(key_mm + 10);
640  const __m128i K11 = _mm_loadu_si128(key_mm + 11);
641  const __m128i K12 = _mm_loadu_si128(key_mm + 12);
642  const __m128i K13 = _mm_loadu_si128(key_mm + 13);
643  const __m128i K14 = _mm_loadu_si128(key_mm + 14);
644 
645  while(blocks >= 4)
646  {
647  __m128i B0 = _mm_loadu_si128(in_mm + 0);
648  __m128i B1 = _mm_loadu_si128(in_mm + 1);
649  __m128i B2 = _mm_loadu_si128(in_mm + 2);
650  __m128i B3 = _mm_loadu_si128(in_mm + 3);
651 
652  B0 = _mm_xor_si128(B0, K0);
653  B1 = _mm_xor_si128(B1, K0);
654  B2 = _mm_xor_si128(B2, K0);
655  B3 = _mm_xor_si128(B3, K0);
656 
657  AES_DEC_4_ROUNDS(K1);
658  AES_DEC_4_ROUNDS(K2);
659  AES_DEC_4_ROUNDS(K3);
660  AES_DEC_4_ROUNDS(K4);
661  AES_DEC_4_ROUNDS(K5);
662  AES_DEC_4_ROUNDS(K6);
663  AES_DEC_4_ROUNDS(K7);
664  AES_DEC_4_ROUNDS(K8);
665  AES_DEC_4_ROUNDS(K9);
666  AES_DEC_4_ROUNDS(K10);
667  AES_DEC_4_ROUNDS(K11);
668  AES_DEC_4_ROUNDS(K12);
669  AES_DEC_4_ROUNDS(K13);
671 
672  _mm_storeu_si128(out_mm + 0, B0);
673  _mm_storeu_si128(out_mm + 1, B1);
674  _mm_storeu_si128(out_mm + 2, B2);
675  _mm_storeu_si128(out_mm + 3, B3);
676 
677  blocks -= 4;
678  in_mm += 4;
679  out_mm += 4;
680  }
681 
682  for(size_t i = 0; i != blocks; ++i)
683  {
684  __m128i B = _mm_loadu_si128(in_mm + i);
685 
686  B = _mm_xor_si128(B, K0);
687 
688  B = _mm_aesdec_si128(B, K1);
689  B = _mm_aesdec_si128(B, K2);
690  B = _mm_aesdec_si128(B, K3);
691  B = _mm_aesdec_si128(B, K4);
692  B = _mm_aesdec_si128(B, K5);
693  B = _mm_aesdec_si128(B, K6);
694  B = _mm_aesdec_si128(B, K7);
695  B = _mm_aesdec_si128(B, K8);
696  B = _mm_aesdec_si128(B, K9);
697  B = _mm_aesdec_si128(B, K10);
698  B = _mm_aesdec_si128(B, K11);
699  B = _mm_aesdec_si128(B, K12);
700  B = _mm_aesdec_si128(B, K13);
701  B = _mm_aesdeclast_si128(B, K14);
702 
703  _mm_storeu_si128(out_mm + i, B);
704  }
705  }
706 
707 /*
708 * AES-256 Key Schedule
709 */
710 BOTAN_FUNC_ISA("ssse3,aes")
711 void AES_256::aesni_key_schedule(const uint8_t key[], size_t)
712  {
713  m_EK.resize(60);
714  m_DK.resize(60);
715 
716  const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
717  const __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16));
718 
719  const __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01));
720  const __m128i K3 = aes_256_key_expansion(K1, K2);
721 
722  const __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02));
723  const __m128i K5 = aes_256_key_expansion(K3, K4);
724 
725  const __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04));
726  const __m128i K7 = aes_256_key_expansion(K5, K6);
727 
728  const __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08));
729  const __m128i K9 = aes_256_key_expansion(K7, K8);
730 
731  const __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10));
732  const __m128i K11 = aes_256_key_expansion(K9, K10);
733 
734  const __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20));
735  const __m128i K13 = aes_256_key_expansion(K11, K12);
736 
737  const __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40));
738 
739  __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
740  _mm_storeu_si128(EK_mm , K0);
741  _mm_storeu_si128(EK_mm + 1, K1);
742  _mm_storeu_si128(EK_mm + 2, K2);
743  _mm_storeu_si128(EK_mm + 3, K3);
744  _mm_storeu_si128(EK_mm + 4, K4);
745  _mm_storeu_si128(EK_mm + 5, K5);
746  _mm_storeu_si128(EK_mm + 6, K6);
747  _mm_storeu_si128(EK_mm + 7, K7);
748  _mm_storeu_si128(EK_mm + 8, K8);
749  _mm_storeu_si128(EK_mm + 9, K9);
750  _mm_storeu_si128(EK_mm + 10, K10);
751  _mm_storeu_si128(EK_mm + 11, K11);
752  _mm_storeu_si128(EK_mm + 12, K12);
753  _mm_storeu_si128(EK_mm + 13, K13);
754  _mm_storeu_si128(EK_mm + 14, K14);
755 
756  // Now generate decryption keys
757  __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
758  _mm_storeu_si128(DK_mm , K14);
759  _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13));
760  _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12));
761  _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11));
762  _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10));
763  _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9));
764  _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8));
765  _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7));
766  _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6));
767  _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5));
768  _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
769  _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
770  _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
771  _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
772  _mm_storeu_si128(DK_mm + 14, K0);
773  }
774 
775 #undef AES_ENC_4_ROUNDS
776 #undef AES_ENC_4_LAST_ROUNDS
777 #undef AES_DEC_4_ROUNDS
778 #undef AES_DEC_4_LAST_ROUNDS
779 
780 }
#define AES_DEC_4_LAST_ROUNDS(K)
Definition: aes_ni.cpp:97
void const uint8_t in[]
Definition: mgf1.h:26
#define AES_192_key_exp(RCON, EK_OFF)
const uint8_t uint8_t size_t blocks
Definition: ffi.h:686
#define AES_ENC_4_ROUNDS(K)
Definition: aes_ni.cpp:70
#define AES_DEC_4_ROUNDS(K)
Definition: aes_ni.cpp:88
#define BOTAN_FUNC_ISA(isa)
Definition: compiler.h:71
T load_le(const uint8_t in[], size_t off)
Definition: loadstor.h:121
Definition: alg_id.cpp:13
uint8_t out[]
Definition: pbkdf2.h:19
const uint8_t * key
Definition: ffi.h:359
#define AES_ENC_4_LAST_ROUNDS(K)
Definition: aes_ni.cpp:79
#define AES_128_key_exp(K, RCON)