Botan 2.19.1
Crypto and TLS for C++
aes_ni.cpp
Go to the documentation of this file.
1/*
2* AES using AES-NI instructions
3* (C) 2009,2012 Jack Lloyd
4*
5* Botan is released under the Simplified BSD License (see license.txt)
6*/
7
8#include <botan/aes.h>
9#include <botan/loadstor.h>
10#include <wmmintrin.h>
11
12namespace Botan {
13
14namespace {
15
16BOTAN_FUNC_ISA("ssse3")
17__m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
18 {
19 key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3));
20 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
21 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
22 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
23 return _mm_xor_si128(key, key_with_rcon);
24 }
25
26BOTAN_FUNC_ISA("ssse3")
27void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
28 uint32_t out[], bool last)
29 {
30 __m128i key1 = *K1;
31 __m128i key2 = *K2;
32
33 key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));
34 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
35 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
36 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
37 key1 = _mm_xor_si128(key1, key2_with_rcon);
38
39 *K1 = key1;
40 _mm_storeu_si128(reinterpret_cast<__m128i*>(out), key1);
41
42 if(last)
43 return;
44
45 key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
46 key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3)));
47
48 *K2 = key2;
49 out[4] = _mm_cvtsi128_si32(key2);
50 out[5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
51 }
52
53/*
54* The second half of the AES-256 key expansion (other half same as AES-128)
55*/
56BOTAN_FUNC_ISA("ssse3,aes")
57__m128i aes_256_key_expansion(__m128i key, __m128i key2)
58 {
59 __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
60 key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2,2,2,2));
61
62 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
63 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
64 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
65 return _mm_xor_si128(key, key_with_rcon);
66 }
67
68}
69
/*
* Apply one AES encryption round with round key RK to each of the four
* block variables B0, B1, B2 and B3, which must be in scope (as __m128i)
* where the macro is expanded.
*/
#define AES_ENC_4_ROUNDS(RK)                 \
   do                                        \
      {                                      \
      B0 = _mm_aesenc_si128(B0, RK);         \
      B1 = _mm_aesenc_si128(B1, RK);         \
      B2 = _mm_aesenc_si128(B2, RK);         \
      B3 = _mm_aesenc_si128(B3, RK);         \
      } while(0)
78
/*
* Apply the final AES encryption round with round key RK to each of the
* four block variables B0, B1, B2 and B3, which must be in scope (as
* __m128i) where the macro is expanded.
*/
#define AES_ENC_4_LAST_ROUNDS(RK)            \
   do                                        \
      {                                      \
      B0 = _mm_aesenclast_si128(B0, RK);     \
      B1 = _mm_aesenclast_si128(B1, RK);     \
      B2 = _mm_aesenclast_si128(B2, RK);     \
      B3 = _mm_aesenclast_si128(B3, RK);     \
      } while(0)
87
/*
* Apply one AES decryption round with round key RK to each of the four
* block variables B0, B1, B2 and B3, which must be in scope (as __m128i)
* where the macro is expanded.
*/
#define AES_DEC_4_ROUNDS(RK)                 \
   do                                        \
      {                                      \
      B0 = _mm_aesdec_si128(B0, RK);         \
      B1 = _mm_aesdec_si128(B1, RK);         \
      B2 = _mm_aesdec_si128(B2, RK);         \
      B3 = _mm_aesdec_si128(B3, RK);         \
      } while(0)
96
/*
* Apply the final AES decryption round with round key RK to each of the
* four block variables B0, B1, B2 and B3, which must be in scope (as
* __m128i) where the macro is expanded.
*/
#define AES_DEC_4_LAST_ROUNDS(RK)            \
   do                                        \
      {                                      \
      B0 = _mm_aesdeclast_si128(B0, RK);     \
      B1 = _mm_aesdeclast_si128(B1, RK);     \
      B2 = _mm_aesdeclast_si128(B2, RK);     \
      B3 = _mm_aesdeclast_si128(B3, RK);     \
      } while(0)
105
106/*
107* AES-128 Encryption
108*/
109BOTAN_FUNC_ISA("ssse3,aes")
110void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
111 {
112 const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
113 __m128i* out_mm = reinterpret_cast<__m128i*>(out);
114
115 const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
116
117 const __m128i K0 = _mm_loadu_si128(key_mm);
118 const __m128i K1 = _mm_loadu_si128(key_mm + 1);
119 const __m128i K2 = _mm_loadu_si128(key_mm + 2);
120 const __m128i K3 = _mm_loadu_si128(key_mm + 3);
121 const __m128i K4 = _mm_loadu_si128(key_mm + 4);
122 const __m128i K5 = _mm_loadu_si128(key_mm + 5);
123 const __m128i K6 = _mm_loadu_si128(key_mm + 6);
124 const __m128i K7 = _mm_loadu_si128(key_mm + 7);
125 const __m128i K8 = _mm_loadu_si128(key_mm + 8);
126 const __m128i K9 = _mm_loadu_si128(key_mm + 9);
127 const __m128i K10 = _mm_loadu_si128(key_mm + 10);
128
129 while(blocks >= 4)
130 {
131 __m128i B0 = _mm_loadu_si128(in_mm + 0);
132 __m128i B1 = _mm_loadu_si128(in_mm + 1);
133 __m128i B2 = _mm_loadu_si128(in_mm + 2);
134 __m128i B3 = _mm_loadu_si128(in_mm + 3);
135
136 B0 = _mm_xor_si128(B0, K0);
137 B1 = _mm_xor_si128(B1, K0);
138 B2 = _mm_xor_si128(B2, K0);
139 B3 = _mm_xor_si128(B3, K0);
140
151
152 _mm_storeu_si128(out_mm + 0, B0);
153 _mm_storeu_si128(out_mm + 1, B1);
154 _mm_storeu_si128(out_mm + 2, B2);
155 _mm_storeu_si128(out_mm + 3, B3);
156
157 blocks -= 4;
158 in_mm += 4;
159 out_mm += 4;
160 }
161
162 for(size_t i = 0; i != blocks; ++i)
163 {
164 __m128i B = _mm_loadu_si128(in_mm + i);
165
166 B = _mm_xor_si128(B, K0);
167
168 B = _mm_aesenc_si128(B, K1);
169 B = _mm_aesenc_si128(B, K2);
170 B = _mm_aesenc_si128(B, K3);
171 B = _mm_aesenc_si128(B, K4);
172 B = _mm_aesenc_si128(B, K5);
173 B = _mm_aesenc_si128(B, K6);
174 B = _mm_aesenc_si128(B, K7);
175 B = _mm_aesenc_si128(B, K8);
176 B = _mm_aesenc_si128(B, K9);
177 B = _mm_aesenclast_si128(B, K10);
178
179 _mm_storeu_si128(out_mm + i, B);
180 }
181 }
182
183/*
184* AES-128 Decryption
185*/
186BOTAN_FUNC_ISA("ssse3,aes")
187void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
188 {
189 const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
190 __m128i* out_mm = reinterpret_cast<__m128i*>(out);
191
192 const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
193
194 const __m128i K0 = _mm_loadu_si128(key_mm);
195 const __m128i K1 = _mm_loadu_si128(key_mm + 1);
196 const __m128i K2 = _mm_loadu_si128(key_mm + 2);
197 const __m128i K3 = _mm_loadu_si128(key_mm + 3);
198 const __m128i K4 = _mm_loadu_si128(key_mm + 4);
199 const __m128i K5 = _mm_loadu_si128(key_mm + 5);
200 const __m128i K6 = _mm_loadu_si128(key_mm + 6);
201 const __m128i K7 = _mm_loadu_si128(key_mm + 7);
202 const __m128i K8 = _mm_loadu_si128(key_mm + 8);
203 const __m128i K9 = _mm_loadu_si128(key_mm + 9);
204 const __m128i K10 = _mm_loadu_si128(key_mm + 10);
205
206 while(blocks >= 4)
207 {
208 __m128i B0 = _mm_loadu_si128(in_mm + 0);
209 __m128i B1 = _mm_loadu_si128(in_mm + 1);
210 __m128i B2 = _mm_loadu_si128(in_mm + 2);
211 __m128i B3 = _mm_loadu_si128(in_mm + 3);
212
213 B0 = _mm_xor_si128(B0, K0);
214 B1 = _mm_xor_si128(B1, K0);
215 B2 = _mm_xor_si128(B2, K0);
216 B3 = _mm_xor_si128(B3, K0);
217
228
229 _mm_storeu_si128(out_mm + 0, B0);
230 _mm_storeu_si128(out_mm + 1, B1);
231 _mm_storeu_si128(out_mm + 2, B2);
232 _mm_storeu_si128(out_mm + 3, B3);
233
234 blocks -= 4;
235 in_mm += 4;
236 out_mm += 4;
237 }
238
239 for(size_t i = 0; i != blocks; ++i)
240 {
241 __m128i B = _mm_loadu_si128(in_mm + i);
242
243 B = _mm_xor_si128(B, K0);
244
245 B = _mm_aesdec_si128(B, K1);
246 B = _mm_aesdec_si128(B, K2);
247 B = _mm_aesdec_si128(B, K3);
248 B = _mm_aesdec_si128(B, K4);
249 B = _mm_aesdec_si128(B, K5);
250 B = _mm_aesdec_si128(B, K6);
251 B = _mm_aesdec_si128(B, K7);
252 B = _mm_aesdec_si128(B, K8);
253 B = _mm_aesdec_si128(B, K9);
254 B = _mm_aesdeclast_si128(B, K10);
255
256 _mm_storeu_si128(out_mm + i, B);
257 }
258 }
259
260/*
261* AES-128 Key Schedule
262*/
263BOTAN_FUNC_ISA("ssse3,aes")
264void AES_128::aesni_key_schedule(const uint8_t key[], size_t)
265 {
266 m_EK.resize(44);
267 m_DK.resize(44);
268
269 #define AES_128_key_exp(K, RCON) \
270 aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON))
271
272 const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
273 const __m128i K1 = AES_128_key_exp(K0, 0x01);
274 const __m128i K2 = AES_128_key_exp(K1, 0x02);
275 const __m128i K3 = AES_128_key_exp(K2, 0x04);
276 const __m128i K4 = AES_128_key_exp(K3, 0x08);
277 const __m128i K5 = AES_128_key_exp(K4, 0x10);
278 const __m128i K6 = AES_128_key_exp(K5, 0x20);
279 const __m128i K7 = AES_128_key_exp(K6, 0x40);
280 const __m128i K8 = AES_128_key_exp(K7, 0x80);
281 const __m128i K9 = AES_128_key_exp(K8, 0x1B);
282 const __m128i K10 = AES_128_key_exp(K9, 0x36);
283
284 __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
285 _mm_storeu_si128(EK_mm , K0);
286 _mm_storeu_si128(EK_mm + 1, K1);
287 _mm_storeu_si128(EK_mm + 2, K2);
288 _mm_storeu_si128(EK_mm + 3, K3);
289 _mm_storeu_si128(EK_mm + 4, K4);
290 _mm_storeu_si128(EK_mm + 5, K5);
291 _mm_storeu_si128(EK_mm + 6, K6);
292 _mm_storeu_si128(EK_mm + 7, K7);
293 _mm_storeu_si128(EK_mm + 8, K8);
294 _mm_storeu_si128(EK_mm + 9, K9);
295 _mm_storeu_si128(EK_mm + 10, K10);
296
297 // Now generate decryption keys
298
299 __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
300 _mm_storeu_si128(DK_mm , K10);
301 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9));
302 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8));
303 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7));
304 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6));
305 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5));
306 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4));
307 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3));
308 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2));
309 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1));
310 _mm_storeu_si128(DK_mm + 10, K0);
311 }
312
313/*
314* AES-192 Encryption
315*/
316BOTAN_FUNC_ISA("ssse3,aes")
317void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
318 {
319 const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
320 __m128i* out_mm = reinterpret_cast<__m128i*>(out);
321
322 const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
323
324 const __m128i K0 = _mm_loadu_si128(key_mm);
325 const __m128i K1 = _mm_loadu_si128(key_mm + 1);
326 const __m128i K2 = _mm_loadu_si128(key_mm + 2);
327 const __m128i K3 = _mm_loadu_si128(key_mm + 3);
328 const __m128i K4 = _mm_loadu_si128(key_mm + 4);
329 const __m128i K5 = _mm_loadu_si128(key_mm + 5);
330 const __m128i K6 = _mm_loadu_si128(key_mm + 6);
331 const __m128i K7 = _mm_loadu_si128(key_mm + 7);
332 const __m128i K8 = _mm_loadu_si128(key_mm + 8);
333 const __m128i K9 = _mm_loadu_si128(key_mm + 9);
334 const __m128i K10 = _mm_loadu_si128(key_mm + 10);
335 const __m128i K11 = _mm_loadu_si128(key_mm + 11);
336 const __m128i K12 = _mm_loadu_si128(key_mm + 12);
337
338 while(blocks >= 4)
339 {
340 __m128i B0 = _mm_loadu_si128(in_mm + 0);
341 __m128i B1 = _mm_loadu_si128(in_mm + 1);
342 __m128i B2 = _mm_loadu_si128(in_mm + 2);
343 __m128i B3 = _mm_loadu_si128(in_mm + 3);
344
345 B0 = _mm_xor_si128(B0, K0);
346 B1 = _mm_xor_si128(B1, K0);
347 B2 = _mm_xor_si128(B2, K0);
348 B3 = _mm_xor_si128(B3, K0);
349
359 AES_ENC_4_ROUNDS(K10);
360 AES_ENC_4_ROUNDS(K11);
362
363 _mm_storeu_si128(out_mm + 0, B0);
364 _mm_storeu_si128(out_mm + 1, B1);
365 _mm_storeu_si128(out_mm + 2, B2);
366 _mm_storeu_si128(out_mm + 3, B3);
367
368 blocks -= 4;
369 in_mm += 4;
370 out_mm += 4;
371 }
372
373 for(size_t i = 0; i != blocks; ++i)
374 {
375 __m128i B = _mm_loadu_si128(in_mm + i);
376
377 B = _mm_xor_si128(B, K0);
378
379 B = _mm_aesenc_si128(B, K1);
380 B = _mm_aesenc_si128(B, K2);
381 B = _mm_aesenc_si128(B, K3);
382 B = _mm_aesenc_si128(B, K4);
383 B = _mm_aesenc_si128(B, K5);
384 B = _mm_aesenc_si128(B, K6);
385 B = _mm_aesenc_si128(B, K7);
386 B = _mm_aesenc_si128(B, K8);
387 B = _mm_aesenc_si128(B, K9);
388 B = _mm_aesenc_si128(B, K10);
389 B = _mm_aesenc_si128(B, K11);
390 B = _mm_aesenclast_si128(B, K12);
391
392 _mm_storeu_si128(out_mm + i, B);
393 }
394 }
395
396/*
397* AES-192 Decryption
398*/
399BOTAN_FUNC_ISA("ssse3,aes")
400void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
401 {
402 const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
403 __m128i* out_mm = reinterpret_cast<__m128i*>(out);
404
405 const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
406
407 const __m128i K0 = _mm_loadu_si128(key_mm);
408 const __m128i K1 = _mm_loadu_si128(key_mm + 1);
409 const __m128i K2 = _mm_loadu_si128(key_mm + 2);
410 const __m128i K3 = _mm_loadu_si128(key_mm + 3);
411 const __m128i K4 = _mm_loadu_si128(key_mm + 4);
412 const __m128i K5 = _mm_loadu_si128(key_mm + 5);
413 const __m128i K6 = _mm_loadu_si128(key_mm + 6);
414 const __m128i K7 = _mm_loadu_si128(key_mm + 7);
415 const __m128i K8 = _mm_loadu_si128(key_mm + 8);
416 const __m128i K9 = _mm_loadu_si128(key_mm + 9);
417 const __m128i K10 = _mm_loadu_si128(key_mm + 10);
418 const __m128i K11 = _mm_loadu_si128(key_mm + 11);
419 const __m128i K12 = _mm_loadu_si128(key_mm + 12);
420
421 while(blocks >= 4)
422 {
423 __m128i B0 = _mm_loadu_si128(in_mm + 0);
424 __m128i B1 = _mm_loadu_si128(in_mm + 1);
425 __m128i B2 = _mm_loadu_si128(in_mm + 2);
426 __m128i B3 = _mm_loadu_si128(in_mm + 3);
427
428 B0 = _mm_xor_si128(B0, K0);
429 B1 = _mm_xor_si128(B1, K0);
430 B2 = _mm_xor_si128(B2, K0);
431 B3 = _mm_xor_si128(B3, K0);
432
442 AES_DEC_4_ROUNDS(K10);
443 AES_DEC_4_ROUNDS(K11);
445
446 _mm_storeu_si128(out_mm + 0, B0);
447 _mm_storeu_si128(out_mm + 1, B1);
448 _mm_storeu_si128(out_mm + 2, B2);
449 _mm_storeu_si128(out_mm + 3, B3);
450
451 blocks -= 4;
452 in_mm += 4;
453 out_mm += 4;
454 }
455
456 for(size_t i = 0; i != blocks; ++i)
457 {
458 __m128i B = _mm_loadu_si128(in_mm + i);
459
460 B = _mm_xor_si128(B, K0);
461
462 B = _mm_aesdec_si128(B, K1);
463 B = _mm_aesdec_si128(B, K2);
464 B = _mm_aesdec_si128(B, K3);
465 B = _mm_aesdec_si128(B, K4);
466 B = _mm_aesdec_si128(B, K5);
467 B = _mm_aesdec_si128(B, K6);
468 B = _mm_aesdec_si128(B, K7);
469 B = _mm_aesdec_si128(B, K8);
470 B = _mm_aesdec_si128(B, K9);
471 B = _mm_aesdec_si128(B, K10);
472 B = _mm_aesdec_si128(B, K11);
473 B = _mm_aesdeclast_si128(B, K12);
474
475 _mm_storeu_si128(out_mm + i, B);
476 }
477 }
478
479/*
480* AES-192 Key Schedule
481*/
482BOTAN_FUNC_ISA("ssse3,aes")
483void AES_192::aesni_key_schedule(const uint8_t key[], size_t)
484 {
485 m_EK.resize(52);
486 m_DK.resize(52);
487
488 __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
489 __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8));
490 K1 = _mm_srli_si128(K1, 8);
491
492 load_le(m_EK.data(), key, 6);
493
494 #define AES_192_key_exp(RCON, EK_OFF) \
495 aes_192_key_expansion(&K0, &K1, \
496 _mm_aeskeygenassist_si128(K1, RCON), \
497 &m_EK[EK_OFF], EK_OFF == 48)
498
499 AES_192_key_exp(0x01, 6);
500 AES_192_key_exp(0x02, 12);
501 AES_192_key_exp(0x04, 18);
502 AES_192_key_exp(0x08, 24);
503 AES_192_key_exp(0x10, 30);
504 AES_192_key_exp(0x20, 36);
505 AES_192_key_exp(0x40, 42);
506 AES_192_key_exp(0x80, 48);
507
508 #undef AES_192_key_exp
509
510 // Now generate decryption keys
511 const __m128i* EK_mm = reinterpret_cast<const __m128i*>(m_EK.data());
512
513 __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
514 _mm_storeu_si128(DK_mm , _mm_loadu_si128(EK_mm + 12));
515 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
516 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
517 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
518 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
519 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
520 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
521 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
522 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
523 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
524 _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
525 _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
526 _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
527 }
528
529/*
530* AES-256 Encryption
531*/
532BOTAN_FUNC_ISA("ssse3,aes")
533void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
534 {
535 const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
536 __m128i* out_mm = reinterpret_cast<__m128i*>(out);
537
538 const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data());
539
540 const __m128i K0 = _mm_loadu_si128(key_mm);
541 const __m128i K1 = _mm_loadu_si128(key_mm + 1);
542 const __m128i K2 = _mm_loadu_si128(key_mm + 2);
543 const __m128i K3 = _mm_loadu_si128(key_mm + 3);
544 const __m128i K4 = _mm_loadu_si128(key_mm + 4);
545 const __m128i K5 = _mm_loadu_si128(key_mm + 5);
546 const __m128i K6 = _mm_loadu_si128(key_mm + 6);
547 const __m128i K7 = _mm_loadu_si128(key_mm + 7);
548 const __m128i K8 = _mm_loadu_si128(key_mm + 8);
549 const __m128i K9 = _mm_loadu_si128(key_mm + 9);
550 const __m128i K10 = _mm_loadu_si128(key_mm + 10);
551 const __m128i K11 = _mm_loadu_si128(key_mm + 11);
552 const __m128i K12 = _mm_loadu_si128(key_mm + 12);
553 const __m128i K13 = _mm_loadu_si128(key_mm + 13);
554 const __m128i K14 = _mm_loadu_si128(key_mm + 14);
555
556 while(blocks >= 4)
557 {
558 __m128i B0 = _mm_loadu_si128(in_mm + 0);
559 __m128i B1 = _mm_loadu_si128(in_mm + 1);
560 __m128i B2 = _mm_loadu_si128(in_mm + 2);
561 __m128i B3 = _mm_loadu_si128(in_mm + 3);
562
563 B0 = _mm_xor_si128(B0, K0);
564 B1 = _mm_xor_si128(B1, K0);
565 B2 = _mm_xor_si128(B2, K0);
566 B3 = _mm_xor_si128(B3, K0);
567
577 AES_ENC_4_ROUNDS(K10);
578 AES_ENC_4_ROUNDS(K11);
579 AES_ENC_4_ROUNDS(K12);
580 AES_ENC_4_ROUNDS(K13);
582
583 _mm_storeu_si128(out_mm + 0, B0);
584 _mm_storeu_si128(out_mm + 1, B1);
585 _mm_storeu_si128(out_mm + 2, B2);
586 _mm_storeu_si128(out_mm + 3, B3);
587
588 blocks -= 4;
589 in_mm += 4;
590 out_mm += 4;
591 }
592
593 for(size_t i = 0; i != blocks; ++i)
594 {
595 __m128i B = _mm_loadu_si128(in_mm + i);
596
597 B = _mm_xor_si128(B, K0);
598
599 B = _mm_aesenc_si128(B, K1);
600 B = _mm_aesenc_si128(B, K2);
601 B = _mm_aesenc_si128(B, K3);
602 B = _mm_aesenc_si128(B, K4);
603 B = _mm_aesenc_si128(B, K5);
604 B = _mm_aesenc_si128(B, K6);
605 B = _mm_aesenc_si128(B, K7);
606 B = _mm_aesenc_si128(B, K8);
607 B = _mm_aesenc_si128(B, K9);
608 B = _mm_aesenc_si128(B, K10);
609 B = _mm_aesenc_si128(B, K11);
610 B = _mm_aesenc_si128(B, K12);
611 B = _mm_aesenc_si128(B, K13);
612 B = _mm_aesenclast_si128(B, K14);
613
614 _mm_storeu_si128(out_mm + i, B);
615 }
616 }
617
618/*
619* AES-256 Decryption
620*/
621BOTAN_FUNC_ISA("ssse3,aes")
622void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
623 {
624 const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
625 __m128i* out_mm = reinterpret_cast<__m128i*>(out);
626
627 const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data());
628
629 const __m128i K0 = _mm_loadu_si128(key_mm);
630 const __m128i K1 = _mm_loadu_si128(key_mm + 1);
631 const __m128i K2 = _mm_loadu_si128(key_mm + 2);
632 const __m128i K3 = _mm_loadu_si128(key_mm + 3);
633 const __m128i K4 = _mm_loadu_si128(key_mm + 4);
634 const __m128i K5 = _mm_loadu_si128(key_mm + 5);
635 const __m128i K6 = _mm_loadu_si128(key_mm + 6);
636 const __m128i K7 = _mm_loadu_si128(key_mm + 7);
637 const __m128i K8 = _mm_loadu_si128(key_mm + 8);
638 const __m128i K9 = _mm_loadu_si128(key_mm + 9);
639 const __m128i K10 = _mm_loadu_si128(key_mm + 10);
640 const __m128i K11 = _mm_loadu_si128(key_mm + 11);
641 const __m128i K12 = _mm_loadu_si128(key_mm + 12);
642 const __m128i K13 = _mm_loadu_si128(key_mm + 13);
643 const __m128i K14 = _mm_loadu_si128(key_mm + 14);
644
645 while(blocks >= 4)
646 {
647 __m128i B0 = _mm_loadu_si128(in_mm + 0);
648 __m128i B1 = _mm_loadu_si128(in_mm + 1);
649 __m128i B2 = _mm_loadu_si128(in_mm + 2);
650 __m128i B3 = _mm_loadu_si128(in_mm + 3);
651
652 B0 = _mm_xor_si128(B0, K0);
653 B1 = _mm_xor_si128(B1, K0);
654 B2 = _mm_xor_si128(B2, K0);
655 B3 = _mm_xor_si128(B3, K0);
656
666 AES_DEC_4_ROUNDS(K10);
667 AES_DEC_4_ROUNDS(K11);
668 AES_DEC_4_ROUNDS(K12);
669 AES_DEC_4_ROUNDS(K13);
671
672 _mm_storeu_si128(out_mm + 0, B0);
673 _mm_storeu_si128(out_mm + 1, B1);
674 _mm_storeu_si128(out_mm + 2, B2);
675 _mm_storeu_si128(out_mm + 3, B3);
676
677 blocks -= 4;
678 in_mm += 4;
679 out_mm += 4;
680 }
681
682 for(size_t i = 0; i != blocks; ++i)
683 {
684 __m128i B = _mm_loadu_si128(in_mm + i);
685
686 B = _mm_xor_si128(B, K0);
687
688 B = _mm_aesdec_si128(B, K1);
689 B = _mm_aesdec_si128(B, K2);
690 B = _mm_aesdec_si128(B, K3);
691 B = _mm_aesdec_si128(B, K4);
692 B = _mm_aesdec_si128(B, K5);
693 B = _mm_aesdec_si128(B, K6);
694 B = _mm_aesdec_si128(B, K7);
695 B = _mm_aesdec_si128(B, K8);
696 B = _mm_aesdec_si128(B, K9);
697 B = _mm_aesdec_si128(B, K10);
698 B = _mm_aesdec_si128(B, K11);
699 B = _mm_aesdec_si128(B, K12);
700 B = _mm_aesdec_si128(B, K13);
701 B = _mm_aesdeclast_si128(B, K14);
702
703 _mm_storeu_si128(out_mm + i, B);
704 }
705 }
706
707/*
708* AES-256 Key Schedule
709*/
710BOTAN_FUNC_ISA("ssse3,aes")
711void AES_256::aesni_key_schedule(const uint8_t key[], size_t)
712 {
713 m_EK.resize(60);
714 m_DK.resize(60);
715
716 const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
717 const __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16));
718
719 const __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01));
720 const __m128i K3 = aes_256_key_expansion(K1, K2);
721
722 const __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02));
723 const __m128i K5 = aes_256_key_expansion(K3, K4);
724
725 const __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04));
726 const __m128i K7 = aes_256_key_expansion(K5, K6);
727
728 const __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08));
729 const __m128i K9 = aes_256_key_expansion(K7, K8);
730
731 const __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10));
732 const __m128i K11 = aes_256_key_expansion(K9, K10);
733
734 const __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20));
735 const __m128i K13 = aes_256_key_expansion(K11, K12);
736
737 const __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40));
738
739 __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
740 _mm_storeu_si128(EK_mm , K0);
741 _mm_storeu_si128(EK_mm + 1, K1);
742 _mm_storeu_si128(EK_mm + 2, K2);
743 _mm_storeu_si128(EK_mm + 3, K3);
744 _mm_storeu_si128(EK_mm + 4, K4);
745 _mm_storeu_si128(EK_mm + 5, K5);
746 _mm_storeu_si128(EK_mm + 6, K6);
747 _mm_storeu_si128(EK_mm + 7, K7);
748 _mm_storeu_si128(EK_mm + 8, K8);
749 _mm_storeu_si128(EK_mm + 9, K9);
750 _mm_storeu_si128(EK_mm + 10, K10);
751 _mm_storeu_si128(EK_mm + 11, K11);
752 _mm_storeu_si128(EK_mm + 12, K12);
753 _mm_storeu_si128(EK_mm + 13, K13);
754 _mm_storeu_si128(EK_mm + 14, K14);
755
756 // Now generate decryption keys
757 __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
758 _mm_storeu_si128(DK_mm , K14);
759 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13));
760 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12));
761 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11));
762 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10));
763 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9));
764 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8));
765 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7));
766 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6));
767 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5));
768 _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
769 _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
770 _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
771 _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
772 _mm_storeu_si128(DK_mm + 14, K0);
773 }
774
775#undef AES_ENC_4_ROUNDS
776#undef AES_ENC_4_LAST_ROUNDS
777#undef AES_DEC_4_ROUNDS
778#undef AES_DEC_4_LAST_ROUNDS
779
780}
#define AES_DEC_4_ROUNDS(K)
Definition: aes_ni.cpp:88
#define AES_ENC_4_LAST_ROUNDS(K)
Definition: aes_ni.cpp:79
#define AES_ENC_4_ROUNDS(K)
Definition: aes_ni.cpp:70
#define AES_192_key_exp(RCON, EK_OFF)
#define AES_DEC_4_LAST_ROUNDS(K)
Definition: aes_ni.cpp:97
#define AES_128_key_exp(K, RCON)
#define BOTAN_FUNC_ISA(isa)
Definition: compiler.h:77
Definition: alg_id.cpp:13
T load_le(const uint8_t in[], size_t off)
Definition: loadstor.h:123