9#include <botan/loadstor.h>
/*
* One AES-128 round-key derivation step.
*
* @param key the previous 128-bit round key
* @param key_with_rcon result of _mm_aeskeygenassist_si128 on @p key with
*        this round's RCON; only its top word (SubWord(RotWord(w3)) ^ rcon)
*        is used
* @return the next 128-bit round key
*/
__m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
   {
   // Broadcast the SubWord(RotWord(w3)) ^ rcon word to all four lanes
   key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3));
   // Three shift-and-xor steps turn [w0,w1,w2,w3] into the running XOR
   // prefix [w0, w0^w1, w0^w1^w2, w0^w1^w2^w3]
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   return _mm_xor_si128(key, key_with_rcon);
   }
/*
* One AES-192 key expansion step, producing six 32-bit key words
* (four in the final step).
*
* NOTE(review): the extracted text was missing the loads/stores of *K1/*K2
* and the early return on the final step; restored here to match the
* structure implied by the pointer parameters, the `last` flag and the
* out[4]/out[5] stores — confirm against upstream.
*
* @param K1 in/out: low 128 bits of the expansion state
* @param K2 in/out: next 64 bits of the expansion state (in the low lanes)
* @param key2_with_rcon _mm_aeskeygenassist_si128 output; word 1 holds
*        SubWord(RotWord(...)) ^ rcon for this step
* @param out receives 6 (or 4, if @p last) expanded key words
* @param last true on the final expansion step, which only emits 4 words
*/
void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
                           uint32_t out[], bool last)
   {
   __m128i key1 = *K1;
   __m128i key2 = *K2;

   // Broadcast the assist word for this step to all four lanes
   key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));
   // XOR-prefix of the previous four words, then mix in the rcon word
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, key2_with_rcon);

   *K1 = key1;
   _mm_storeu_si128(reinterpret_cast<__m128i*>(out), key1);

   // The final step of the AES-192 schedule produces only four words
   if(last)
      return;

   key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
   key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3)));

   *K2 = key2;
   out[4] = _mm_cvtsi128_si32(key2);
   out[5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
   }
/*
* AES-256 "even" key expansion step (no round constant).
*
* Derives the next round key from @p key using SubWord of @p key2's top
* word (_mm_aeskeygenassist_si128 with RCON 0, taking lane 2, gives
* SubWord without the rotate).
*
* @param key  the round key two steps back
* @param key2 the previous round key
* @return the next 128-bit round key
*/
__m128i aes_256_key_expansion(__m128i key, __m128i key2)
   {
   __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
   key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2,2,2,2));

   // XOR-prefix of the previous words, then mix in SubWord(w3 of key2)
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   return _mm_xor_si128(key, key_with_rcon);
   }
// Apply one AES encryption round to the four in-flight blocks B0..B3,
// keeping the four independent AESENC chains pipelined.
#define AES_ENC_4_ROUNDS(K)                \
   do                                      \
      {                                    \
      B0 = _mm_aesenc_si128(B0, K);        \
      B1 = _mm_aesenc_si128(B1, K);        \
      B2 = _mm_aesenc_si128(B2, K);        \
      B3 = _mm_aesenc_si128(B3, K);        \
      } while(0)
// Apply the final (no MixColumns) AES encryption round to blocks B0..B3.
#define AES_ENC_4_LAST_ROUNDS(K)           \
   do                                      \
      {                                    \
      B0 = _mm_aesenclast_si128(B0, K);    \
      B1 = _mm_aesenclast_si128(B1, K);    \
      B2 = _mm_aesenclast_si128(B2, K);    \
      B3 = _mm_aesenclast_si128(B3, K);    \
      } while(0)
// Apply one AES decryption round to the four in-flight blocks B0..B3.
#define AES_DEC_4_ROUNDS(K)                \
   do                                      \
      {                                    \
      B0 = _mm_aesdec_si128(B0, K);        \
      B1 = _mm_aesdec_si128(B1, K);        \
      B2 = _mm_aesdec_si128(B2, K);        \
      B3 = _mm_aesdec_si128(B3, K);        \
      } while(0)
// Apply the final AES decryption round to blocks B0..B3.
#define AES_DEC_4_LAST_ROUNDS(K)           \
   do                                      \
      {                                    \
      B0 = _mm_aesdeclast_si128(B0, K);    \
      B1 = _mm_aesdeclast_si128(B1, K);    \
      B2 = _mm_aesdeclast_si128(B2, K);    \
      B3 = _mm_aesdeclast_si128(B3, K);    \
      } while(0)
110void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const
112 const __m128i* in_mm =
reinterpret_cast<const __m128i*
>(in);
113 __m128i* out_mm =
reinterpret_cast<__m128i*
>(out);
115 const __m128i* key_mm =
reinterpret_cast<const __m128i*
>(m_EK.data());
117 const __m128i K0 = _mm_loadu_si128(key_mm);
118 const __m128i K1 = _mm_loadu_si128(key_mm + 1);
119 const __m128i K2 = _mm_loadu_si128(key_mm + 2);
120 const __m128i K3 = _mm_loadu_si128(key_mm + 3);
121 const __m128i K4 = _mm_loadu_si128(key_mm + 4);
122 const __m128i K5 = _mm_loadu_si128(key_mm + 5);
123 const __m128i K6 = _mm_loadu_si128(key_mm + 6);
124 const __m128i K7 = _mm_loadu_si128(key_mm + 7);
125 const __m128i K8 = _mm_loadu_si128(key_mm + 8);
126 const __m128i K9 = _mm_loadu_si128(key_mm + 9);
127 const __m128i K10 = _mm_loadu_si128(key_mm + 10);
131 __m128i B0 = _mm_loadu_si128(in_mm + 0);
132 __m128i B1 = _mm_loadu_si128(in_mm + 1);
133 __m128i B2 = _mm_loadu_si128(in_mm + 2);
134 __m128i B3 = _mm_loadu_si128(in_mm + 3);
136 B0 = _mm_xor_si128(B0, K0);
137 B1 = _mm_xor_si128(B1, K0);
138 B2 = _mm_xor_si128(B2, K0);
139 B3 = _mm_xor_si128(B3, K0);
152 _mm_storeu_si128(out_mm + 0, B0);
153 _mm_storeu_si128(out_mm + 1, B1);
154 _mm_storeu_si128(out_mm + 2, B2);
155 _mm_storeu_si128(out_mm + 3, B3);
162 for(
size_t i = 0; i != blocks; ++i)
164 __m128i B = _mm_loadu_si128(in_mm + i);
166 B = _mm_xor_si128(B, K0);
168 B = _mm_aesenc_si128(B, K1);
169 B = _mm_aesenc_si128(B, K2);
170 B = _mm_aesenc_si128(B, K3);
171 B = _mm_aesenc_si128(B, K4);
172 B = _mm_aesenc_si128(B, K5);
173 B = _mm_aesenc_si128(B, K6);
174 B = _mm_aesenc_si128(B, K7);
175 B = _mm_aesenc_si128(B, K8);
176 B = _mm_aesenc_si128(B, K9);
177 B = _mm_aesenclast_si128(B, K10);
179 _mm_storeu_si128(out_mm + i, B);
187void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const
189 const __m128i* in_mm =
reinterpret_cast<const __m128i*
>(in);
190 __m128i* out_mm =
reinterpret_cast<__m128i*
>(out);
192 const __m128i* key_mm =
reinterpret_cast<const __m128i*
>(m_DK.data());
194 const __m128i K0 = _mm_loadu_si128(key_mm);
195 const __m128i K1 = _mm_loadu_si128(key_mm + 1);
196 const __m128i K2 = _mm_loadu_si128(key_mm + 2);
197 const __m128i K3 = _mm_loadu_si128(key_mm + 3);
198 const __m128i K4 = _mm_loadu_si128(key_mm + 4);
199 const __m128i K5 = _mm_loadu_si128(key_mm + 5);
200 const __m128i K6 = _mm_loadu_si128(key_mm + 6);
201 const __m128i K7 = _mm_loadu_si128(key_mm + 7);
202 const __m128i K8 = _mm_loadu_si128(key_mm + 8);
203 const __m128i K9 = _mm_loadu_si128(key_mm + 9);
204 const __m128i K10 = _mm_loadu_si128(key_mm + 10);
208 __m128i B0 = _mm_loadu_si128(in_mm + 0);
209 __m128i B1 = _mm_loadu_si128(in_mm + 1);
210 __m128i B2 = _mm_loadu_si128(in_mm + 2);
211 __m128i B3 = _mm_loadu_si128(in_mm + 3);
213 B0 = _mm_xor_si128(B0, K0);
214 B1 = _mm_xor_si128(B1, K0);
215 B2 = _mm_xor_si128(B2, K0);
216 B3 = _mm_xor_si128(B3, K0);
229 _mm_storeu_si128(out_mm + 0, B0);
230 _mm_storeu_si128(out_mm + 1, B1);
231 _mm_storeu_si128(out_mm + 2, B2);
232 _mm_storeu_si128(out_mm + 3, B3);
239 for(
size_t i = 0; i != blocks; ++i)
241 __m128i B = _mm_loadu_si128(in_mm + i);
243 B = _mm_xor_si128(B, K0);
245 B = _mm_aesdec_si128(B, K1);
246 B = _mm_aesdec_si128(B, K2);
247 B = _mm_aesdec_si128(B, K3);
248 B = _mm_aesdec_si128(B, K4);
249 B = _mm_aesdec_si128(B, K5);
250 B = _mm_aesdec_si128(B, K6);
251 B = _mm_aesdec_si128(B, K7);
252 B = _mm_aesdec_si128(B, K8);
253 B = _mm_aesdec_si128(B, K9);
254 B = _mm_aesdeclast_si128(B, K10);
256 _mm_storeu_si128(out_mm + i, B);
264void AES_128::aesni_key_schedule(const uint8_t key[],
size_t)
269 #define AES_128_key_exp(K, RCON) \
270 aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON))
272 const __m128i K0 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key));
284 __m128i* EK_mm =
reinterpret_cast<__m128i*
>(m_EK.data());
285 _mm_storeu_si128(EK_mm , K0);
286 _mm_storeu_si128(EK_mm + 1, K1);
287 _mm_storeu_si128(EK_mm + 2, K2);
288 _mm_storeu_si128(EK_mm + 3, K3);
289 _mm_storeu_si128(EK_mm + 4, K4);
290 _mm_storeu_si128(EK_mm + 5, K5);
291 _mm_storeu_si128(EK_mm + 6, K6);
292 _mm_storeu_si128(EK_mm + 7, K7);
293 _mm_storeu_si128(EK_mm + 8, K8);
294 _mm_storeu_si128(EK_mm + 9, K9);
295 _mm_storeu_si128(EK_mm + 10, K10);
299 __m128i* DK_mm =
reinterpret_cast<__m128i*
>(m_DK.data());
300 _mm_storeu_si128(DK_mm , K10);
301 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9));
302 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8));
303 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7));
304 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6));
305 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5));
306 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4));
307 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3));
308 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2));
309 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1));
310 _mm_storeu_si128(DK_mm + 10, K0);
317void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const
319 const __m128i* in_mm =
reinterpret_cast<const __m128i*
>(in);
320 __m128i* out_mm =
reinterpret_cast<__m128i*
>(out);
322 const __m128i* key_mm =
reinterpret_cast<const __m128i*
>(m_EK.data());
324 const __m128i K0 = _mm_loadu_si128(key_mm);
325 const __m128i K1 = _mm_loadu_si128(key_mm + 1);
326 const __m128i K2 = _mm_loadu_si128(key_mm + 2);
327 const __m128i K3 = _mm_loadu_si128(key_mm + 3);
328 const __m128i K4 = _mm_loadu_si128(key_mm + 4);
329 const __m128i K5 = _mm_loadu_si128(key_mm + 5);
330 const __m128i K6 = _mm_loadu_si128(key_mm + 6);
331 const __m128i K7 = _mm_loadu_si128(key_mm + 7);
332 const __m128i K8 = _mm_loadu_si128(key_mm + 8);
333 const __m128i K9 = _mm_loadu_si128(key_mm + 9);
334 const __m128i K10 = _mm_loadu_si128(key_mm + 10);
335 const __m128i K11 = _mm_loadu_si128(key_mm + 11);
336 const __m128i K12 = _mm_loadu_si128(key_mm + 12);
340 __m128i B0 = _mm_loadu_si128(in_mm + 0);
341 __m128i B1 = _mm_loadu_si128(in_mm + 1);
342 __m128i B2 = _mm_loadu_si128(in_mm + 2);
343 __m128i B3 = _mm_loadu_si128(in_mm + 3);
345 B0 = _mm_xor_si128(B0, K0);
346 B1 = _mm_xor_si128(B1, K0);
347 B2 = _mm_xor_si128(B2, K0);
348 B3 = _mm_xor_si128(B3, K0);
363 _mm_storeu_si128(out_mm + 0, B0);
364 _mm_storeu_si128(out_mm + 1, B1);
365 _mm_storeu_si128(out_mm + 2, B2);
366 _mm_storeu_si128(out_mm + 3, B3);
373 for(
size_t i = 0; i != blocks; ++i)
375 __m128i B = _mm_loadu_si128(in_mm + i);
377 B = _mm_xor_si128(B, K0);
379 B = _mm_aesenc_si128(B, K1);
380 B = _mm_aesenc_si128(B, K2);
381 B = _mm_aesenc_si128(B, K3);
382 B = _mm_aesenc_si128(B, K4);
383 B = _mm_aesenc_si128(B, K5);
384 B = _mm_aesenc_si128(B, K6);
385 B = _mm_aesenc_si128(B, K7);
386 B = _mm_aesenc_si128(B, K8);
387 B = _mm_aesenc_si128(B, K9);
388 B = _mm_aesenc_si128(B, K10);
389 B = _mm_aesenc_si128(B, K11);
390 B = _mm_aesenclast_si128(B, K12);
392 _mm_storeu_si128(out_mm + i, B);
400void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const
402 const __m128i* in_mm =
reinterpret_cast<const __m128i*
>(in);
403 __m128i* out_mm =
reinterpret_cast<__m128i*
>(out);
405 const __m128i* key_mm =
reinterpret_cast<const __m128i*
>(m_DK.data());
407 const __m128i K0 = _mm_loadu_si128(key_mm);
408 const __m128i K1 = _mm_loadu_si128(key_mm + 1);
409 const __m128i K2 = _mm_loadu_si128(key_mm + 2);
410 const __m128i K3 = _mm_loadu_si128(key_mm + 3);
411 const __m128i K4 = _mm_loadu_si128(key_mm + 4);
412 const __m128i K5 = _mm_loadu_si128(key_mm + 5);
413 const __m128i K6 = _mm_loadu_si128(key_mm + 6);
414 const __m128i K7 = _mm_loadu_si128(key_mm + 7);
415 const __m128i K8 = _mm_loadu_si128(key_mm + 8);
416 const __m128i K9 = _mm_loadu_si128(key_mm + 9);
417 const __m128i K10 = _mm_loadu_si128(key_mm + 10);
418 const __m128i K11 = _mm_loadu_si128(key_mm + 11);
419 const __m128i K12 = _mm_loadu_si128(key_mm + 12);
423 __m128i B0 = _mm_loadu_si128(in_mm + 0);
424 __m128i B1 = _mm_loadu_si128(in_mm + 1);
425 __m128i B2 = _mm_loadu_si128(in_mm + 2);
426 __m128i B3 = _mm_loadu_si128(in_mm + 3);
428 B0 = _mm_xor_si128(B0, K0);
429 B1 = _mm_xor_si128(B1, K0);
430 B2 = _mm_xor_si128(B2, K0);
431 B3 = _mm_xor_si128(B3, K0);
446 _mm_storeu_si128(out_mm + 0, B0);
447 _mm_storeu_si128(out_mm + 1, B1);
448 _mm_storeu_si128(out_mm + 2, B2);
449 _mm_storeu_si128(out_mm + 3, B3);
456 for(
size_t i = 0; i != blocks; ++i)
458 __m128i B = _mm_loadu_si128(in_mm + i);
460 B = _mm_xor_si128(B, K0);
462 B = _mm_aesdec_si128(B, K1);
463 B = _mm_aesdec_si128(B, K2);
464 B = _mm_aesdec_si128(B, K3);
465 B = _mm_aesdec_si128(B, K4);
466 B = _mm_aesdec_si128(B, K5);
467 B = _mm_aesdec_si128(B, K6);
468 B = _mm_aesdec_si128(B, K7);
469 B = _mm_aesdec_si128(B, K8);
470 B = _mm_aesdec_si128(B, K9);
471 B = _mm_aesdec_si128(B, K10);
472 B = _mm_aesdec_si128(B, K11);
473 B = _mm_aesdeclast_si128(B, K12);
475 _mm_storeu_si128(out_mm + i, B);
483void AES_192::aesni_key_schedule(const uint8_t key[],
size_t)
488 __m128i K0 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key));
489 __m128i K1 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key + 8));
490 K1 = _mm_srli_si128(K1, 8);
494 #define AES_192_key_exp(RCON, EK_OFF) \
495 aes_192_key_expansion(&K0, &K1, \
496 _mm_aeskeygenassist_si128(K1, RCON), \
497 &m_EK[EK_OFF], EK_OFF == 48)
508 #undef AES_192_key_exp
511 const __m128i* EK_mm =
reinterpret_cast<const __m128i*
>(m_EK.data());
513 __m128i* DK_mm =
reinterpret_cast<__m128i*
>(m_DK.data());
514 _mm_storeu_si128(DK_mm , _mm_loadu_si128(EK_mm + 12));
515 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
516 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
517 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
518 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
519 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
520 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
521 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
522 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
523 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
524 _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
525 _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
526 _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
533void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const
535 const __m128i* in_mm =
reinterpret_cast<const __m128i*
>(in);
536 __m128i* out_mm =
reinterpret_cast<__m128i*
>(out);
538 const __m128i* key_mm =
reinterpret_cast<const __m128i*
>(m_EK.data());
540 const __m128i K0 = _mm_loadu_si128(key_mm);
541 const __m128i K1 = _mm_loadu_si128(key_mm + 1);
542 const __m128i K2 = _mm_loadu_si128(key_mm + 2);
543 const __m128i K3 = _mm_loadu_si128(key_mm + 3);
544 const __m128i K4 = _mm_loadu_si128(key_mm + 4);
545 const __m128i K5 = _mm_loadu_si128(key_mm + 5);
546 const __m128i K6 = _mm_loadu_si128(key_mm + 6);
547 const __m128i K7 = _mm_loadu_si128(key_mm + 7);
548 const __m128i K8 = _mm_loadu_si128(key_mm + 8);
549 const __m128i K9 = _mm_loadu_si128(key_mm + 9);
550 const __m128i K10 = _mm_loadu_si128(key_mm + 10);
551 const __m128i K11 = _mm_loadu_si128(key_mm + 11);
552 const __m128i K12 = _mm_loadu_si128(key_mm + 12);
553 const __m128i K13 = _mm_loadu_si128(key_mm + 13);
554 const __m128i K14 = _mm_loadu_si128(key_mm + 14);
558 __m128i B0 = _mm_loadu_si128(in_mm + 0);
559 __m128i B1 = _mm_loadu_si128(in_mm + 1);
560 __m128i B2 = _mm_loadu_si128(in_mm + 2);
561 __m128i B3 = _mm_loadu_si128(in_mm + 3);
563 B0 = _mm_xor_si128(B0, K0);
564 B1 = _mm_xor_si128(B1, K0);
565 B2 = _mm_xor_si128(B2, K0);
566 B3 = _mm_xor_si128(B3, K0);
583 _mm_storeu_si128(out_mm + 0, B0);
584 _mm_storeu_si128(out_mm + 1, B1);
585 _mm_storeu_si128(out_mm + 2, B2);
586 _mm_storeu_si128(out_mm + 3, B3);
593 for(
size_t i = 0; i != blocks; ++i)
595 __m128i B = _mm_loadu_si128(in_mm + i);
597 B = _mm_xor_si128(B, K0);
599 B = _mm_aesenc_si128(B, K1);
600 B = _mm_aesenc_si128(B, K2);
601 B = _mm_aesenc_si128(B, K3);
602 B = _mm_aesenc_si128(B, K4);
603 B = _mm_aesenc_si128(B, K5);
604 B = _mm_aesenc_si128(B, K6);
605 B = _mm_aesenc_si128(B, K7);
606 B = _mm_aesenc_si128(B, K8);
607 B = _mm_aesenc_si128(B, K9);
608 B = _mm_aesenc_si128(B, K10);
609 B = _mm_aesenc_si128(B, K11);
610 B = _mm_aesenc_si128(B, K12);
611 B = _mm_aesenc_si128(B, K13);
612 B = _mm_aesenclast_si128(B, K14);
614 _mm_storeu_si128(out_mm + i, B);
622void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const
624 const __m128i* in_mm =
reinterpret_cast<const __m128i*
>(in);
625 __m128i* out_mm =
reinterpret_cast<__m128i*
>(out);
627 const __m128i* key_mm =
reinterpret_cast<const __m128i*
>(m_DK.data());
629 const __m128i K0 = _mm_loadu_si128(key_mm);
630 const __m128i K1 = _mm_loadu_si128(key_mm + 1);
631 const __m128i K2 = _mm_loadu_si128(key_mm + 2);
632 const __m128i K3 = _mm_loadu_si128(key_mm + 3);
633 const __m128i K4 = _mm_loadu_si128(key_mm + 4);
634 const __m128i K5 = _mm_loadu_si128(key_mm + 5);
635 const __m128i K6 = _mm_loadu_si128(key_mm + 6);
636 const __m128i K7 = _mm_loadu_si128(key_mm + 7);
637 const __m128i K8 = _mm_loadu_si128(key_mm + 8);
638 const __m128i K9 = _mm_loadu_si128(key_mm + 9);
639 const __m128i K10 = _mm_loadu_si128(key_mm + 10);
640 const __m128i K11 = _mm_loadu_si128(key_mm + 11);
641 const __m128i K12 = _mm_loadu_si128(key_mm + 12);
642 const __m128i K13 = _mm_loadu_si128(key_mm + 13);
643 const __m128i K14 = _mm_loadu_si128(key_mm + 14);
647 __m128i B0 = _mm_loadu_si128(in_mm + 0);
648 __m128i B1 = _mm_loadu_si128(in_mm + 1);
649 __m128i B2 = _mm_loadu_si128(in_mm + 2);
650 __m128i B3 = _mm_loadu_si128(in_mm + 3);
652 B0 = _mm_xor_si128(B0, K0);
653 B1 = _mm_xor_si128(B1, K0);
654 B2 = _mm_xor_si128(B2, K0);
655 B3 = _mm_xor_si128(B3, K0);
672 _mm_storeu_si128(out_mm + 0, B0);
673 _mm_storeu_si128(out_mm + 1, B1);
674 _mm_storeu_si128(out_mm + 2, B2);
675 _mm_storeu_si128(out_mm + 3, B3);
682 for(
size_t i = 0; i != blocks; ++i)
684 __m128i B = _mm_loadu_si128(in_mm + i);
686 B = _mm_xor_si128(B, K0);
688 B = _mm_aesdec_si128(B, K1);
689 B = _mm_aesdec_si128(B, K2);
690 B = _mm_aesdec_si128(B, K3);
691 B = _mm_aesdec_si128(B, K4);
692 B = _mm_aesdec_si128(B, K5);
693 B = _mm_aesdec_si128(B, K6);
694 B = _mm_aesdec_si128(B, K7);
695 B = _mm_aesdec_si128(B, K8);
696 B = _mm_aesdec_si128(B, K9);
697 B = _mm_aesdec_si128(B, K10);
698 B = _mm_aesdec_si128(B, K11);
699 B = _mm_aesdec_si128(B, K12);
700 B = _mm_aesdec_si128(B, K13);
701 B = _mm_aesdeclast_si128(B, K14);
703 _mm_storeu_si128(out_mm + i, B);
711void AES_256::aesni_key_schedule(const uint8_t key[],
size_t)
716 const __m128i K0 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key));
717 const __m128i K1 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key + 16));
719 const __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01));
720 const __m128i K3 = aes_256_key_expansion(K1, K2);
722 const __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02));
723 const __m128i K5 = aes_256_key_expansion(K3, K4);
725 const __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04));
726 const __m128i K7 = aes_256_key_expansion(K5, K6);
728 const __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08));
729 const __m128i K9 = aes_256_key_expansion(K7, K8);
731 const __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10));
732 const __m128i K11 = aes_256_key_expansion(K9, K10);
734 const __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20));
735 const __m128i K13 = aes_256_key_expansion(K11, K12);
737 const __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40));
739 __m128i* EK_mm =
reinterpret_cast<__m128i*
>(m_EK.data());
740 _mm_storeu_si128(EK_mm , K0);
741 _mm_storeu_si128(EK_mm + 1, K1);
742 _mm_storeu_si128(EK_mm + 2, K2);
743 _mm_storeu_si128(EK_mm + 3, K3);
744 _mm_storeu_si128(EK_mm + 4, K4);
745 _mm_storeu_si128(EK_mm + 5, K5);
746 _mm_storeu_si128(EK_mm + 6, K6);
747 _mm_storeu_si128(EK_mm + 7, K7);
748 _mm_storeu_si128(EK_mm + 8, K8);
749 _mm_storeu_si128(EK_mm + 9, K9);
750 _mm_storeu_si128(EK_mm + 10, K10);
751 _mm_storeu_si128(EK_mm + 11, K11);
752 _mm_storeu_si128(EK_mm + 12, K12);
753 _mm_storeu_si128(EK_mm + 13, K13);
754 _mm_storeu_si128(EK_mm + 14, K14);
757 __m128i* DK_mm =
reinterpret_cast<__m128i*
>(m_DK.data());
758 _mm_storeu_si128(DK_mm , K14);
759 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13));
760 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12));
761 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11));
762 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10));
763 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9));
764 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8));
765 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7));
766 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6));
767 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5));
768 _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
769 _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
770 _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
771 _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
772 _mm_storeu_si128(DK_mm + 14, K0);
775#undef AES_ENC_4_ROUNDS
776#undef AES_ENC_4_LAST_ROUNDS
777#undef AES_DEC_4_ROUNDS
778#undef AES_DEC_4_LAST_ROUNDS
#define AES_DEC_4_ROUNDS(K)
#define AES_ENC_4_LAST_ROUNDS(K)
#define AES_ENC_4_ROUNDS(K)
#define AES_192_key_exp(RCON, EK_OFF)
#define AES_DEC_4_LAST_ROUNDS(K)
#define AES_128_key_exp(K, RCON)
#define BOTAN_FUNC_ISA(isa)
T load_le(const uint8_t in[], size_t off)