8#include <botan/internal/aes.h>
10#include <botan/internal/loadstor.h>
11#include <botan/internal/simd_32.h>
// One step of the AES-128 key schedule: derive the next 128-bit round key
// from the previous one.
//
// RC is the round constant for this step; it must be a template (compile
// time) parameter because _mm_aeskeygenassist_si128 requires an immediate
// operand. NOTE(review): `RC` was referenced below without any visible
// declaration, while every call site invokes aes_128_key_expansion<0xNN>;
// the missing `template <uint8_t RC>` header has been restored. (The real
// file likely also carries BOTAN_FUNC_ISA("ssse3,aes") here — confirm.)
template <uint8_t RC>
inline __m128i aes_128_key_expansion(__m128i key, __m128i key_getting_rcon) {
   // SubWord/RotWord of the previous key's last word with RC folded in;
   // lane 3 holds the value we need, so broadcast it to all four lanes.
   __m128i key_with_rcon = _mm_aeskeygenassist_si128(key_getting_rcon, RC);
   key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3, 3, 3, 3));

   // Three shift+xor steps turn [w0,w1,w2,w3] into the running prefix xor
   // [w0, w0^w1, w0^w1^w2, w0^w1^w2^w3], as the key schedule requires.
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));

   return _mm_xor_si128(key, key_with_rcon);
}
// Expands one iteration of the AES-192 key schedule, producing six 32-bit
// words starting at out[offset]; K1/K2 carry the rolling key state between
// successive calls.
//
// NOTE(review): this is a fragment — the entry lines that load key1/key2
// from *K1/*K2 (they appear below with no visible declaration), the
// write-back through the pointers, and any early exit for the final call
// (offset 48) are elided from this view. Confirm against the full source
// before modifying.
30void aes_192_key_expansion(
31 __m128i* K1, __m128i* K2, __m128i key2_with_rcon,
secure_vector<uint32_t>& out,
size_t offset) {
// Broadcast the rcon/SubWord lane (lane 1) produced by the caller's
// _mm_aeskeygenassist_si128 to all four 32-bit positions.
35 key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1, 1, 1, 1));
// Prefix-xor the four words of key1, then fold in the round-constant word.
36 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
37 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
38 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
39 key1 = _mm_xor_si128(key1, key2_with_rcon);
// Store four of the six new schedule words (unaligned store).
42 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(&out[offset]), key1);
// Derive the two extra words AES-192 produces per iteration from key2 and
// the last word of the freshly computed key1.
48 key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
49 key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3, 3, 3, 3)));
// Only the low two 32-bit lanes of key2 enter the schedule here.
52 out[offset + 4] = _mm_cvtsi128_si32(key2);
53 out[offset + 5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
59BOTAN_FUNC_ISA(
"ssse3,aes") __m128i aes_256_key_expansion(__m128i key, __m128i key2) {
60 __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
61 key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2, 2, 2, 2));
63 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
64 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
65 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
66 return _mm_xor_si128(key, key_with_rcon);
// AddRoundKey for four block states: xors round key K into B0..B3 before
// the first aesenc round.
//
// NOTE(review): fragment — keyxor's own body is elided from this view; the
// statement below operates on a variable `B` and belongs to a single-block
// aesenc overload whose signature was also elided. Confirm against the
// full source.
69BOTAN_FORCE_INLINE void keyxor(SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
77 B = SIMD_4x32(_mm_aesenc_si128(B.raw(),
K.raw()));
80BOTAN_FUNC_ISA_INLINE(
"aes") void aesenc(SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
81 B0 = SIMD_4x32(_mm_aesenc_si128(B0.raw(),
K.raw()));
82 B1 = SIMD_4x32(_mm_aesenc_si128(B1.raw(),
K.raw()));
83 B2 = SIMD_4x32(_mm_aesenc_si128(B2.raw(),
K.raw()));
84 B3 = SIMD_4x32(_mm_aesenc_si128(B3.raw(),
K.raw()));
// NOTE(review): fragment — this is the body of the single-block aesenclast
// overload (final encryption round, no MixColumns); its signature line is
// elided from this view.
88 B = SIMD_4x32(_mm_aesenclast_si128(B.raw(),
K.raw()));
91BOTAN_FUNC_ISA_INLINE(
"aes") void aesenclast(SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
92 B0 = SIMD_4x32(_mm_aesenclast_si128(B0.raw(),
K.raw()));
93 B1 = SIMD_4x32(_mm_aesenclast_si128(B1.raw(),
K.raw()));
94 B2 = SIMD_4x32(_mm_aesenclast_si128(B2.raw(),
K.raw()));
95 B3 = SIMD_4x32(_mm_aesenclast_si128(B3.raw(),
K.raw()));
// NOTE(review): fragment — body of the single-block aesdec overload (one
// inverse-cipher round); its signature line is elided from this view.
99 B = SIMD_4x32(_mm_aesdec_si128(B.raw(),
K.raw()));
102BOTAN_FUNC_ISA_INLINE(
"aes") void aesdec(SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
103 B0 = SIMD_4x32(_mm_aesdec_si128(B0.raw(),
K.raw()));
104 B1 = SIMD_4x32(_mm_aesdec_si128(B1.raw(),
K.raw()));
105 B2 = SIMD_4x32(_mm_aesdec_si128(B2.raw(),
K.raw()));
106 B3 = SIMD_4x32(_mm_aesdec_si128(B3.raw(),
K.raw()));
// NOTE(review): fragment — body of the single-block aesdeclast overload
// (final decryption round); its signature line is elided from this view.
110 B = SIMD_4x32(_mm_aesdeclast_si128(B.raw(),
K.raw()));
113BOTAN_FUNC_ISA_INLINE(
"aes") void aesdeclast(SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
114 B0 = SIMD_4x32(_mm_aesdeclast_si128(B0.raw(),
K.raw()));
115 B1 = SIMD_4x32(_mm_aesdeclast_si128(B1.raw(),
K.raw()));
116 B2 = SIMD_4x32(_mm_aesdeclast_si128(B2.raw(),
K.raw()));
117 B3 = SIMD_4x32(_mm_aesdeclast_si128(B3.raw(),
K.raw()));
// NOTE(review): fragment of AES_128::hw_aes_encrypt_n — the signature, the
// round-key loads (K0..K10 from m_EK), the 4-block load/store framing and
// the body of the tail loop are elided from this view.
//
// 4-way parallel AES-128 encryption: AddRoundKey, nine full rounds, then
// the final round without MixColumns.
144 keyxor(K0, B0, B1, B2, B3);
145 aesenc(K1, B0, B1, B2, B3);
146 aesenc(K2, B0, B1, B2, B3);
147 aesenc(K3, B0, B1, B2, B3);
148 aesenc(K4, B0, B1, B2, B3);
149 aesenc(K5, B0, B1, B2, B3);
150 aesenc(K6, B0, B1, B2, B3);
151 aesenc(K7, B0, B1, B2, B3);
152 aesenc(K8, B0, B1, B2, B3);
153 aesenc(K9, B0, B1, B2, B3);
154 aesenclast(K10, B0, B1, B2, B3);
// Tail loop: remaining blocks handled one at a time.
166 for(
size_t i = 0; i != blocks; ++i) {
// Decrypts `blocks` 16-byte blocks with AES-128 using AES-NI.
//
// NOTE(review): fragment — the round-key loads (K0..K10 from m_DK), the
// 4-block input loads and the surrounding while-loop over groups of four
// blocks are elided from this view.
188BOTAN_FUNC_ISA(
"ssse3,aes") void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// 4-way parallel inverse cipher: AddRoundKey, nine aesdec rounds, final
// aesdeclast round.
207 keyxor(K0, B0, B1, B2, B3);
208 aesdec(K1, B0, B1, B2, B3);
209 aesdec(K2, B0, B1, B2, B3);
210 aesdec(K3, B0, B1, B2, B3);
211 aesdec(K4, B0, B1, B2, B3);
212 aesdec(K5, B0, B1, B2, B3);
213 aesdec(K6, B0, B1, B2, B3);
214 aesdec(K7, B0, B1, B2, B3);
215 aesdec(K8, B0, B1, B2, B3);
216 aesdec(K9, B0, B1, B2, B3);
217 aesdeclast(K10, B0, B1, B2, B3);
// Write the four decrypted blocks back in order.
219 B0.store_le(out + 16 * 0);
220 B1.store_le(out + 16 * 1);
221 B2.store_le(out + 16 * 2);
222 B3.store_le(out + 16 * 3);
// Tail loop: remaining (< 4) blocks one at a time.
229 for(
size_t i = 0; i != blocks; ++i) {
244 B0.store_le(out + 16 * i);
// Builds the AES-128 key schedule with AES-NI: each round key is derived
// from the previous one via aes_128_key_expansion<RC>, with the standard
// rcon sequence 01,02,04,...,80,1B,36. Both the encryption (m_EK) and
// decryption (m_DK) schedules are produced. The key-length parameter is
// unused (AES-128 keys are always 16 bytes).
//
// NOTE(review): the lines sizing m_EK/m_DK (44 words each would be needed
// for the 11 round keys stored below) and the closing brace are elided
// from this view — confirm the buffers are allocated before these stores.
251BOTAN_FUNC_ISA(
"ssse3,aes") void AES_128::aesni_key_schedule(const uint8_t key[],
size_t ) {
// Load the 16-byte user key (unaligned load).
255 const __m128i K0 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key));
256 const __m128i K1 = aes_128_key_expansion<0x01>(K0, K0);
257 const __m128i
K2 = aes_128_key_expansion<0x02>(K1, K1);
258 const __m128i K3 = aes_128_key_expansion<0x04>(K2, K2);
259 const __m128i K4 = aes_128_key_expansion<0x08>(K3, K3);
260 const __m128i K5 = aes_128_key_expansion<0x10>(K4, K4);
261 const __m128i K6 = aes_128_key_expansion<0x20>(K5, K5);
262 const __m128i K7 = aes_128_key_expansion<0x40>(K6, K6);
263 const __m128i K8 = aes_128_key_expansion<0x80>(K7, K7);
264 const __m128i K9 = aes_128_key_expansion<0x1B>(K8, K8);
265 const __m128i K10 = aes_128_key_expansion<0x36>(K9, K9);
// Encryption schedule: round keys in forward order.
267 __m128i* EK_mm =
reinterpret_cast<__m128i*
>(m_EK.data());
268 _mm_storeu_si128(EK_mm, K0);
269 _mm_storeu_si128(EK_mm + 1, K1);
270 _mm_storeu_si128(EK_mm + 2, K2);
271 _mm_storeu_si128(EK_mm + 3, K3);
272 _mm_storeu_si128(EK_mm + 4, K4);
273 _mm_storeu_si128(EK_mm + 5, K5);
274 _mm_storeu_si128(EK_mm + 6, K6);
275 _mm_storeu_si128(EK_mm + 7, K7);
276 _mm_storeu_si128(EK_mm + 8, K8);
277 _mm_storeu_si128(EK_mm + 9, K9);
278 _mm_storeu_si128(EK_mm + 10, K10);
// Decryption schedule: round keys reversed, with InvMixColumns (aesimc)
// applied to the inner keys as required by aesdec's equivalent inverse
// cipher; the first and last keys are used unchanged.
282 __m128i* DK_mm =
reinterpret_cast<__m128i*
>(m_DK.data());
283 _mm_storeu_si128(DK_mm, K10);
284 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9));
285 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8));
286 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7));
287 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6));
288 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5));
289 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4));
290 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3));
291 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2));
292 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1));
293 _mm_storeu_si128(DK_mm + 10, K0);
// Encrypts `blocks` 16-byte blocks with AES-192 (12 rounds) using AES-NI.
//
// NOTE(review): fragment — the round-key loads (K0..K12 from m_EK), the
// 4-block input loads and the while-loop over groups of four blocks are
// elided from this view.
299BOTAN_FUNC_ISA(
"ssse3,aes") void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// 4-way parallel encryption: AddRoundKey, eleven full rounds, final round
// without MixColumns.
320 keyxor(K0, B0, B1, B2, B3);
321 aesenc(K1, B0, B1, B2, B3);
322 aesenc(K2, B0, B1, B2, B3);
323 aesenc(K3, B0, B1, B2, B3);
324 aesenc(K4, B0, B1, B2, B3);
325 aesenc(K5, B0, B1, B2, B3);
326 aesenc(K6, B0, B1, B2, B3);
327 aesenc(K7, B0, B1, B2, B3);
328 aesenc(K8, B0, B1, B2, B3);
329 aesenc(K9, B0, B1, B2, B3);
330 aesenc(K10, B0, B1, B2, B3);
331 aesenc(K11, B0, B1, B2, B3);
332 aesenclast(K12, B0, B1, B2, B3);
// Write the four ciphertext blocks back in order.
334 B0.store_le(out + 16 * 0);
335 B1.store_le(out + 16 * 1);
336 B2.store_le(out + 16 * 2);
337 B3.store_le(out + 16 * 3);
// Tail loop: remaining (< 4) blocks one at a time.
344 for(
size_t i = 0; i != blocks; ++i) {
362 B0.store_le(out + 16 * i);
// Decrypts `blocks` 16-byte blocks with AES-192 (12 rounds) using AES-NI.
//
// NOTE(review): fragment — the round-key loads (K0..K12 from m_DK), the
// 4-block input loads and the while-loop over groups of four blocks are
// elided from this view.
369BOTAN_FUNC_ISA(
"ssse3,aes") void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// 4-way parallel inverse cipher: AddRoundKey, eleven aesdec rounds, final
// aesdeclast round.
390 keyxor(K0, B0, B1, B2, B3);
391 aesdec(K1, B0, B1, B2, B3);
392 aesdec(K2, B0, B1, B2, B3);
393 aesdec(K3, B0, B1, B2, B3);
394 aesdec(K4, B0, B1, B2, B3);
395 aesdec(K5, B0, B1, B2, B3);
396 aesdec(K6, B0, B1, B2, B3);
397 aesdec(K7, B0, B1, B2, B3);
398 aesdec(K8, B0, B1, B2, B3);
399 aesdec(K9, B0, B1, B2, B3);
400 aesdec(K10, B0, B1, B2, B3);
401 aesdec(K11, B0, B1, B2, B3);
402 aesdeclast(K12, B0, B1, B2, B3);
// Write the four plaintext blocks back in order.
404 B0.store_le(out + 16 * 0);
405 B1.store_le(out + 16 * 1);
406 B2.store_le(out + 16 * 2);
407 B3.store_le(out + 16 * 3);
// Tail loop: remaining (< 4) blocks one at a time.
414 for(
size_t i = 0; i != blocks; ++i) {
432 B0.store_le(out + 16 * i);
// Builds the AES-192 key schedule with AES-NI. The 24-byte key is loaded
// into K0 (bytes 0..15) and the low half of K1 (bytes 16..23); each call
// to aes_192_key_expansion then produces six more 32-bit words of m_EK.
// The key-length parameter is unused (AES-192 keys are always 24 bytes).
//
// NOTE(review): the lines sizing m_EK/m_DK and storing the initial
// K0/K1-derived words into m_EK[0..5] are elided from this view — the
// expansion calls below start at offset 6, so those words must be written
// elsewhere; confirm against the full source.
439BOTAN_FUNC_ISA(
"ssse3,aes") void AES_192::aesni_key_schedule(const uint8_t key[],
size_t ) {
443 __m128i K0 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key));
// Load bytes 8..23, then shift right by 8 bytes so the low 64 bits hold
// key bytes 16..23 (an unaligned 8-byte load without reading past the key
// would not fill a full __m128i).
444 __m128i K1 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key + 8));
445 K1 = _mm_srli_si128(K1, 8);
// Eight expansion steps, six words each, rcon 01..80.
449 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x01), m_EK, 6);
450 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x02), m_EK, 12);
451 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x04), m_EK, 18);
452 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x08), m_EK, 24);
453 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x10), m_EK, 30);
454 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x20), m_EK, 36);
455 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x40), m_EK, 42);
456 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x80), m_EK, 48);
// Decryption schedule: the 13 encryption round keys reversed, with
// InvMixColumns (aesimc) applied to the inner eleven.
459 const __m128i* EK_mm =
reinterpret_cast<const __m128i*
>(m_EK.data());
461 __m128i* DK_mm =
reinterpret_cast<__m128i*
>(m_DK.data());
462 _mm_storeu_si128(DK_mm, _mm_loadu_si128(EK_mm + 12));
463 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
464 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
465 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
466 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
467 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
468 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
469 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
470 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
471 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
472 _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
473 _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
474 _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
// Encrypts `blocks` 16-byte blocks with AES-256 (14 rounds) using AES-NI.
//
// NOTE(review): fragment — the round-key loads (K0..K14 from m_EK), the
// 4-block input loads and the while-loop over groups of four blocks are
// elided from this view.
480BOTAN_FUNC_ISA(
"ssse3,aes") void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// 4-way parallel encryption: AddRoundKey, thirteen full rounds, final
// round without MixColumns.
503 keyxor(K0, B0, B1, B2, B3);
504 aesenc(K1, B0, B1, B2, B3);
505 aesenc(K2, B0, B1, B2, B3);
506 aesenc(K3, B0, B1, B2, B3);
507 aesenc(K4, B0, B1, B2, B3);
508 aesenc(K5, B0, B1, B2, B3);
509 aesenc(K6, B0, B1, B2, B3);
510 aesenc(K7, B0, B1, B2, B3);
511 aesenc(K8, B0, B1, B2, B3);
512 aesenc(K9, B0, B1, B2, B3);
513 aesenc(K10, B0, B1, B2, B3);
514 aesenc(K11, B0, B1, B2, B3);
515 aesenc(K12, B0, B1, B2, B3);
516 aesenc(K13, B0, B1, B2, B3);
517 aesenclast(K14, B0, B1, B2, B3);
// Write the four ciphertext blocks back in order.
519 B0.store_le(out + 16 * 0);
520 B1.store_le(out + 16 * 1);
521 B2.store_le(out + 16 * 2);
522 B3.store_le(out + 16 * 3);
// Tail loop: remaining (< 4) blocks one at a time.
529 for(
size_t i = 0; i != blocks; ++i) {
549 B0.store_le(out + 16 * i);
// Decrypts `blocks` 16-byte blocks with AES-256 (14 rounds) using AES-NI.
//
// NOTE(review): fragment — the round-key loads (K0..K14 from m_DK), the
// 4-block input loads and the while-loop over groups of four blocks are
// elided from this view.
556BOTAN_FUNC_ISA(
"ssse3,aes") void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// 4-way parallel inverse cipher: AddRoundKey, thirteen aesdec rounds,
// final aesdeclast round.
579 keyxor(K0, B0, B1, B2, B3);
580 aesdec(K1, B0, B1, B2, B3);
581 aesdec(K2, B0, B1, B2, B3);
582 aesdec(K3, B0, B1, B2, B3);
583 aesdec(K4, B0, B1, B2, B3);
584 aesdec(K5, B0, B1, B2, B3);
585 aesdec(K6, B0, B1, B2, B3);
586 aesdec(K7, B0, B1, B2, B3);
587 aesdec(K8, B0, B1, B2, B3);
588 aesdec(K9, B0, B1, B2, B3);
589 aesdec(K10, B0, B1, B2, B3);
590 aesdec(K11, B0, B1, B2, B3);
591 aesdec(K12, B0, B1, B2, B3);
592 aesdec(K13, B0, B1, B2, B3);
593 aesdeclast(K14, B0, B1, B2, B3);
// Write the four plaintext blocks back in order.
595 B0.store_le(out + 16 * 0);
596 B1.store_le(out + 16 * 1);
597 B2.store_le(out + 16 * 2);
598 B3.store_le(out + 16 * 3);
// Tail loop: remaining (< 4) blocks one at a time.
605 for(
size_t i = 0; i != blocks; ++i) {
625 B0.store_le(out + 16 * i);
// Builds the AES-256 key schedule with AES-NI. The 32-byte key loads into
// K0/K1; the even-indexed round keys are derived with
// aes_128_key_expansion<RC> (SubWord+RotWord+rcon, rcon doubling 01..40)
// and the odd-indexed ones with aes_256_key_expansion (SubWord only, no
// rcon), alternating as FIPS-197 requires for Nk = 8. The key-length
// parameter is unused (AES-256 keys are always 32 bytes).
//
// NOTE(review): the lines sizing m_EK/m_DK (60 words each would be needed
// for the 15 round keys stored below) and the closing brace are elided
// from this view — confirm the buffers are allocated before these stores.
632BOTAN_FUNC_ISA(
"ssse3,aes") void AES_256::aesni_key_schedule(const uint8_t key[],
size_t ) {
// Load the two halves of the 32-byte user key (unaligned loads).
636 const __m128i K0 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key));
637 const __m128i K1 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key + 16));
639 const __m128i
K2 = aes_128_key_expansion<0x01>(K0, K1);
640 const __m128i K3 = aes_256_key_expansion(K1, K2);
642 const __m128i K4 = aes_128_key_expansion<0x02>(K2, K3);
643 const __m128i K5 = aes_256_key_expansion(K3, K4);
645 const __m128i K6 = aes_128_key_expansion<0x04>(K4, K5);
646 const __m128i K7 = aes_256_key_expansion(K5, K6);
648 const __m128i K8 = aes_128_key_expansion<0x08>(K6, K7);
649 const __m128i K9 = aes_256_key_expansion(K7, K8);
651 const __m128i K10 = aes_128_key_expansion<0x10>(K8, K9);
652 const __m128i K11 = aes_256_key_expansion(K9, K10);
654 const __m128i K12 = aes_128_key_expansion<0x20>(K10, K11);
655 const __m128i K13 = aes_256_key_expansion(K11, K12);
657 const __m128i K14 = aes_128_key_expansion<0x40>(K12, K13);
// Encryption schedule: round keys in forward order.
659 __m128i* EK_mm =
reinterpret_cast<__m128i*
>(m_EK.data());
660 _mm_storeu_si128(EK_mm, K0);
661 _mm_storeu_si128(EK_mm + 1, K1);
662 _mm_storeu_si128(EK_mm + 2, K2);
663 _mm_storeu_si128(EK_mm + 3, K3);
664 _mm_storeu_si128(EK_mm + 4, K4);
665 _mm_storeu_si128(EK_mm + 5, K5);
666 _mm_storeu_si128(EK_mm + 6, K6);
667 _mm_storeu_si128(EK_mm + 7, K7);
668 _mm_storeu_si128(EK_mm + 8, K8);
669 _mm_storeu_si128(EK_mm + 9, K9);
670 _mm_storeu_si128(EK_mm + 10, K10);
671 _mm_storeu_si128(EK_mm + 11, K11);
672 _mm_storeu_si128(EK_mm + 12, K12);
673 _mm_storeu_si128(EK_mm + 13, K13);
674 _mm_storeu_si128(EK_mm + 14, K14);
// Decryption schedule: round keys reversed, with InvMixColumns (aesimc)
// applied to the inner thirteen, as required by aesdec.
677 __m128i* DK_mm =
reinterpret_cast<__m128i*
>(m_DK.data());
678 _mm_storeu_si128(DK_mm, K14);
679 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13));
680 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12));
681 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11));
682 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10));
683 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9));
684 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8));
685 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7));
686 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6));
687 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5));
688 _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
689 _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
690 _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
691 _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
692 _mm_storeu_si128(DK_mm + 14, K0);
void store_le(uint32_t out[4]) const noexcept
static SIMD_4x32 load_le(const void *in) noexcept
#define BOTAN_FUNC_ISA(isa)
#define BOTAN_FORCE_INLINE
#define BOTAN_FUNC_ISA_INLINE(isa)
constexpr auto load_le(ParamTs &&... params)
std::vector< T, secure_allocator< T > > secure_vector