8#include <botan/internal/aes.h>
10#include <botan/internal/isa_extn.h>
11#include <botan/internal/loadstor.h>
12#include <botan/internal/simd_4x32.h>
22BOTAN_FN_ISA_AESNI
// Derive the next AES-128 round key from the previous round key.
// NOTE(review): the template header declaring RC (the round constant) and the
// closing brace are not visible in this excerpt — callers below instantiate
// aes_128_key_expansion<0x01>..<0x36>, so RC is presumably a template parameter.
inline __m128i aes_128_key_expansion(__m128i key, __m128i key_getting_rcon) {
   // AESKEYGENASSIST computes SubWord/RotWord of key_getting_rcon's words XOR RC;
   // broadcasting lane 3 gives RotWord(SubWord(w[3])) ^ RC in all four lanes
23   __m128i key_with_rcon = _mm_aeskeygenassist_si128(key_getting_rcon, RC);
24   key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3, 3, 3, 3));
   // Three shift-and-XOR steps form the running XOR of the previous round key's
   // words: after this, 32-bit word i holds w[0] ^ ... ^ w[i]
25   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
26   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
27   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   // XOR in the rcon/SubWord term to finish all four words of the new round key
28   return _mm_xor_si128(key, key_with_rcon);
// One step of the AES-192 key schedule: derives six 32-bit key words and writes
// them into out[offset..offset+5].
// NOTE(review): the parameter list and several leading statements are missing
// from this excerpt; the callers below pass (&K0, &K1, keygenassist result,
// m_EK, offset), so key1/key2 are presumably accessed through those pointers.
32void aes_192_key_expansion(
   // Broadcast lane 1 of the AESKEYGENASSIST result (the rotated/substituted
   // word needed for this step) into all four lanes
37   key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1, 1, 1, 1));
   // Running XOR of key1's four words (word i becomes w[0] ^ ... ^ w[i])
38   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
39   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
40   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
41   key1 = _mm_xor_si128(key1, key2_with_rcon);
   // Store the first four of the six new key words
44   _mm_storeu_si128(
reinterpret_cast<__m128i*
>(&key1);
   // (review: the line above appears truncated in this excerpt)
50   key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
   // Fold in the last word of the freshly computed key1
51   key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3, 3, 3, 3)));
   // Store the remaining two key words from key2's low 64 bits
54   out[offset + 4] = _mm_cvtsi128_si32(key2);
55   out[offset + 5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
// Even-round step of the AES-256 key schedule: these rounds use SubWord only
// (no RotWord, rcon = 0x00), so AESKEYGENASSIST is called with rcon 0 and lane
// 2 of its result — SubWord(w[3]) of key2 — is broadcast to all lanes.
// NOTE(review): the closing brace is not visible in this excerpt.
61BOTAN_FN_ISA_AESNI __m128i aes_256_key_expansion(__m128i key, __m128i key2) {
62   __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
63   key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2, 2, 2, 2));
   // Running XOR of key's four words, as in the other expansion helpers
65   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
66   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
67   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
68   return _mm_xor_si128(key, key_with_rcon);
   // ------------------------------------------------------------------------
   // Fragments of the SIMD round helpers (the enclosing function signatures,
   // braces, and BOTAN_FN_ISA_AESNI attributes are not visible in this
   // excerpt). Each applies one AES round instruction with round key K to
   // either a single block B or four blocks B0..B3 in parallel.
   // ------------------------------------------------------------------------
   // AESENC: one forward round on a single block
79   B =
SIMD_4x32(_mm_aesenc_si128(B.raw(), K.raw()));
   // AESENC on four blocks
84   B0 =
SIMD_4x32(_mm_aesenc_si128(B0.raw(), K.raw()));
85   B1 =
SIMD_4x32(_mm_aesenc_si128(B1.raw(), K.raw()));
86   B2 =
SIMD_4x32(_mm_aesenc_si128(B2.raw(), K.raw()));
87   B3 =
SIMD_4x32(_mm_aesenc_si128(B3.raw(), K.raw()));
   // AESENCLAST: final forward round (no MixColumns), single block
91   B =
SIMD_4x32(_mm_aesenclast_si128(B.raw(), K.raw()));
   // AESENCLAST on four blocks
96   B0 =
SIMD_4x32(_mm_aesenclast_si128(B0.raw(), K.raw()));
97   B1 =
SIMD_4x32(_mm_aesenclast_si128(B1.raw(), K.raw()));
98   B2 =
SIMD_4x32(_mm_aesenclast_si128(B2.raw(), K.raw()));
99   B3 =
SIMD_4x32(_mm_aesenclast_si128(B3.raw(), K.raw()));
   // AESDEC: one inverse round on a single block
103   B =
SIMD_4x32(_mm_aesdec_si128(B.raw(), K.raw()));
   // AESDEC on four blocks
108   B0 =
SIMD_4x32(_mm_aesdec_si128(B0.raw(), K.raw()));
109   B1 =
SIMD_4x32(_mm_aesdec_si128(B1.raw(), K.raw()));
110   B2 =
SIMD_4x32(_mm_aesdec_si128(B2.raw(), K.raw()));
111   B3 =
SIMD_4x32(_mm_aesdec_si128(B3.raw(), K.raw()));
   // AESDECLAST: final inverse round, single block
115   B =
SIMD_4x32(_mm_aesdeclast_si128(B.raw(), K.raw()));
   // AESDECLAST on four blocks
120   B0 =
SIMD_4x32(_mm_aesdeclast_si128(B0.raw(), K.raw()));
121   B1 =
SIMD_4x32(_mm_aesdeclast_si128(B1.raw(), K.raw()));
122   B2 =
SIMD_4x32(_mm_aesdeclast_si128(B2.raw(), K.raw()));
123   B3 =
SIMD_4x32(_mm_aesdeclast_si128(B3.raw(), K.raw()));
133BOTAN_FN_ISA_AESNI
// Encrypt `blocks` 16-byte blocks from `in` to `out` using AES-NI.
// NOTE(review): this excerpt omits the round-key loads (K0..K10, presumably
// from m_EK), the block loads from `in`, and the 4-block loop control — only
// the round sequence and the output stores are visible.
void AES_128::hw_aes_encrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
   // 4-blocks-in-parallel path: initial key whitening, nine full rounds,
   // then the final (no-MixColumns) round — 10 rounds total for AES-128
152   keyxor(K0, B0, B1, B2, B3);
153   aesenc(K1, B0, B1, B2, B3);
154   aesenc(K2, B0, B1, B2, B3);
155   aesenc(K3, B0, B1, B2, B3);
156   aesenc(K4, B0, B1, B2, B3);
157   aesenc(K5, B0, B1, B2, B3);
158   aesenc(K6, B0, B1, B2, B3);
159   aesenc(K7, B0, B1, B2, B3);
160   aesenc(K8, B0, B1, B2, B3);
161   aesenc(K9, B0, B1, B2, B3);
162   aesenclast(K10, B0, B1, B2, B3);
   // Write the four ciphertext blocks little-endian
164   B0.store_le(out + 16 * 0);
165   B1.store_le(out + 16 * 1);
166   B2.store_le(out + 16 * 2);
167   B3.store_le(out + 16 * 3);
   // Tail path: remaining blocks handled one at a time (loop body elided here)
174   for(
size_t i = 0; i != blocks; ++i) {
189   B0.store_le(out + 16 * i);
196BOTAN_FN_ISA_AESNI
// Decrypt `blocks` 16-byte blocks from `in` to `out` using AES-NI.
// NOTE(review): this excerpt omits the round-key loads (K0..K10, presumably
// from m_DK), the block loads from `in`, and the 4-block loop control.
void AES_128::hw_aes_decrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
   // 4-blocks-in-parallel path: whitening with the first decryption key,
   // nine AESDEC rounds, then the final AESDECLAST round
215   keyxor(K0, B0, B1, B2, B3);
216   aesdec(K1, B0, B1, B2, B3);
217   aesdec(K2, B0, B1, B2, B3);
218   aesdec(K3, B0, B1, B2, B3);
219   aesdec(K4, B0, B1, B2, B3);
220   aesdec(K5, B0, B1, B2, B3);
221   aesdec(K6, B0, B1, B2, B3);
222   aesdec(K7, B0, B1, B2, B3);
223   aesdec(K8, B0, B1, B2, B3);
224   aesdec(K9, B0, B1, B2, B3);
225   aesdeclast(K10, B0, B1, B2, B3);
   // Write the four plaintext blocks little-endian
227   B0.store_le(out + 16 * 0);
228   B1.store_le(out + 16 * 1);
229   B2.store_le(out + 16 * 2);
230   B3.store_le(out + 16 * 3);
   // Tail path: remaining blocks one at a time (loop body elided here)
237   for(
size_t i = 0; i != blocks; ++i) {
252   B0.store_le(out + 16 * i);
259BOTAN_FN_ISA_AESNI
// Expand a 16-byte AES-128 key into the 11 encryption round keys (m_EK) and
// the 11 decryption round keys (m_DK). The unnamed size_t parameter (key
// length) is unused here; the key is taken to be 16 bytes.
void AES_128::aesni_key_schedule(
const uint8_t key[],
size_t ) {
   // Round keys K1..K10 are derived in sequence with the standard AES-128
   // round constants: 0x01,0x02,...,0x80,0x1B,0x36
265   const __m128i K0 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key));
266   const __m128i
K1 = aes_128_key_expansion<0x01>(K0, K0);
267   const __m128i
K2 = aes_128_key_expansion<0x02>(K1, K1);
268   const __m128i
K3 = aes_128_key_expansion<0x04>(K2, K2);
269   const __m128i
K4 = aes_128_key_expansion<0x08>(K3, K3);
270   const __m128i K5 = aes_128_key_expansion<0x10>(K4, K4);
271   const __m128i K6 = aes_128_key_expansion<0x20>(K5, K5);
272   const __m128i K7 = aes_128_key_expansion<0x40>(K6, K6);
273   const __m128i K8 = aes_128_key_expansion<0x80>(K7, K7);
274   const __m128i K9 = aes_128_key_expansion<0x1B>(K8, K8);
275   const __m128i K10 = aes_128_key_expansion<0x36>(K9, K9);
   // Store the encryption schedule in forward order
277   __m128i* EK_mm =
reinterpret_cast<__m128i*
>(m_EK.data());
278   _mm_storeu_si128(EK_mm, K0);
279   _mm_storeu_si128(EK_mm + 1, K1);
280   _mm_storeu_si128(EK_mm + 2, K2);
281   _mm_storeu_si128(EK_mm + 3, K3);
282   _mm_storeu_si128(EK_mm + 4, K4);
283   _mm_storeu_si128(EK_mm + 5, K5);
284   _mm_storeu_si128(EK_mm + 6, K6);
285   _mm_storeu_si128(EK_mm + 7, K7);
286   _mm_storeu_si128(EK_mm + 8, K8);
287   _mm_storeu_si128(EK_mm + 9, K9);
288   _mm_storeu_si128(EK_mm + 10, K10);
   // Decryption schedule: round keys in reverse order, with AESIMC
   // (InvMixColumns) applied to the nine inner keys; first and last are copied
   // unchanged, as required by the AESDEC equivalent-inverse round sequence
292   __m128i* DK_mm =
reinterpret_cast<__m128i*
>(m_DK.data());
293   _mm_storeu_si128(DK_mm, K10);
294   _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9));
295   _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8));
296   _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7));
297   _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6));
298   _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5));
299   _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4));
300   _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3));
301   _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2));
302   _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1));
303   _mm_storeu_si128(DK_mm + 10, K0);
311BOTAN_FN_ISA_AESNI
// Encrypt `blocks` 16-byte blocks with AES-192 (12 rounds, keys K0..K12).
// NOTE(review): round-key loads, input-block loads, and the 4-block loop
// control are not visible in this excerpt.
void AES_192::hw_aes_encrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
   // 4-blocks-in-parallel path: whitening, eleven full rounds, final round
332   keyxor(K0, B0, B1, B2, B3);
333   aesenc(K1, B0, B1, B2, B3);
334   aesenc(K2, B0, B1, B2, B3);
335   aesenc(K3, B0, B1, B2, B3);
336   aesenc(K4, B0, B1, B2, B3);
337   aesenc(K5, B0, B1, B2, B3);
338   aesenc(K6, B0, B1, B2, B3);
339   aesenc(K7, B0, B1, B2, B3);
340   aesenc(K8, B0, B1, B2, B3);
341   aesenc(K9, B0, B1, B2, B3);
342   aesenc(K10, B0, B1, B2, B3);
343   aesenc(K11, B0, B1, B2, B3);
344   aesenclast(K12, B0, B1, B2, B3);
   // Write the four ciphertext blocks
346   B0.store_le(out + 16 * 0);
347   B1.store_le(out + 16 * 1);
348   B2.store_le(out + 16 * 2);
349   B3.store_le(out + 16 * 3);
   // Tail path: one block per iteration (loop body elided here)
356   for(
size_t i = 0; i != blocks; ++i) {
374   B0.store_le(out + 16 * i);
381BOTAN_FN_ISA_AESNI
// Decrypt `blocks` 16-byte blocks with AES-192 (12 rounds, keys K0..K12).
// NOTE(review): round-key loads, input-block loads, and the 4-block loop
// control are not visible in this excerpt.
void AES_192::hw_aes_decrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
   // 4-blocks-in-parallel path: whitening, eleven AESDEC rounds, AESDECLAST
402   keyxor(K0, B0, B1, B2, B3);
403   aesdec(K1, B0, B1, B2, B3);
404   aesdec(K2, B0, B1, B2, B3);
405   aesdec(K3, B0, B1, B2, B3);
406   aesdec(K4, B0, B1, B2, B3);
407   aesdec(K5, B0, B1, B2, B3);
408   aesdec(K6, B0, B1, B2, B3);
409   aesdec(K7, B0, B1, B2, B3);
410   aesdec(K8, B0, B1, B2, B3);
411   aesdec(K9, B0, B1, B2, B3);
412   aesdec(K10, B0, B1, B2, B3);
413   aesdec(K11, B0, B1, B2, B3);
414   aesdeclast(K12, B0, B1, B2, B3);
   // Write the four plaintext blocks
416   B0.store_le(out + 16 * 0);
417   B1.store_le(out + 16 * 1);
418   B2.store_le(out + 16 * 2);
419   B3.store_le(out + 16 * 3);
   // Tail path: one block per iteration (loop body elided here)
426   for(
size_t i = 0; i != blocks; ++i) {
444   B0.store_le(out + 16 * i);
451BOTAN_FN_ISA_AESNI
// Expand a 24-byte AES-192 key into 13 encryption round keys (m_EK, stored as
// 32-bit words) and the corresponding decryption keys (m_DK).
void AES_192::aesni_key_schedule(
const uint8_t key[],
size_t ) {
   // K0 = key bytes 0..15. K1 is loaded from key+8 and shifted right 8 bytes
   // so that its LOW 64 bits hold key bytes 16..23 (avoids reading past the
   // end of the 24-byte key buffer).
457   __m128i K0 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key));
458   __m128i
K1 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key + 8));
459   K1 = _mm_srli_si128(K1, 8);
   // Eight expansion steps, each producing six 32-bit key words, with round
   // constants 0x01..0x80 — 6 + 8*6 = 54 words total (52 used for 13 keys)
463   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x01), m_EK, 6);
464   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x02), m_EK, 12);
465   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x04), m_EK, 18);
466   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x08), m_EK, 24);
467   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x10), m_EK, 30);
468   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x20), m_EK, 36);
469   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x40), m_EK, 42);
470   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x80), m_EK, 48);
   // Build the decryption schedule: encryption keys in reverse order with
   // AESIMC applied to the eleven inner keys; first and last copied unchanged
473   const __m128i* EK_mm =
reinterpret_cast<const __m128i*
>(m_EK.data());
475   __m128i* DK_mm =
reinterpret_cast<__m128i*
>(m_DK.data());
476   _mm_storeu_si128(DK_mm, _mm_loadu_si128(EK_mm + 12));
477   _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
478   _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
479   _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
480   _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
481   _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
482   _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
483   _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
484   _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
485   _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
486   _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
487   _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
488   _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
496BOTAN_FN_ISA_AESNI
// Encrypt `blocks` 16-byte blocks with AES-256 (14 rounds, keys K0..K14).
// NOTE(review): round-key loads, input-block loads, and the 4-block loop
// control are not visible in this excerpt.
void AES_256::hw_aes_encrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
   // 4-blocks-in-parallel path: whitening, thirteen full rounds, final round
519   keyxor(K0, B0, B1, B2, B3);
520   aesenc(K1, B0, B1, B2, B3);
521   aesenc(K2, B0, B1, B2, B3);
522   aesenc(K3, B0, B1, B2, B3);
523   aesenc(K4, B0, B1, B2, B3);
524   aesenc(K5, B0, B1, B2, B3);
525   aesenc(K6, B0, B1, B2, B3);
526   aesenc(K7, B0, B1, B2, B3);
527   aesenc(K8, B0, B1, B2, B3);
528   aesenc(K9, B0, B1, B2, B3);
529   aesenc(K10, B0, B1, B2, B3);
530   aesenc(K11, B0, B1, B2, B3);
531   aesenc(K12, B0, B1, B2, B3);
532   aesenc(K13, B0, B1, B2, B3);
533   aesenclast(K14, B0, B1, B2, B3);
   // Write the four ciphertext blocks
535   B0.store_le(out + 16 * 0);
536   B1.store_le(out + 16 * 1);
537   B2.store_le(out + 16 * 2);
538   B3.store_le(out + 16 * 3);
   // Tail path: one block per iteration (loop body elided here)
545   for(
size_t i = 0; i != blocks; ++i) {
565   B0.store_le(out + 16 * i);
572BOTAN_FN_ISA_AESNI
// Decrypt `blocks` 16-byte blocks with AES-256 (14 rounds, keys K0..K14).
// NOTE(review): round-key loads, input-block loads, and the 4-block loop
// control are not visible in this excerpt.
void AES_256::hw_aes_decrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
   // 4-blocks-in-parallel path: whitening, thirteen AESDEC rounds, AESDECLAST
595   keyxor(K0, B0, B1, B2, B3);
596   aesdec(K1, B0, B1, B2, B3);
597   aesdec(K2, B0, B1, B2, B3);
598   aesdec(K3, B0, B1, B2, B3);
599   aesdec(K4, B0, B1, B2, B3);
600   aesdec(K5, B0, B1, B2, B3);
601   aesdec(K6, B0, B1, B2, B3);
602   aesdec(K7, B0, B1, B2, B3);
603   aesdec(K8, B0, B1, B2, B3);
604   aesdec(K9, B0, B1, B2, B3);
605   aesdec(K10, B0, B1, B2, B3);
606   aesdec(K11, B0, B1, B2, B3);
607   aesdec(K12, B0, B1, B2, B3);
608   aesdec(K13, B0, B1, B2, B3);
609   aesdeclast(K14, B0, B1, B2, B3);
   // Write the four plaintext blocks
611   B0.store_le(out + 16 * 0);
612   B1.store_le(out + 16 * 1);
613   B2.store_le(out + 16 * 2);
614   B3.store_le(out + 16 * 3);
   // Tail path: one block per iteration (loop body elided here)
621   for(
size_t i = 0; i != blocks; ++i) {
641   B0.store_le(out + 16 * i);
648BOTAN_FN_ISA_AESNI
// Expand a 32-byte AES-256 key into 15 encryption round keys (m_EK) and the
// corresponding decryption keys (m_DK). Odd steps use aes_128_key_expansion
// with the round constant (RotWord+SubWord); even steps use
// aes_256_key_expansion (SubWord only, rcon 0).
void AES_256::aesni_key_schedule(
const uint8_t key[],
size_t ) {
   // K0/K1 are the two halves of the 256-bit key itself
654   const __m128i K0 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key));
655   const __m128i
K1 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key + 16));
   // Alternate rcon steps (0x01..0x40) with SubWord-only steps
657   const __m128i
K2 = aes_128_key_expansion<0x01>(K0, K1);
658   const __m128i
K3 = aes_256_key_expansion(K1, K2);
660   const __m128i
K4 = aes_128_key_expansion<0x02>(K2, K3);
661   const __m128i K5 = aes_256_key_expansion(K3, K4);
663   const __m128i K6 = aes_128_key_expansion<0x04>(K4, K5);
664   const __m128i K7 = aes_256_key_expansion(K5, K6);
666   const __m128i K8 = aes_128_key_expansion<0x08>(K6, K7);
667   const __m128i K9 = aes_256_key_expansion(K7, K8);
669   const __m128i K10 = aes_128_key_expansion<0x10>(K8, K9);
670   const __m128i K11 = aes_256_key_expansion(K9, K10);
672   const __m128i K12 = aes_128_key_expansion<0x20>(K10, K11);
673   const __m128i K13 = aes_256_key_expansion(K11, K12);
675   const __m128i K14 = aes_128_key_expansion<0x40>(K12, K13);
   // Store the encryption schedule in forward order
677   __m128i* EK_mm =
reinterpret_cast<__m128i*
>(m_EK.data());
678   _mm_storeu_si128(EK_mm, K0);
679   _mm_storeu_si128(EK_mm + 1, K1);
680   _mm_storeu_si128(EK_mm + 2, K2);
681   _mm_storeu_si128(EK_mm + 3, K3);
682   _mm_storeu_si128(EK_mm + 4, K4);
683   _mm_storeu_si128(EK_mm + 5, K5);
684   _mm_storeu_si128(EK_mm + 6, K6);
685   _mm_storeu_si128(EK_mm + 7, K7);
686   _mm_storeu_si128(EK_mm + 8, K8);
687   _mm_storeu_si128(EK_mm + 9, K9);
688   _mm_storeu_si128(EK_mm + 10, K10);
689   _mm_storeu_si128(EK_mm + 11, K11);
690   _mm_storeu_si128(EK_mm + 12, K12);
691   _mm_storeu_si128(EK_mm + 13, K13);
692   _mm_storeu_si128(EK_mm + 14, K14);
   // Decryption schedule: reversed order, AESIMC on the thirteen inner keys,
   // first and last copied unchanged
695   __m128i* DK_mm =
reinterpret_cast<__m128i*
>(m_DK.data());
696   _mm_storeu_si128(DK_mm, K14);
697   _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13));
698   _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12));
699   _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11));
700   _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10));
701   _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9));
702   _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8));
703   _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7));
704   _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6));
705   _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5));
706   _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
707   _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
708   _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
709   _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
710   _mm_storeu_si128(DK_mm + 14, K0);
static SIMD_4x32 load_le(const void *in) noexcept
#define BOTAN_FORCE_INLINE
constexpr auto load_le(ParamTs &&... params)
std::vector< T, secure_allocator< T > > secure_vector