8#include <botan/internal/aes.h>
10#include <botan/internal/isa_extn.h>
11#include <botan/internal/loadstor.h>
12#include <botan/internal/simd_4x32.h>
20BOTAN_FN_ISA_AESNI
inline __m128i aes_128_key_expansion(__m128i key, __m128i key_getting_rcon) {
21 __m128i key_with_rcon = _mm_aeskeygenassist_si128(key_getting_rcon, RC);
22 key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3, 3, 3, 3));
23 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
24 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
25 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
26 return _mm_xor_si128(key, key_with_rcon);
// aes_192_key_expansion (partial listing): only the opening of the signature
// and part of the body are visible. key1, key2, out and offset are introduced
// on lines that are not shown here.
30void aes_192_key_expansion(
// Broadcast 32-bit lane 1 of the incoming key2_with_rcon value to all lanes.
35 key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1, 1, 1, 1));
// Three shift-left-by-4-bytes/XOR steps: every 32-bit lane of key1 ends up as
// the XOR of itself and all lower lanes.
36 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
37 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
38 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
// Mix in the broadcast word.
39 key1 = _mm_xor_si128(key1, key2_with_rcon);
// Unaligned store of the 16 bytes of key1 starting at &out[offset].
42 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(&out[offset]), key1);
// key2: a single shift/XOR step, then XOR with lane 3 of the updated key1
// broadcast to all lanes.
48 key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
49 key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3, 3, 3, 3)));
// Write bits 0..31 and bits 32..63 of key2 to the two entries following key1.
52 out[offset + 4] = _mm_cvtsi128_si32(key2);
53 out[offset + 5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
59BOTAN_FN_ISA_AESNI __m128i aes_256_key_expansion(__m128i key, __m128i key2) {
60 __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
61 key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2, 2, 2, 2));
63 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
64 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
65 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
66 return _mm_xor_si128(key, key_with_rcon);
// One AES encryption round (_mm_aesenc_si128) with round key K.
// NOTE(review): the enclosing function signatures are not shown in this
// listing; presumably these are the bodies of the one-block and four-block
// aesenc helpers called by the hw_aes_encrypt_n functions -- confirm.
// Single-block form:
77 B =
SIMD_4x32(_mm_aesenc_si128(B.raw(), K.raw()));
// Four-block form: the same round applied to B0..B3 with one shared key.
82 B0 =
SIMD_4x32(_mm_aesenc_si128(B0.raw(), K.raw()));
83 B1 =
SIMD_4x32(_mm_aesenc_si128(B1.raw(), K.raw()));
84 B2 =
SIMD_4x32(_mm_aesenc_si128(B2.raw(), K.raw()));
85 B3 =
SIMD_4x32(_mm_aesenc_si128(B3.raw(), K.raw()));
// Final AES encryption round (_mm_aesenclast_si128) with round key K.
// NOTE(review): the enclosing function signatures are not shown in this
// listing; presumably these are the bodies of the one-block and four-block
// aesenclast helpers -- confirm.
// Single-block form:
89 B =
SIMD_4x32(_mm_aesenclast_si128(B.raw(), K.raw()));
// Four-block form: the same final round applied to B0..B3.
94 B0 =
SIMD_4x32(_mm_aesenclast_si128(B0.raw(), K.raw()));
95 B1 =
SIMD_4x32(_mm_aesenclast_si128(B1.raw(), K.raw()));
96 B2 =
SIMD_4x32(_mm_aesenclast_si128(B2.raw(), K.raw()));
97 B3 =
SIMD_4x32(_mm_aesenclast_si128(B3.raw(), K.raw()));
// One AES decryption round (_mm_aesdec_si128) with round key K.
// NOTE(review): the enclosing function signatures are not shown in this
// listing; presumably these are the bodies of the one-block and four-block
// aesdec helpers called by the hw_aes_decrypt_n functions -- confirm.
// Single-block form:
101 B =
SIMD_4x32(_mm_aesdec_si128(B.raw(), K.raw()));
// Four-block form: the same round applied to B0..B3 with one shared key.
106 B0 =
SIMD_4x32(_mm_aesdec_si128(B0.raw(), K.raw()));
107 B1 =
SIMD_4x32(_mm_aesdec_si128(B1.raw(), K.raw()));
108 B2 =
SIMD_4x32(_mm_aesdec_si128(B2.raw(), K.raw()));
109 B3 =
SIMD_4x32(_mm_aesdec_si128(B3.raw(), K.raw()));
// Final AES decryption round (_mm_aesdeclast_si128) with round key K.
// NOTE(review): the enclosing function signatures are not shown in this
// listing; presumably these are the bodies of the one-block and four-block
// aesdeclast helpers -- confirm.
// Single-block form:
113 B =
SIMD_4x32(_mm_aesdeclast_si128(B.raw(), K.raw()));
// Four-block form: the same final round applied to B0..B3.
118 B0 =
SIMD_4x32(_mm_aesdeclast_si128(B0.raw(), K.raw()));
119 B1 =
SIMD_4x32(_mm_aesdeclast_si128(B1.raw(), K.raw()));
120 B2 =
SIMD_4x32(_mm_aesdeclast_si128(B2.raw(), K.raw()));
121 B3 =
SIMD_4x32(_mm_aesdeclast_si128(B3.raw(), K.raw()));
// AES_128::hw_aes_encrypt_n (partial listing).
// Visible here: a four-block sequence (keyxor with K0, nine aesenc rounds with
// K1..K9, a final aesenclast with K10, then B0..B3 stored to out at 16-byte
// strides), followed by the head of a loop over `blocks` that stores B0 to
// out + 16 * i.
// NOTE(review): the declarations/loads of K0..K10 and B0..B3 and most of the
// loop body are not shown in this listing -- confirm against the full file.
129BOTAN_FN_ISA_AESNI
void AES_128::hw_aes_encrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// Four blocks at a time: K0 first, then rounds 1..9 and the final round.
148 keyxor(K0, B0, B1, B2, B3);
149 aesenc(K1, B0, B1, B2, B3);
150 aesenc(K2, B0, B1, B2, B3);
151 aesenc(K3, B0, B1, B2, B3);
152 aesenc(K4, B0, B1, B2, B3);
153 aesenc(K5, B0, B1, B2, B3);
154 aesenc(K6, B0, B1, B2, B3);
155 aesenc(K7, B0, B1, B2, B3);
156 aesenc(K8, B0, B1, B2, B3);
157 aesenc(K9, B0, B1, B2, B3);
158 aesenclast(K10, B0, B1, B2, B3);
// Write the four results to consecutive 16-byte slots of out.
160 B0.store_le(out + 16 * 0);
161 B1.store_le(out + 16 * 1);
162 B2.store_le(out + 16 * 2);
163 B3.store_le(out + 16 * 3);
// Per-block loop; only its head and final store are visible.
170 for(
size_t i = 0; i != blocks; ++i) {
185 B0.store_le(out + 16 * i);
// AES_128::hw_aes_decrypt_n (partial listing).
// Visible here: a four-block sequence (keyxor with K0, nine aesdec rounds with
// K1..K9, a final aesdeclast with K10, then B0..B3 stored to out at 16-byte
// strides), followed by the head of a loop over `blocks` that stores B0 to
// out + 16 * i.
// NOTE(review): the declarations/loads of K0..K10 and B0..B3 and most of the
// loop body are not shown in this listing -- confirm against the full file.
192BOTAN_FN_ISA_AESNI
void AES_128::hw_aes_decrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// Four blocks at a time: K0 first, then rounds 1..9 and the final round.
211 keyxor(K0, B0, B1, B2, B3);
212 aesdec(K1, B0, B1, B2, B3);
213 aesdec(K2, B0, B1, B2, B3);
214 aesdec(K3, B0, B1, B2, B3);
215 aesdec(K4, B0, B1, B2, B3);
216 aesdec(K5, B0, B1, B2, B3);
217 aesdec(K6, B0, B1, B2, B3);
218 aesdec(K7, B0, B1, B2, B3);
219 aesdec(K8, B0, B1, B2, B3);
220 aesdec(K9, B0, B1, B2, B3);
221 aesdeclast(K10, B0, B1, B2, B3);
// Write the four results to consecutive 16-byte slots of out.
223 B0.store_le(out + 16 * 0);
224 B1.store_le(out + 16 * 1);
225 B2.store_le(out + 16 * 2);
226 B3.store_le(out + 16 * 3);
// Per-block loop; only its head and final store are visible.
233 for(
size_t i = 0; i != blocks; ++i) {
248 B0.store_le(out + 16 * i);
// AES_128::aesni_key_schedule (partial listing): expands a 16-byte key into
// eleven round keys K0..K10, writes them to m_EK, and writes the decryption
// schedule to m_DK. The second parameter is unnamed and not used in the
// visible code.
// NOTE(review): lines between the signature and the first load are not shown
// in this listing (e.g. any sizing of m_EK/m_DK) -- confirm against the full file.
255BOTAN_FN_ISA_AESNI
void AES_128::aesni_key_schedule(
const uint8_t key[],
size_t ) {
// K0 is the raw key (unaligned 16-byte load).
259 const __m128i K0 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key));
// Each round key is derived from the previous one; the template argument is
// the round constant for that step (0x01, 0x02, ..., 0x80, 0x1B, 0x36).
260 const __m128i
K1 = aes_128_key_expansion<0x01>(K0, K0);
261 const __m128i
K2 = aes_128_key_expansion<0x02>(K1, K1);
262 const __m128i
K3 = aes_128_key_expansion<0x04>(K2, K2);
263 const __m128i
K4 = aes_128_key_expansion<0x08>(K3, K3);
264 const __m128i K5 = aes_128_key_expansion<0x10>(K4, K4);
265 const __m128i K6 = aes_128_key_expansion<0x20>(K5, K5);
266 const __m128i K7 = aes_128_key_expansion<0x40>(K6, K6);
267 const __m128i K8 = aes_128_key_expansion<0x80>(K7, K7);
268 const __m128i K9 = aes_128_key_expansion<0x1B>(K8, K8);
269 const __m128i K10 = aes_128_key_expansion<0x36>(K9, K9);
// Encryption schedule: K0..K10 stored in order, 16 bytes each.
271 __m128i* EK_mm =
reinterpret_cast<__m128i*
>(m_EK.data());
272 _mm_storeu_si128(EK_mm, K0);
273 _mm_storeu_si128(EK_mm + 1, K1);
274 _mm_storeu_si128(EK_mm + 2, K2);
275 _mm_storeu_si128(EK_mm + 3, K3);
276 _mm_storeu_si128(EK_mm + 4, K4);
277 _mm_storeu_si128(EK_mm + 5, K5);
278 _mm_storeu_si128(EK_mm + 6, K6);
279 _mm_storeu_si128(EK_mm + 7, K7);
280 _mm_storeu_si128(EK_mm + 8, K8);
281 _mm_storeu_si128(EK_mm + 9, K9);
282 _mm_storeu_si128(EK_mm + 10, K10);
// Decryption schedule: the round keys in reverse order. The first and last
// entries (K10, K0) are stored unchanged; K9..K1 go through _mm_aesimc_si128
// (InvMixColumns), the form the aesdec instruction expects.
286 __m128i* DK_mm =
reinterpret_cast<__m128i*
>(m_DK.data());
287 _mm_storeu_si128(DK_mm, K10);
288 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9));
289 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8));
290 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7));
291 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6));
292 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5));
293 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4));
294 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3));
295 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2));
296 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1));
297 _mm_storeu_si128(DK_mm + 10, K0);
// AES_192::hw_aes_encrypt_n (partial listing).
// Visible here: a four-block sequence (keyxor with K0, eleven aesenc rounds
// with K1..K11, a final aesenclast with K12, then B0..B3 stored to out at
// 16-byte strides), followed by the head of a loop over `blocks` that stores
// B0 to out + 16 * i.
// NOTE(review): the declarations/loads of K0..K12 and B0..B3 and most of the
// loop body are not shown in this listing -- confirm against the full file.
303BOTAN_FN_ISA_AESNI
void AES_192::hw_aes_encrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// Four blocks at a time: K0 first, then rounds 1..11 and the final round.
324 keyxor(K0, B0, B1, B2, B3);
325 aesenc(K1, B0, B1, B2, B3);
326 aesenc(K2, B0, B1, B2, B3);
327 aesenc(K3, B0, B1, B2, B3);
328 aesenc(K4, B0, B1, B2, B3);
329 aesenc(K5, B0, B1, B2, B3);
330 aesenc(K6, B0, B1, B2, B3);
331 aesenc(K7, B0, B1, B2, B3);
332 aesenc(K8, B0, B1, B2, B3);
333 aesenc(K9, B0, B1, B2, B3);
334 aesenc(K10, B0, B1, B2, B3);
335 aesenc(K11, B0, B1, B2, B3);
336 aesenclast(K12, B0, B1, B2, B3);
// Write the four results to consecutive 16-byte slots of out.
338 B0.store_le(out + 16 * 0);
339 B1.store_le(out + 16 * 1);
340 B2.store_le(out + 16 * 2);
341 B3.store_le(out + 16 * 3);
// Per-block loop; only its head and final store are visible.
348 for(
size_t i = 0; i != blocks; ++i) {
366 B0.store_le(out + 16 * i);
// AES_192::hw_aes_decrypt_n (partial listing).
// Visible here: a four-block sequence (keyxor with K0, eleven aesdec rounds
// with K1..K11, a final aesdeclast with K12, then B0..B3 stored to out at
// 16-byte strides), followed by the head of a loop over `blocks` that stores
// B0 to out + 16 * i.
// NOTE(review): the declarations/loads of K0..K12 and B0..B3 and most of the
// loop body are not shown in this listing -- confirm against the full file.
373BOTAN_FN_ISA_AESNI
void AES_192::hw_aes_decrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// Four blocks at a time: K0 first, then rounds 1..11 and the final round.
394 keyxor(K0, B0, B1, B2, B3);
395 aesdec(K1, B0, B1, B2, B3);
396 aesdec(K2, B0, B1, B2, B3);
397 aesdec(K3, B0, B1, B2, B3);
398 aesdec(K4, B0, B1, B2, B3);
399 aesdec(K5, B0, B1, B2, B3);
400 aesdec(K6, B0, B1, B2, B3);
401 aesdec(K7, B0, B1, B2, B3);
402 aesdec(K8, B0, B1, B2, B3);
403 aesdec(K9, B0, B1, B2, B3);
404 aesdec(K10, B0, B1, B2, B3);
405 aesdec(K11, B0, B1, B2, B3);
406 aesdeclast(K12, B0, B1, B2, B3);
// Write the four results to consecutive 16-byte slots of out.
408 B0.store_le(out + 16 * 0);
409 B1.store_le(out + 16 * 1);
410 B2.store_le(out + 16 * 2);
411 B3.store_le(out + 16 * 3);
// Per-block loop; only its head and final store are visible.
418 for(
size_t i = 0; i != blocks; ++i) {
436 B0.store_le(out + 16 * i);
// AES_192::aesni_key_schedule (partial listing): expands the key into m_EK via
// repeated aes_192_key_expansion calls, then derives the decryption schedule
// m_DK from m_EK. The second parameter is unnamed and not used in the visible
// code.
// NOTE(review): some lines are not shown in this listing (e.g. any sizing of
// m_EK/m_DK and how the first six words of m_EK are filled) -- confirm against
// the full file.
443BOTAN_FN_ISA_AESNI
void AES_192::aesni_key_schedule(
const uint8_t key[],
size_t ) {
// K0 = key bytes 0..15.
447 __m128i K0 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key));
// Load key bytes 8..23, then shift right by 8 bytes so that K1 holds key
// bytes 16..23 in its low half and zeros in its high half.
448 __m128i
K1 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key + 8));
449 K1 = _mm_srli_si128(K1, 8);
// Eight expansion steps with round constants 0x01..0x80; each call passes an
// m_EK offset that advances by 6 (6, 12, ..., 48).
453 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x01), m_EK, 6);
454 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x02), m_EK, 12);
455 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x04), m_EK, 18);
456 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x08), m_EK, 24);
457 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x10), m_EK, 30);
458 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x20), m_EK, 36);
459 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x40), m_EK, 42);
460 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x80), m_EK, 48);
// View m_EK as thirteen 16-byte round keys (indices 0..12).
463 const __m128i* EK_mm =
reinterpret_cast<const __m128i*
>(m_EK.data());
// Decryption schedule: the round keys in reverse order. Entries 0 and 12 are
// copied unchanged; entries 1..11 go through _mm_aesimc_si128 (InvMixColumns).
465 __m128i* DK_mm =
reinterpret_cast<__m128i*
>(m_DK.data());
466 _mm_storeu_si128(DK_mm, _mm_loadu_si128(EK_mm + 12));
467 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
468 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
469 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
470 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
471 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
472 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
473 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
474 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
475 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
476 _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
477 _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
478 _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
// AES_256::hw_aes_encrypt_n (partial listing).
// Visible here: a four-block sequence (keyxor with K0, thirteen aesenc rounds
// with K1..K13, a final aesenclast with K14, then B0..B3 stored to out at
// 16-byte strides), followed by the head of a loop over `blocks` that stores
// B0 to out + 16 * i.
// NOTE(review): the declarations/loads of K0..K14 and B0..B3 and most of the
// loop body are not shown in this listing -- confirm against the full file.
484BOTAN_FN_ISA_AESNI
void AES_256::hw_aes_encrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// Four blocks at a time: K0 first, then rounds 1..13 and the final round.
507 keyxor(K0, B0, B1, B2, B3);
508 aesenc(K1, B0, B1, B2, B3);
509 aesenc(K2, B0, B1, B2, B3);
510 aesenc(K3, B0, B1, B2, B3);
511 aesenc(K4, B0, B1, B2, B3);
512 aesenc(K5, B0, B1, B2, B3);
513 aesenc(K6, B0, B1, B2, B3);
514 aesenc(K7, B0, B1, B2, B3);
515 aesenc(K8, B0, B1, B2, B3);
516 aesenc(K9, B0, B1, B2, B3);
517 aesenc(K10, B0, B1, B2, B3);
518 aesenc(K11, B0, B1, B2, B3);
519 aesenc(K12, B0, B1, B2, B3);
520 aesenc(K13, B0, B1, B2, B3);
521 aesenclast(K14, B0, B1, B2, B3);
// Write the four results to consecutive 16-byte slots of out.
523 B0.store_le(out + 16 * 0);
524 B1.store_le(out + 16 * 1);
525 B2.store_le(out + 16 * 2);
526 B3.store_le(out + 16 * 3);
// Per-block loop; only its head and final store are visible.
533 for(
size_t i = 0; i != blocks; ++i) {
553 B0.store_le(out + 16 * i);
// AES_256::hw_aes_decrypt_n (partial listing).
// Visible here: a four-block sequence (keyxor with K0, thirteen aesdec rounds
// with K1..K13, a final aesdeclast with K14, then B0..B3 stored to out at
// 16-byte strides), followed by the head of a loop over `blocks` that stores
// B0 to out + 16 * i.
// NOTE(review): the declarations/loads of K0..K14 and B0..B3 and most of the
// loop body are not shown in this listing -- confirm against the full file.
560BOTAN_FN_ISA_AESNI
void AES_256::hw_aes_decrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// Four blocks at a time: K0 first, then rounds 1..13 and the final round.
583 keyxor(K0, B0, B1, B2, B3);
584 aesdec(K1, B0, B1, B2, B3);
585 aesdec(K2, B0, B1, B2, B3);
586 aesdec(K3, B0, B1, B2, B3);
587 aesdec(K4, B0, B1, B2, B3);
588 aesdec(K5, B0, B1, B2, B3);
589 aesdec(K6, B0, B1, B2, B3);
590 aesdec(K7, B0, B1, B2, B3);
591 aesdec(K8, B0, B1, B2, B3);
592 aesdec(K9, B0, B1, B2, B3);
593 aesdec(K10, B0, B1, B2, B3);
594 aesdec(K11, B0, B1, B2, B3);
595 aesdec(K12, B0, B1, B2, B3);
596 aesdec(K13, B0, B1, B2, B3);
597 aesdeclast(K14, B0, B1, B2, B3);
// Write the four results to consecutive 16-byte slots of out.
599 B0.store_le(out + 16 * 0);
600 B1.store_le(out + 16 * 1);
601 B2.store_le(out + 16 * 2);
602 B3.store_le(out + 16 * 3);
// Per-block loop; only its head and final store are visible.
609 for(
size_t i = 0; i != blocks; ++i) {
629 B0.store_le(out + 16 * i);
// AES_256::aesni_key_schedule (partial listing): expands a 32-byte key into
// fifteen round keys K0..K14, writes them to m_EK, and writes the decryption
// schedule to m_DK. The second parameter is unnamed and not used in the
// visible code.
// NOTE(review): lines between the signature and the first load are not shown
// in this listing (e.g. any sizing of m_EK/m_DK) -- confirm against the full file.
636BOTAN_FN_ISA_AESNI
void AES_256::aesni_key_schedule(
const uint8_t key[],
size_t ) {
// K0 and K1 are the two 16-byte halves of the raw key.
640 const __m128i K0 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key));
641 const __m128i
K1 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key + 16));
// Each new key is derived from the two before it: K(n) = f(K(n-2), K(n-1)).
// Even-numbered keys use aes_128_key_expansion with round constants
// 0x01..0x40; odd-numbered keys use aes_256_key_expansion.
643 const __m128i
K2 = aes_128_key_expansion<0x01>(K0, K1);
644 const __m128i
K3 = aes_256_key_expansion(K1, K2);
646 const __m128i
K4 = aes_128_key_expansion<0x02>(K2, K3);
647 const __m128i K5 = aes_256_key_expansion(K3, K4);
649 const __m128i K6 = aes_128_key_expansion<0x04>(K4, K5);
650 const __m128i K7 = aes_256_key_expansion(K5, K6);
652 const __m128i K8 = aes_128_key_expansion<0x08>(K6, K7);
653 const __m128i K9 = aes_256_key_expansion(K7, K8);
655 const __m128i K10 = aes_128_key_expansion<0x10>(K8, K9);
656 const __m128i K11 = aes_256_key_expansion(K9, K10);
658 const __m128i K12 = aes_128_key_expansion<0x20>(K10, K11);
659 const __m128i K13 = aes_256_key_expansion(K11, K12);
661 const __m128i K14 = aes_128_key_expansion<0x40>(K12, K13);
// Encryption schedule: K0..K14 stored in order, 16 bytes each.
663 __m128i* EK_mm =
reinterpret_cast<__m128i*
>(m_EK.data());
664 _mm_storeu_si128(EK_mm, K0);
665 _mm_storeu_si128(EK_mm + 1, K1);
666 _mm_storeu_si128(EK_mm + 2, K2);
667 _mm_storeu_si128(EK_mm + 3, K3);
668 _mm_storeu_si128(EK_mm + 4, K4);
669 _mm_storeu_si128(EK_mm + 5, K5);
670 _mm_storeu_si128(EK_mm + 6, K6);
671 _mm_storeu_si128(EK_mm + 7, K7);
672 _mm_storeu_si128(EK_mm + 8, K8);
673 _mm_storeu_si128(EK_mm + 9, K9);
674 _mm_storeu_si128(EK_mm + 10, K10);
675 _mm_storeu_si128(EK_mm + 11, K11);
676 _mm_storeu_si128(EK_mm + 12, K12);
677 _mm_storeu_si128(EK_mm + 13, K13);
678 _mm_storeu_si128(EK_mm + 14, K14);
// Decryption schedule: the round keys in reverse order. The first and last
// entries (K14, K0) are stored unchanged; K13..K1 go through _mm_aesimc_si128
// (InvMixColumns), the form the aesdec instruction expects.
681 __m128i* DK_mm =
reinterpret_cast<__m128i*
>(m_DK.data());
682 _mm_storeu_si128(DK_mm, K14);
683 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13));
684 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12));
685 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11));
686 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10));
687 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9));
688 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8));
689 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7));
690 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6));
691 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5));
692 _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
693 _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
694 _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
695 _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
696 _mm_storeu_si128(DK_mm + 14, K0);
static SIMD_4x32 load_le(const void *in) noexcept
#define BOTAN_FORCE_INLINE
constexpr auto load_le(ParamTs &&... params)
std::vector< T, secure_allocator< T > > secure_vector