8#include <botan/internal/aes.h>
10#include <botan/internal/isa_extn.h>
11#include <botan/internal/loadstor.h>
12#include <botan/internal/simd_4x32.h>
22BOTAN_FN_ISA_AESNI
// One AES-128 key-schedule step: derives the next 128-bit round key from the
// previous one.  RC is the round constant passed to AESKEYGENASSIST.
// NOTE(review): the `template<uint8_t RC>` header and closing brace are not
// visible in this extract; call sites below invoke aes_128_key_expansion<0x01>(...)
// etc., so RC is presumably a template parameter — confirm against full file.
inline __m128i aes_128_key_expansion(__m128i key, __m128i key_getting_rcon) {
// AESKEYGENASSIST applies SubWord/RotWord to dwords of the input and XORs in RC.
23 __m128i key_with_rcon = _mm_aeskeygenassist_si128(key_getting_rcon, RC);
// Broadcast lane 3 (RotWord(SubWord(w3)) ^ RC) to all four 32-bit lanes.
24 key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3, 3, 3, 3));
// Three shift-left-by-4-bytes + XOR steps compute the running XOR prefix of
// the four key words (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3) in parallel.
25 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
26 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
27 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
// XOR the broadcast SubWord/RotWord/RC value in to finish the round key.
28 return _mm_xor_si128(key, key_with_rcon);
// One AES-192 key-schedule step: updates the two working key halves and writes
// six 32-bit key words into out[offset..offset+5].
// NOTE(review): the parameter list, the initial key1/key2 loads, the
// write-back of the updated halves, and (presumably) an early-return guard for
// the final call are elided in this extract — confirm against the full file.
32void aes_192_key_expansion(
// For AES-192 the relevant AESKEYGENASSIST output is in lane 1; broadcast it.
37 key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1, 1, 1, 1));
// XOR-prefix of the four words of key1 (same shift/XOR trick as the 128-bit step).
38 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
39 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
40 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
41 key1 = _mm_xor_si128(key1, key2_with_rcon);
// Store the four updated words of key1 as key words offset..offset+3.
// (Unaligned store: out[] need not be 16-byte aligned.)
44 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(&out[offset]), key1);
// Update the 64-bit tail half: XOR-prefix, then mix in the last word of key1.
50 key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
51 key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3, 3, 3, 3)));
// Emit the two low 32-bit words of key2 as key words offset+4 and offset+5.
// NOTE(review): for the last call (offset == 48) only four words are needed,
// so an early return before this point is presumably in the elided lines.
54 out[offset + 4] = _mm_cvtsi128_si32(key2);
55 out[offset + 5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
// The intermediate (even-numbered) AES-256 key-schedule step: derives the next
// round key from `key` using SubWord of the last word of `key2`, with no
// rotation and no round constant (hence rcon = 0x00 below).
// NOTE(review): closing brace not visible in this extract.
61BOTAN_FN_ISA_AESNI __m128i aes_256_key_expansion(__m128i key, __m128i key2) {
62 __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
// Lane 2 of the AESKEYGENASSIST result holds SubWord(w3) (un-rotated); broadcast it.
63 key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2, 2, 2, 2));
// XOR-prefix of the four key words, then fold in the SubWord value.
65 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
66 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
67 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
68 return _mm_xor_si128(key, key_with_rcon);
// ---------------------------------------------------------------------------
// Round helper bodies.  Each applies one AES round-function intrinsic to one
// block (B) or to four blocks in parallel (B0..B3) under round key K.
// NOTE(review): the helper signatures (presumably `aesenc(SIMD_4x32 K, ...)`
// etc.) and a `keyxor` helper used below are elided in this extract — confirm
// against the full file.
// ---------------------------------------------------------------------------
// Single-block AESENC (one full encryption round: ShiftRows, SubBytes,
// MixColumns, AddRoundKey).
80 B =
SIMD_4x32(_mm_aesenc_si128(B.raw(), K.raw()));
// Four-block AESENC: same round applied to B0..B3 with the same round key,
// keeping four independent blocks in flight to hide instruction latency.
85 B0 =
SIMD_4x32(_mm_aesenc_si128(B0.raw(), K.raw()));
86 B1 =
SIMD_4x32(_mm_aesenc_si128(B1.raw(), K.raw()));
87 B2 =
SIMD_4x32(_mm_aesenc_si128(B2.raw(), K.raw()));
88 B3 =
SIMD_4x32(_mm_aesenc_si128(B3.raw(), K.raw()));
// Single-block AESENCLAST (final round: no MixColumns).
92 B =
SIMD_4x32(_mm_aesenclast_si128(B.raw(), K.raw()));
// Four-block AESENCLAST.
97 B0 =
SIMD_4x32(_mm_aesenclast_si128(B0.raw(), K.raw()));
98 B1 =
SIMD_4x32(_mm_aesenclast_si128(B1.raw(), K.raw()));
99 B2 =
SIMD_4x32(_mm_aesenclast_si128(B2.raw(), K.raw()));
100 B3 =
SIMD_4x32(_mm_aesenclast_si128(B3.raw(), K.raw()));
// Single-block AESDEC (one full decryption round; expects round keys that
// have been passed through AESIMC, as done in the key schedules below).
104 B =
SIMD_4x32(_mm_aesdec_si128(B.raw(), K.raw()));
// Four-block AESDEC.
109 B0 =
SIMD_4x32(_mm_aesdec_si128(B0.raw(), K.raw()));
110 B1 =
SIMD_4x32(_mm_aesdec_si128(B1.raw(), K.raw()));
111 B2 =
SIMD_4x32(_mm_aesdec_si128(B2.raw(), K.raw()));
112 B3 =
SIMD_4x32(_mm_aesdec_si128(B3.raw(), K.raw()));
// Single-block AESDECLAST (final decryption round: no InvMixColumns).
116 B =
SIMD_4x32(_mm_aesdeclast_si128(B.raw(), K.raw()));
// Four-block AESDECLAST.
121 B0 =
SIMD_4x32(_mm_aesdeclast_si128(B0.raw(), K.raw()));
122 B1 =
SIMD_4x32(_mm_aesdeclast_si128(B1.raw(), K.raw()));
123 B2 =
SIMD_4x32(_mm_aesdeclast_si128(B2.raw(), K.raw()));
124 B3 =
SIMD_4x32(_mm_aesdeclast_si128(B3.raw(), K.raw()));
// AES-128 encryption of `blocks` 16-byte blocks using AES-NI.
// Four blocks are processed in parallel per iteration (to exploit the AESENC
// pipeline), with a one-block-at-a-time loop for the remainder.
// NOTE(review): the round-key loads (K0..K10 from m_EK), the B0..B3 input
// loads, the 4-block loop header, the single-block round sequence, and the
// closing braces are all elided in this extract — confirm against full file.
134BOTAN_FN_ISA_AESNI
void AES_128::hw_aes_encrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// 10-round AES-128: initial whitening XOR, nine full rounds, final round.
153 keyxor(K0, B0, B1, B2, B3);
154 aesenc(K1, B0, B1, B2, B3);
155 aesenc(K2, B0, B1, B2, B3);
156 aesenc(K3, B0, B1, B2, B3);
157 aesenc(K4, B0, B1, B2, B3);
158 aesenc(K5, B0, B1, B2, B3);
159 aesenc(K6, B0, B1, B2, B3);
160 aesenc(K7, B0, B1, B2, B3);
161 aesenc(K8, B0, B1, B2, B3);
162 aesenc(K9, B0, B1, B2, B3);
163 aesenclast(K10, B0, B1, B2, B3);
// Write the four ciphertext blocks back (little-endian / byte-order-preserving
// 128-bit stores).
165 B0.store_le(out + 16 * 0);
166 B1.store_le(out + 16 * 1);
167 B2.store_le(out + 16 * 2);
168 B3.store_le(out + 16 * 3);
// Remaining blocks, one at a time.
175 for(
size_t i = 0; i != blocks; ++i) {
190 B0.store_le(out + 16 * i);
// AES-128 decryption of `blocks` 16-byte blocks using AES-NI.
// Mirrors hw_aes_encrypt_n: four blocks in parallel, then a remainder loop.
// Uses the m_DK schedule (reversed + AESIMC-transformed, see aesni_key_schedule).
// NOTE(review): round-key/block loads, loop bodies, and closing braces are
// elided in this extract — confirm against the full file.
197BOTAN_FN_ISA_AESNI
void AES_128::hw_aes_decrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// 10-round inverse cipher: whitening, nine AESDEC rounds, final AESDECLAST.
216 keyxor(K0, B0, B1, B2, B3);
217 aesdec(K1, B0, B1, B2, B3);
218 aesdec(K2, B0, B1, B2, B3);
219 aesdec(K3, B0, B1, B2, B3);
220 aesdec(K4, B0, B1, B2, B3);
221 aesdec(K5, B0, B1, B2, B3);
222 aesdec(K6, B0, B1, B2, B3);
223 aesdec(K7, B0, B1, B2, B3);
224 aesdec(K8, B0, B1, B2, B3);
225 aesdec(K9, B0, B1, B2, B3);
226 aesdeclast(K10, B0, B1, B2, B3);
// Store the four plaintext blocks.
228 B0.store_le(out + 16 * 0);
229 B1.store_le(out + 16 * 1);
230 B2.store_le(out + 16 * 2);
231 B3.store_le(out + 16 * 3);
// Remaining blocks, one at a time.
238 for(
size_t i = 0; i != blocks; ++i) {
253 B0.store_le(out + 16 * i);
// Builds the AES-128 encryption (m_EK) and decryption (m_DK) key schedules
// using AESKEYGENASSIST.  The unused size_t parameter is the key length
// (fixed at 16 bytes for AES-128, hence ignored).
// NOTE(review): any m_EK/m_DK sizing and the closing brace are elided in this
// extract — confirm against the full file.
260BOTAN_FN_ISA_AESNI
void AES_128::aesni_key_schedule(
const uint8_t key[],
size_t ) {
// Round 0 key is the raw 128-bit user key (unaligned load).
266 const __m128i K0 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key));
// Rounds 1..10: each key derives from the previous with the FIPS-197 round
// constants 0x01,0x02,...,0x80,0x1B,0x36.
267 const __m128i
K1 = aes_128_key_expansion<0x01>(K0, K0);
268 const __m128i
K2 = aes_128_key_expansion<0x02>(K1, K1);
269 const __m128i
K3 = aes_128_key_expansion<0x04>(K2, K2);
270 const __m128i
K4 = aes_128_key_expansion<0x08>(K3, K3);
271 const __m128i K5 = aes_128_key_expansion<0x10>(K4, K4);
272 const __m128i K6 = aes_128_key_expansion<0x20>(K5, K5);
273 const __m128i K7 = aes_128_key_expansion<0x40>(K6, K6);
274 const __m128i K8 = aes_128_key_expansion<0x80>(K7, K7);
275 const __m128i K9 = aes_128_key_expansion<0x1B>(K8, K8);
276 const __m128i K10 = aes_128_key_expansion<0x36>(K9, K9);
// Store the encryption schedule in forward order.
278 __m128i* EK_mm =
reinterpret_cast<__m128i*
>(m_EK.data());
279 _mm_storeu_si128(EK_mm, K0);
280 _mm_storeu_si128(EK_mm + 1, K1);
281 _mm_storeu_si128(EK_mm + 2, K2);
282 _mm_storeu_si128(EK_mm + 3, K3);
283 _mm_storeu_si128(EK_mm + 4, K4);
284 _mm_storeu_si128(EK_mm + 5, K5);
285 _mm_storeu_si128(EK_mm + 6, K6);
286 _mm_storeu_si128(EK_mm + 7, K7);
287 _mm_storeu_si128(EK_mm + 8, K8);
288 _mm_storeu_si128(EK_mm + 9, K9);
289 _mm_storeu_si128(EK_mm + 10, K10);
// Decryption schedule for the AESDEC "equivalent inverse cipher": round keys
// reversed, with the interior keys run through AESIMC (InvMixColumns); the
// first and last keys (K10, K0) are used as-is.
293 __m128i* DK_mm =
reinterpret_cast<__m128i*
>(m_DK.data());
294 _mm_storeu_si128(DK_mm, K10);
295 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9));
296 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8));
297 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7));
298 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6));
299 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5));
300 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4));
301 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3));
302 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2));
303 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1));
304 _mm_storeu_si128(DK_mm + 10, K0);
// AES-192 encryption of `blocks` 16-byte blocks using AES-NI (12 rounds).
// Same structure as AES_128::hw_aes_encrypt_n: four blocks in parallel, then
// a single-block remainder loop.
// NOTE(review): round-key/block loads, loop bodies, and closing braces are
// elided in this extract — confirm against the full file.
312BOTAN_FN_ISA_AESNI
void AES_192::hw_aes_encrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// Whitening, eleven full rounds, final round — 12-round AES-192.
333 keyxor(K0, B0, B1, B2, B3);
334 aesenc(K1, B0, B1, B2, B3);
335 aesenc(K2, B0, B1, B2, B3);
336 aesenc(K3, B0, B1, B2, B3);
337 aesenc(K4, B0, B1, B2, B3);
338 aesenc(K5, B0, B1, B2, B3);
339 aesenc(K6, B0, B1, B2, B3);
340 aesenc(K7, B0, B1, B2, B3);
341 aesenc(K8, B0, B1, B2, B3);
342 aesenc(K9, B0, B1, B2, B3);
343 aesenc(K10, B0, B1, B2, B3);
344 aesenc(K11, B0, B1, B2, B3);
345 aesenclast(K12, B0, B1, B2, B3);
// Store the four ciphertext blocks.
347 B0.store_le(out + 16 * 0);
348 B1.store_le(out + 16 * 1);
349 B2.store_le(out + 16 * 2);
350 B3.store_le(out + 16 * 3);
// Remaining blocks, one at a time.
357 for(
size_t i = 0; i != blocks; ++i) {
375 B0.store_le(out + 16 * i);
// AES-192 decryption of `blocks` 16-byte blocks using AES-NI (12 rounds),
// using the AESIMC-transformed m_DK schedule.
// NOTE(review): round-key/block loads, loop bodies, and closing braces are
// elided in this extract — confirm against the full file.
382BOTAN_FN_ISA_AESNI
void AES_192::hw_aes_decrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// Whitening, eleven AESDEC rounds, final AESDECLAST.
403 keyxor(K0, B0, B1, B2, B3);
404 aesdec(K1, B0, B1, B2, B3);
405 aesdec(K2, B0, B1, B2, B3);
406 aesdec(K3, B0, B1, B2, B3);
407 aesdec(K4, B0, B1, B2, B3);
408 aesdec(K5, B0, B1, B2, B3);
409 aesdec(K6, B0, B1, B2, B3);
410 aesdec(K7, B0, B1, B2, B3);
411 aesdec(K8, B0, B1, B2, B3);
412 aesdec(K9, B0, B1, B2, B3);
413 aesdec(K10, B0, B1, B2, B3);
414 aesdec(K11, B0, B1, B2, B3);
415 aesdeclast(K12, B0, B1, B2, B3);
// Store the four plaintext blocks.
417 B0.store_le(out + 16 * 0);
418 B1.store_le(out + 16 * 1);
419 B2.store_le(out + 16 * 2);
420 B3.store_le(out + 16 * 3);
// Remaining blocks, one at a time.
427 for(
size_t i = 0; i != blocks; ++i) {
445 B0.store_le(out + 16 * i);
// Builds the AES-192 encryption (m_EK, 52 words / 13 round keys) and
// decryption (m_DK) schedules.  The unused size_t parameter is the key length
// (fixed at 24 bytes for AES-192).
// NOTE(review): any m_EK/m_DK sizing and the closing brace are elided in this
// extract — confirm against the full file.
452BOTAN_FN_ISA_AESNI
void AES_192::aesni_key_schedule(
const uint8_t key[],
size_t ) {
// K0 = key bytes 0..15.
458 __m128i K0 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key));
// Load bytes 8..23, then shift right by 8 bytes so the low 64 bits of K1
// hold key bytes 16..23 (the trailing third of the 192-bit key) and the
// high 64 bits are zero.  Loading at key+8 avoids reading past the 24-byte key.
459 __m128i
K1 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key + 8));
460 K1 = _mm_srli_si128(K1, 8);
// Eight expansion steps, each emitting six 32-bit words into m_EK starting at
// the given word offset, with round constants 0x01..0x80.
// NOTE(review): the first six words (the raw key) are presumably stored into
// m_EK[0..5] by elided lines before these calls — confirm.
464 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x01), m_EK, 6);
465 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x02), m_EK, 12);
466 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x04), m_EK, 18);
467 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x08), m_EK, 24);
468 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x10), m_EK, 30);
469 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x20), m_EK, 36);
470 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x40), m_EK, 42);
471 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x80), m_EK, 48);
// Decryption schedule: the 13 encryption round keys reversed, with interior
// keys passed through AESIMC (InvMixColumns); endpoints (EK[12], EK[0]) as-is.
474 const __m128i* EK_mm =
reinterpret_cast<const __m128i*
>(m_EK.data());
476 __m128i* DK_mm =
reinterpret_cast<__m128i*
>(m_DK.data());
477 _mm_storeu_si128(DK_mm, _mm_loadu_si128(EK_mm + 12));
478 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
479 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
480 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
481 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
482 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
483 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
484 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
485 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
486 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
487 _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
488 _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
489 _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
// AES-256 encryption of `blocks` 16-byte blocks using AES-NI (14 rounds).
// Same structure as the 128/192-bit variants: four blocks in parallel, then a
// single-block remainder loop.
// NOTE(review): round-key/block loads, loop bodies, and closing braces are
// elided in this extract — confirm against the full file.
497BOTAN_FN_ISA_AESNI
void AES_256::hw_aes_encrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// Whitening, thirteen full rounds, final round — 14-round AES-256.
520 keyxor(K0, B0, B1, B2, B3);
521 aesenc(K1, B0, B1, B2, B3);
522 aesenc(K2, B0, B1, B2, B3);
523 aesenc(K3, B0, B1, B2, B3);
524 aesenc(K4, B0, B1, B2, B3);
525 aesenc(K5, B0, B1, B2, B3);
526 aesenc(K6, B0, B1, B2, B3);
527 aesenc(K7, B0, B1, B2, B3);
528 aesenc(K8, B0, B1, B2, B3);
529 aesenc(K9, B0, B1, B2, B3);
530 aesenc(K10, B0, B1, B2, B3);
531 aesenc(K11, B0, B1, B2, B3);
532 aesenc(K12, B0, B1, B2, B3);
533 aesenc(K13, B0, B1, B2, B3);
534 aesenclast(K14, B0, B1, B2, B3);
// Store the four ciphertext blocks.
536 B0.store_le(out + 16 * 0);
537 B1.store_le(out + 16 * 1);
538 B2.store_le(out + 16 * 2);
539 B3.store_le(out + 16 * 3);
// Remaining blocks, one at a time.
546 for(
size_t i = 0; i != blocks; ++i) {
566 B0.store_le(out + 16 * i);
// AES-256 decryption of `blocks` 16-byte blocks using AES-NI (14 rounds),
// using the AESIMC-transformed m_DK schedule.
// NOTE(review): round-key/block loads, loop bodies, and closing braces are
// elided in this extract — confirm against the full file.
573BOTAN_FN_ISA_AESNI
void AES_256::hw_aes_decrypt_n(
const uint8_t in[], uint8_t out[],
size_t blocks)
const {
// Whitening, thirteen AESDEC rounds, final AESDECLAST.
596 keyxor(K0, B0, B1, B2, B3);
597 aesdec(K1, B0, B1, B2, B3);
598 aesdec(K2, B0, B1, B2, B3);
599 aesdec(K3, B0, B1, B2, B3);
600 aesdec(K4, B0, B1, B2, B3);
601 aesdec(K5, B0, B1, B2, B3);
602 aesdec(K6, B0, B1, B2, B3);
603 aesdec(K7, B0, B1, B2, B3);
604 aesdec(K8, B0, B1, B2, B3);
605 aesdec(K9, B0, B1, B2, B3);
606 aesdec(K10, B0, B1, B2, B3);
607 aesdec(K11, B0, B1, B2, B3);
608 aesdec(K12, B0, B1, B2, B3);
609 aesdec(K13, B0, B1, B2, B3);
610 aesdeclast(K14, B0, B1, B2, B3);
// Store the four plaintext blocks.
612 B0.store_le(out + 16 * 0);
613 B1.store_le(out + 16 * 1);
614 B2.store_le(out + 16 * 2);
615 B3.store_le(out + 16 * 3);
// Remaining blocks, one at a time.
622 for(
size_t i = 0; i != blocks; ++i) {
642 B0.store_le(out + 16 * i);
// Builds the AES-256 encryption (m_EK, 15 round keys) and decryption (m_DK)
// schedules.  The 256-bit key alternates two expansion steps: the rcon step
// (reusing aes_128_key_expansion<RC>, which takes RotWord+SubWord of the last
// word) and the rcon-free SubWord-only step (aes_256_key_expansion).
// The unused size_t parameter is the key length (fixed at 32 bytes).
// NOTE(review): any m_EK/m_DK sizing and the closing brace are elided in this
// extract — confirm against the full file.
649BOTAN_FN_ISA_AESNI
void AES_256::aesni_key_schedule(
const uint8_t key[],
size_t ) {
// K0/K1 = the two raw 128-bit halves of the user key (unaligned loads).
655 const __m128i K0 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key));
656 const __m128i
K1 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(key + 16));
// Round constants 0x01..0x40 on the odd steps; even steps use no rcon.
658 const __m128i
K2 = aes_128_key_expansion<0x01>(K0, K1);
659 const __m128i
K3 = aes_256_key_expansion(K1, K2);
661 const __m128i
K4 = aes_128_key_expansion<0x02>(K2, K3);
662 const __m128i K5 = aes_256_key_expansion(K3, K4);
664 const __m128i K6 = aes_128_key_expansion<0x04>(K4, K5);
665 const __m128i K7 = aes_256_key_expansion(K5, K6);
667 const __m128i K8 = aes_128_key_expansion<0x08>(K6, K7);
668 const __m128i K9 = aes_256_key_expansion(K7, K8);
670 const __m128i K10 = aes_128_key_expansion<0x10>(K8, K9);
671 const __m128i K11 = aes_256_key_expansion(K9, K10);
673 const __m128i K12 = aes_128_key_expansion<0x20>(K10, K11);
674 const __m128i K13 = aes_256_key_expansion(K11, K12);
676 const __m128i K14 = aes_128_key_expansion<0x40>(K12, K13);
// Store the encryption schedule in forward order.
678 __m128i* EK_mm =
reinterpret_cast<__m128i*
>(m_EK.data());
679 _mm_storeu_si128(EK_mm, K0);
680 _mm_storeu_si128(EK_mm + 1, K1);
681 _mm_storeu_si128(EK_mm + 2, K2);
682 _mm_storeu_si128(EK_mm + 3, K3);
683 _mm_storeu_si128(EK_mm + 4, K4);
684 _mm_storeu_si128(EK_mm + 5, K5);
685 _mm_storeu_si128(EK_mm + 6, K6);
686 _mm_storeu_si128(EK_mm + 7, K7);
687 _mm_storeu_si128(EK_mm + 8, K8);
688 _mm_storeu_si128(EK_mm + 9, K9);
689 _mm_storeu_si128(EK_mm + 10, K10);
690 _mm_storeu_si128(EK_mm + 11, K11);
691 _mm_storeu_si128(EK_mm + 12, K12);
692 _mm_storeu_si128(EK_mm + 13, K13);
693 _mm_storeu_si128(EK_mm + 14, K14);
// Decryption schedule: round keys reversed, interior keys through AESIMC
// (InvMixColumns); endpoints (K14, K0) as-is.
696 __m128i* DK_mm =
reinterpret_cast<__m128i*
>(m_DK.data());
697 _mm_storeu_si128(DK_mm, K14);
698 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13));
699 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12));
700 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11));
701 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10));
702 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9));
703 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8));
704 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7));
705 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6));
706 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5));
707 _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
708 _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
709 _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
710 _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
711 _mm_storeu_si128(DK_mm + 14, K0);
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 load_le(const void *in) noexcept
#define BOTAN_FORCE_INLINE
constexpr auto load_le(ParamTs &&... params)
std::vector< T, secure_allocator< T > > secure_vector