13#include <botan/internal/aes.h>
15#include <botan/internal/ct_utils.h>
16#include <botan/internal/isa_extn.h>
17#include <botan/internal/simd_4x32.h>
18#include <botan/internal/target_info.h>
   // Fragments of the shuffle/masked_shuffle helpers: each dispatches on the
   // native byte order (bodies not fully visible in this chunk).
   if constexpr(std::endian::native == std::endian::little) {
   if constexpr(std::endian::native == std::endian::little) {
// Byte-shuffle index tables used for the forward MixColumns step of the
// vector-permute AES technique. Each entry holds 16 shuffle indices; entry
// i+1 is entry i with its four 32-bit lanes rotated by one, and within each
// lane the bytes are rotated by one position. Indexed by round number % 4.
const SIMD_4x32 mc_forward[4] = {
   SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
   SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
   SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
   SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09)};
   // Entries of the vperm_sr table (applied via byte shuffle, indexed r % 4
   // throughout this file). The first entry is the identity byte order; the
   // rest presumably encode the AES ShiftRows rotations — the declaration
   // head of this array is not visible in this chunk.
   SIMD_4x32(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C),
   SIMD_4x32(0x0F0A0500, 0x030E0904, 0x07020D08, 0x0B06010C),
   SIMD_4x32(0x0B020900, 0x0F060D04, 0x030A0108, 0x070E050C),
   SIMD_4x32(0x070A0D00, 0x0B0E0104, 0x0F020508, 0x0306090C),
   // Ten entries of the rcon table (used as rcon[i-1]/rcon[2*i] etc. by the
   // key schedules below): AES round-constant material carried in the low
   // 32 bits of each vector. NOTE(review): the values differ from the raw
   // AES rcon sequence (0x01, 0x02, ...), so they are presumably already in
   // the transformed vperm basis — confirm against the vperm paper.
   SIMD_4x32(0x00000070, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x0000002A, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x00000098, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x00000008, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x0000004D, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x0000007C, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x0000007D, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x00000081, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x0000001F, 0x00000000, 0x00000000, 0x00000000),
   SIMD_4x32(0x00000083, 0x00000000, 0x00000000, 0x00000000),
   // Entries of two further shuffle-index tables (their declaration heads
   // are not visible in this chunk). The first four rows are the mc_forward
   // rows in reverse order; the second four look like the mc_backward table
   // referenced in aes_enc_round — verify names against the full file.
   SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09),
   SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
   SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
   SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
   SIMD_4x32(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F),
   SIMD_4x32(0x0E0D0C0F, 0x02010003, 0x06050407, 0x0A09080B),
   SIMD_4x32(0x0A09080B, 0x0E0D0C0F, 0x02010003, 0x06050407),
   SIMD_4x32(0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x02010003),
   // low_nibs: keep only the low 4 bits of each byte
   return lo_nibs_mask & x;

   // high_nibs: shift each byte's high nibble down, then mask to 4 bits
   return (x.shr<4>() & lo_nibs_mask);

   // First AES encryption round: move the input into the vperm basis via
   // the k_ipt1/k_ipt2 nibble-lookup tables, then XOR in the round key K.
   return shuffle(k_ipt1, low_nibs(B)) ^ shuffle(k_ipt2, high_nibs(B)) ^ K;
   // Shared inversion network over the two nibble halves (Bl/Bh): t2, t5,
   // t6 are the intermediates of the GF(2^4)-based S-box evaluation.
   const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
   const SIMD_4x32 t5 = Bl ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
   const SIMD_4x32 t6 = Bh ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
   // Combine the S-box output table pairs (sb1*, sb2*) with the round key,
   // then apply MixColumns via the round-indexed rotation tables.
   const SIMD_4x32 t7 = masked_shuffle(sb1t, t6) ^ masked_shuffle(sb1u, t5) ^ K;
   const SIMD_4x32 t8 = masked_shuffle(sb2t, t6) ^ masked_shuffle(sb2u, t5) ^ shuffle(t7, mc_forward[r % 4]);
   return shuffle(t8, mc_forward[r % 4]) ^ shuffle(t7, mc_backward[r % 4]) ^ t8;
   // Same inversion network as aes_enc_round
   const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
   const SIMD_4x32 t5 = Bl ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
   const SIMD_4x32 t6 = Bh ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
   // Final round: S-box output (sbou/sbot), round key, then the ShiftRows
   // permutation — no MixColumns in the last AES round.
   return shuffle(masked_shuffle(sbou, t5) ^ masked_shuffle(sbot, t6) ^ K, vperm_sr[r % 4]);
   // First decryption round: inverse input transform (k_dipt1/k_dipt2
   // nibble lookups) plus the round key.
   return shuffle(k_dipt1, low_nibs(B)) ^ shuffle(k_dipt2, high_nibs(B)) ^ K;
   // Inverse S-box intermediates (same inversion network as encryption)
   const SIMD_4x32 t5 = B ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
   const SIMD_4x32 t6 = Bh ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));
   // Chain the four inverse-MixColumns table pairs (sb9*, sbd*, sbb*, sbe*),
   // rotating the accumulator with the mc shuffle between steps.
   const SIMD_4x32 t8 = masked_shuffle(sb9t, t6) ^ masked_shuffle(sb9u, t5) ^ K;
   const SIMD_4x32 t9 = shuffle(t8, mc) ^ masked_shuffle(sbdu, t5) ^ masked_shuffle(sbdt, t6);
   const SIMD_4x32 t12 = shuffle(t9, mc) ^ masked_shuffle(sbbu, t5) ^ masked_shuffle(sbbt, t6);
   return shuffle(t12, mc) ^ masked_shuffle(sbeu, t5) ^ masked_shuffle(sbet, t6);
   // Branch-free computation of which_sr = ((r - 1) & 3) ^ 3, i.e.
   // 3 - ((r - 1) % 4): selects the ShiftRows variant for the final round.
   const uint32_t which_sr = ((((r - 1) << 4) ^ 48) & 48) / 16;

   // Inverse S-box intermediates
   const SIMD_4x32 t5 = B ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
   const SIMD_4x32 t6 = Bh ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));

   // Output transform (sboud/sbotd), round key, then ShiftRows
   const SIMD_4x32 x = masked_shuffle(sboud, t5) ^ masked_shuffle(sbotd, t6) ^ K;
   return shuffle(x, vperm_sr[which_sr]);
191void BOTAN_FN_ISA_SIMD_4X32
192vperm_encrypt_blocks(
const uint8_t in[], uint8_t out[],
size_t blocks,
const SIMD_4x32 K[],
size_t rounds) {
195 const size_t blocks2 = blocks - (blocks % 2);
197 for(
size_t i = 0; i != blocks2; i += 2) {
201 B0 = aes_enc_first_round(B0, K[0]);
202 B1 = aes_enc_first_round(B1, K[0]);
204 for(
size_t r = 1; r != rounds; ++r) {
205 B0 = aes_enc_round(B0, K[r], r);
206 B1 = aes_enc_round(B1, K[r], r);
209 B0 = aes_enc_last_round(B0, K[rounds], rounds);
210 B1 = aes_enc_last_round(B1, K[rounds], rounds);
212 B0.store_le(out + i * 16);
213 B1.store_le(out + (i + 1) * 16);
216 for(
size_t i = blocks2; i < blocks; ++i) {
219 B = aes_enc_first_round(B, K[0]);
221 for(
size_t r = 1; r != rounds; ++r) {
222 B = aes_enc_round(B, K[r], r);
225 B = aes_enc_last_round(B, K[rounds], rounds);
226 B.store_le(out + i * 16);
233void BOTAN_FN_ISA_SIMD_4X32
234vperm_decrypt_blocks(
const uint8_t in[], uint8_t out[],
size_t blocks,
const SIMD_4x32 K[],
size_t rounds) {
237 const size_t blocks2 = blocks - (blocks % 2);
239 for(
size_t i = 0; i != blocks2; i += 2) {
243 B0 = aes_dec_first_round(B0, K[0]);
244 B1 = aes_dec_first_round(B1, K[0]);
246 for(
size_t r = 1; r != rounds; ++r) {
247 B0 = aes_dec_round(B0, K[r], r);
248 B1 = aes_dec_round(B1, K[r], r);
251 B0 = aes_dec_last_round(B0, K[rounds], rounds);
252 B1 = aes_dec_last_round(B1, K[rounds], rounds);
254 B0.store_le(out + i * 16);
255 B1.store_le(out + (i + 1) * 16);
258 for(
size_t i = blocks2; i < blocks; ++i) {
261 B = aes_dec_first_round(B, K[0]);
263 for(
size_t r = 1; r != rounds; ++r) {
264 B = aes_dec_round(B, K[r], r);
267 B = aes_dec_last_round(B, K[rounds], rounds);
268 B.store_le(out + i * 16);
// AES-128 encryption: repackage the 11 scheduled round keys into SIMD
// registers and run the shared vperm block loop with 10 rounds.
void AES_128::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const SIMD_4x32 K[11] = {
   // NOTE(review): the 11 initializer entries (loads from m_EK) were lost
   // from this view of the source.
   return vperm_encrypt_blocks(in, out, blocks, K, 10);
// AES-128 decryption: repackage the 11 decryption round keys into SIMD
// registers and run the shared vperm block loop with 10 rounds.
void AES_128::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const SIMD_4x32 K[11] = {
   // NOTE(review): the 11 initializer entries (loads from m_DK) were lost
   // from this view of the source.
   return vperm_decrypt_blocks(in, out, blocks, K, 10);
// AES-192 encryption: repackage the 13 scheduled round keys into SIMD
// registers and run the shared vperm block loop with 12 rounds.
void AES_192::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const SIMD_4x32 K[13] = {
   // NOTE(review): the 13 initializer entries (loads from m_EK) were lost
   // from this view of the source.
   return vperm_encrypt_blocks(in, out, blocks, K, 12);
// AES-192 decryption: repackage the 13 decryption round keys into SIMD
// registers and run the shared vperm block loop with 12 rounds.
void AES_192::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const SIMD_4x32 K[13] = {
   // NOTE(review): the 13 initializer entries (loads from m_DK) were lost
   // from this view of the source.
   return vperm_decrypt_blocks(in, out, blocks, K, 12);
// AES-256 encryption: repackage the 15 scheduled round keys into SIMD
// registers and run the shared vperm block loop with 14 rounds.
void AES_256::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const SIMD_4x32 K[15] = {
   // NOTE(review): the 15 initializer entries (loads from m_EK) were lost
   // from this view of the source.
   return vperm_encrypt_blocks(in, out, blocks, K, 14);
// AES-256 decryption: repackage the 15 decryption round keys into SIMD
// registers and run the shared vperm block loop with 14 rounds.
void AES_256::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const SIMD_4x32 K[15] = {
   // NOTE(review): the 15 initializer entries (loads from m_DK) were lost
   // from this view of the source.
   return vperm_decrypt_blocks(in, out, blocks, K, 14);
   // aes_schedule_transform: apply a linear map expressed as two 16-entry
   // nibble-lookup tables, one for each nibble half of the input.
   return shuffle(table_1, low_nibs(input)) ^ shuffle(table_2, high_nibs(input));

   // aes_schedule_mangle: smear a round key through two applications of the
   // forward MixColumns rotation, then apply the per-round ShiftRows
   // permutation before storing.
   const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
   t = shuffle(t, mc_forward0);
   t2 = t ^ t2 ^ shuffle(t, mc_forward0);
   return shuffle(t2, vperm_sr[round_no % 4]);
413SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_mangle_dec(
SIMD_4x32 k, uint8_t round_no) {
414 const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
417 SIMD_4x32(0x7ED9A700, 0xB6116FC8, 0x82255BFC, 0x4AED9334),
418 SIMD_4x32(0x27143300, 0x45765162, 0xE9DAFDCE, 0x8BB89FAC),
419 SIMD_4x32(0xCCA86400, 0x27438FEB, 0xADC90561, 0x4622EE8A),
420 SIMD_4x32(0x4F92DD00, 0x815C13CE, 0xBD602FF2, 0x73AEE13C),
421 SIMD_4x32(0x01C6C700, 0x03C4C502, 0xFA3D3CFB, 0xF83F3EF9),
422 SIMD_4x32(0x38CFF700, 0xEE1921D6, 0x7384BC4B, 0xA5526A9D),
423 SIMD_4x32(0x53732000, 0xE3C390B0, 0x10306343, 0xA080D3F3),
424 SIMD_4x32(0x036982E8, 0xA0CA214B, 0x8CE60D67, 0x2F45AEC4),
427 SIMD_4x32 t = aes_schedule_transform(k, dsk[0], dsk[1]);
428 SIMD_4x32 output = shuffle(t, mc_forward0);
430 t = aes_schedule_transform(t, dsk[2], dsk[3]);
431 output = shuffle(t ^ output, mc_forward0);
433 t = aes_schedule_transform(t, dsk[4], dsk[5]);
434 output = shuffle(t ^ output, mc_forward0);
436 t = aes_schedule_transform(t, dsk[6], dsk[7]);
437 output = shuffle(t ^ output, mc_forward0);
439 return shuffle(output, vperm_sr[round_no % 4]);
// Produce the final round key: apply the per-round ShiftRows permutation,
// then map the key out of the vperm basis via the out_tr1/out_tr2 tables.
// NOTE(review): upstream vperm implementations XOR a constant (0x5B) into
// each byte between these two steps; that statement appears to have been
// lost from this view — verify against the full file.
SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no) {
   const SIMD_4x32 out_tr1(0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121);
   const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1);

   k = shuffle(k, vperm_sr[round_no % 4]);
   return aes_schedule_transform(k, out_tr1, out_tr2);
   // Final decryption-key transform: deskew tables map the key out of the
   // vperm basis. NOTE(review): an intermediate statement between the
   // constants and the return appears elided in this view — verify.
   const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A);
   const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB);

   return aes_schedule_transform(k, deskew1, deskew2);
   // Core key-schedule round: run the S-box inversion network on the nibble
   // halves (Bl/Bh) and fold the result into the smeared accumulator.
   const SIMD_4x32 t2 = shuffle(k_inv2, Bl);

   SIMD_4x32 t5 = Bl ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
   SIMD_4x32 t6 = Bh ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));

   return smeared ^ masked_shuffle(sb1u, t5) ^ masked_shuffle(sb1t, t6);

   // Overload: rotate the previous word into position (shuffle3333_15),
   // mix in the round constant rc, then run the main schedule round.
   return aes_schedule_round(shuffle(input1, shuffle3333_15), input2 ^ rc);
   // AES-192 schedule "smear": propagate key material across lanes using
   // two fixed lane/byte shuffles.
   const SIMD_4x32 shuffle3332 = SIMD_4x32(0x0B0A0908, 0x0F0E0D0C, 0x0F0E0D0C, 0x0F0E0D0C);
   const SIMD_4x32 shuffle2000 = SIMD_4x32(0x03020100, 0x03020100, 0x03020100, 0x0B0A0908);

   // NOTE(review): zero_top_half is declared but never used in the visible
   // code — a masking statement (e.g. y &= zero_top_half) appears to have
   // been dropped from this view; verify against the full file.
   const SIMD_4x32 zero_top_half(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);

   return y ^ shuffle(x, shuffle3332) ^ shuffle(y, shuffle2000);
// Expand a 128-bit key into both encryption (m_EK) and decryption (m_DK)
// round keys in the vperm basis. Decryption keys are written in reverse
// order as the schedule is generated. (The initial load of `key` from
// keyb is not visible in this view of the source.)
void AES_128::vperm_key_schedule(const uint8_t keyb[], size_t) {
   // The last decryption round key is the raw key under a fixed ShiftRows
   // permutation
   shuffle(key, vperm_sr[2]).store_le(&m_DK[4 * 10]);

   // Move the key into the vperm basis; this is encryption round key 0
   key = aes_schedule_transform(key, k_ipt1, k_ipt2);
   key.store_le(&m_EK[0]);

   // Rounds 1..9: one schedule round each, storing m_EK forward and m_DK
   // backward with the round-phase-dependent mangles
   for(size_t i = 1; i != 10; ++i) {
      key = aes_schedule_round(rcon[i - 1], key, key);
      aes_schedule_mangle(key, (12 - i) % 4).store_le(&m_EK[4 * i]);
      aes_schedule_mangle_dec(key, (10 - i) % 4).store_le(&m_DK[4 * (10 - i)]);

   // Final round uses the "last" mangles which leave the vperm basis
   key = aes_schedule_round(rcon[9], key, key);
   aes_schedule_mangle_last(key, 2).store_le(&m_EK[4 * 10]);
   aes_schedule_mangle_last_dec(key).store_le(&m_DK[0]);
// Expand a 192-bit key into m_EK/m_DK in the vperm basis. AES-192 produces
// keys in batches of three per two schedule rounds, using the 192-bit
// "smear" helper. (The initial loads of key1/key2 from keyb, and the
// branch that selects the *_last mangles on the final loop iteration, are
// not visible in this view of the source.)
void AES_192::vperm_key_schedule(const uint8_t keyb[], size_t) {
   // Last decryption round key: first key half under a fixed permutation
   shuffle(key1, vperm_sr[0]).store_le(&m_DK[12 * 4]);

   // Move both key halves into the vperm basis
   key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
   key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);

   key1.store_le(&m_EK[0]);

   // Each iteration emits three round keys (indices 3i+1 .. 3i+3)
   for(size_t i = 0; i != 4; ++i) {
      key2 = aes_schedule_round(rcon[2 * i], key2, key1);
      aes_schedule_mangle(key2t, (i + 3) % 4).store_le(&m_EK[4 * (3 * i + 1)]);
      aes_schedule_mangle_dec(key2t, (i + 3) % 4).store_le(&m_DK[4 * (11 - 3 * i)]);

      t = aes_schedule_192_smear(key2, t);
      aes_schedule_mangle(t, (i + 2) % 4).store_le(&m_EK[4 * (3 * i + 2)]);
      aes_schedule_mangle_dec(t, (i + 2) % 4).store_le(&m_DK[4 * (10 - 3 * i)]);

      key2 = aes_schedule_round(rcon[2 * i + 1], t, key2);
      // On the final iteration the *_last variants are used; otherwise the
      // regular mangles (the surrounding if/else is not visible here)
      aes_schedule_mangle_last(key2, (i + 1) % 4).store_le(&m_EK[4 * (3 * i + 3)]);
      aes_schedule_mangle_last_dec(key2).store_le(&m_DK[4 * (9 - 3 * i)]);
      aes_schedule_mangle(key2, (i + 1) % 4).store_le(&m_EK[4 * (3 * i + 3)]);
      aes_schedule_mangle_dec(key2, (i + 1) % 4).store_le(&m_DK[4 * (9 - 3 * i)]);

      key2 = aes_schedule_192_smear(key2, t);
// Expand a 256-bit key into m_EK/m_DK in the vperm basis. The two 128-bit
// halves alternate through the schedule, two round keys per loop
// iteration. (The initial loads of key1/key2 from keyb are not visible in
// this view of the source.)
void AES_256::vperm_key_schedule(const uint8_t keyb[], size_t) {
   // Last decryption round key: first key half under a fixed permutation
   shuffle(key1, vperm_sr[2]).store_le(&m_DK[4 * 14]);

   // Move both key halves into the vperm basis
   key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
   key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);

   key1.store_le(&m_EK[0]);
   aes_schedule_mangle(key2, 3).store_le(&m_EK[4]);
   aes_schedule_mangle_dec(key2, 1).store_le(&m_DK[4 * 13]);

   // Each iteration produces round keys i and i+1 (i = 2, 4, ..., 12)
   for(size_t i = 2; i != 14; i += 2) {
      const SIMD_4x32 k_t = key2;
      key1 = key2 = aes_schedule_round(rcon[(i / 2) - 1], key2, key1);

      aes_schedule_mangle(key2, i % 4).store_le(&m_EK[4 * i]);
      aes_schedule_mangle_dec(key2, (i + 2) % 4).store_le(&m_DK[4 * (14 - i)]);

      // Second half: rotate the last word across lanes, no round constant
      key2 = aes_schedule_round(shuffle(key2, shuffle3333), k_t);

      aes_schedule_mangle(key2, (i - 1) % 4).store_le(&m_EK[4 * (i + 1)]);
      aes_schedule_mangle_dec(key2, (i + 1) % 4).store_le(&m_DK[4 * (13 - i)]);

   // Final round key leaves the vperm basis via the *_last mangles
   key2 = aes_schedule_round(rcon[6], key2, key1);
   aes_schedule_mangle_last(key2, 2).store_le(&m_EK[4 * 14]);
   aes_schedule_mangle_last_dec(key2).store_le(&m_DK[0]);
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 byte_shuffle(const SIMD_4x32 &tbl, const SIMD_4x32 &idx)
static SIMD_4x32 load_le(const void *in) noexcept
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 alignr8(const SIMD_4x32 &a, const SIMD_4x32 &b)
SIMD_4x32 shift_elems_left() const noexcept
static SIMD_4x32 splat_u8(uint8_t B) noexcept
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 masked_byte_shuffle(const SIMD_4x32 &tbl, const SIMD_4x32 &idx)
BOTAN_FN_ISA_SIMD_4X32 SIMD_4x32 bswap() const noexcept
static SIMD_4x32 splat(uint32_t B) noexcept
constexpr void unpoison(const T *p, size_t n)
constexpr void poison(const T *p, size_t n)