#include <botan/internal/aes.h>

#include <botan/internal/isa_extn.h>
#include <botan/internal/loadstor.h>

#include <arm_neon.h>
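// The round helpers pair AESE with AESMC (encryption) and AESD with AESIMC
// (decryption); many ARMv8 cores fuse each such pair into a single AES round.
// The bulk loops below work on four blocks at a time to expose instruction
// level parallelism, then finish any remainder one block at a time.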
namespace Botan {

namespace AES_AARCH64 {

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc(uint8x16_t& B, uint8x16_t K) {
   B = vaesmcq_u8(vaeseq_u8(B, K));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc4(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K) {
   B0 = vaesmcq_u8(vaeseq_u8(B0, K));
   B1 = vaesmcq_u8(vaeseq_u8(B1, K));
   B2 = vaesmcq_u8(vaeseq_u8(B2, K));
   B3 = vaesmcq_u8(vaeseq_u8(B3, K));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc_last(uint8x16_t& B, uint8x16_t K, uint8x16_t K2) {
   B = veorq_u8(vaeseq_u8(B, K), K2);
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc4_last(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K, uint8x16_t K2) {
   B0 = veorq_u8(vaeseq_u8(B0, K), K2);
   B1 = veorq_u8(vaeseq_u8(B1, K), K2);
   B2 = veorq_u8(vaeseq_u8(B2, K), K2);
   B3 = veorq_u8(vaeseq_u8(B3, K), K2);
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec(uint8x16_t& B, uint8x16_t K) {
   B = vaesimcq_u8(vaesdq_u8(B, K));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec4(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K) {
   B0 = vaesimcq_u8(vaesdq_u8(B0, K));
   B1 = vaesimcq_u8(vaesdq_u8(B1, K));
   B2 = vaesimcq_u8(vaesdq_u8(B2, K));
   B3 = vaesimcq_u8(vaesdq_u8(B3, K));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec_last(uint8x16_t& B, uint8x16_t K, uint8x16_t K2) {
   B = veorq_u8(vaesdq_u8(B, K), K2);
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec4_last(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K, uint8x16_t K2) {
   B0 = veorq_u8(vaesdq_u8(B0, K), K2);
   B1 = veorq_u8(vaesdq_u8(B1, K), K2);
   B2 = veorq_u8(vaesdq_u8(B2, K), K2);
   B3 = veorq_u8(vaesdq_u8(B3, K), K2);
}

}  // namespace AES_AARCH64
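// enc_last/dec_last (and their 4-block variants) implement the final AES round,
// which has no MixColumns step: AESE/AESD folds round key K into the
// SubBytes/ShiftRows step, and the trailing XOR with K2 is the last AddRoundKey.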
 
 
 
/*
* AES-128 Encryption
*/
BOTAN_FN_ISA_AES void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4_last(B0, B1, B2, B3, K9, K10);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);

      B = vaesmcq_u8(vaeseq_u8(B, K0));
      B = vaesmcq_u8(vaeseq_u8(B, K1));
      B = vaesmcq_u8(vaeseq_u8(B, K2));
      B = vaesmcq_u8(vaeseq_u8(B, K3));
      B = vaesmcq_u8(vaeseq_u8(B, K4));
      B = vaesmcq_u8(vaeseq_u8(B, K5));
      B = vaesmcq_u8(vaeseq_u8(B, K6));
      B = vaesmcq_u8(vaeseq_u8(B, K7));
      B = vaesmcq_u8(vaeseq_u8(B, K8));
      B = veorq_u8(vaeseq_u8(B, K9), K10);

      vst1q_u8(out + 16 * i, B);
   }
}
 
/*
* AES-128 Decryption
*/
BOTAN_FN_ISA_AES void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4_last(B0, B1, B2, B3, K9, K10);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);

      B = vaesimcq_u8(vaesdq_u8(B, K0));
      B = vaesimcq_u8(vaesdq_u8(B, K1));
      B = vaesimcq_u8(vaesdq_u8(B, K2));
      B = vaesimcq_u8(vaesdq_u8(B, K3));
      B = vaesimcq_u8(vaesdq_u8(B, K4));
      B = vaesimcq_u8(vaesdq_u8(B, K5));
      B = vaesimcq_u8(vaesdq_u8(B, K6));
      B = vaesimcq_u8(vaesdq_u8(B, K7));
      B = vaesimcq_u8(vaesdq_u8(B, K8));
      B = veorq_u8(vaesdq_u8(B, K9), K10);

      vst1q_u8(out + 16 * i, B);
   }
}
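// Note: the AESD/AESIMC formulation assumes m_DK holds the decryption schedule
// in "equivalent inverse cipher" form, i.e. the inner round keys already have
// InverseMixColumns applied; the key schedule is expected to provide this form.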
 
/*
* AES-192 Encryption
*/
BOTAN_FN_ISA_AES void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4(B0, B1, B2, B3, K9);
      enc4(B0, B1, B2, B3, K10);
      enc4_last(B0, B1, B2, B3, K11, K12);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);

      B = vaesmcq_u8(vaeseq_u8(B, K0));
      B = vaesmcq_u8(vaeseq_u8(B, K1));
      B = vaesmcq_u8(vaeseq_u8(B, K2));
      B = vaesmcq_u8(vaeseq_u8(B, K3));
      B = vaesmcq_u8(vaeseq_u8(B, K4));
      B = vaesmcq_u8(vaeseq_u8(B, K5));
      B = vaesmcq_u8(vaeseq_u8(B, K6));
      B = vaesmcq_u8(vaeseq_u8(B, K7));
      B = vaesmcq_u8(vaeseq_u8(B, K8));
      B = vaesmcq_u8(vaeseq_u8(B, K9));
      B = vaesmcq_u8(vaeseq_u8(B, K10));
      B = veorq_u8(vaeseq_u8(B, K11), K12);

      vst1q_u8(out + 16 * i, B);
   }
}
 
/*
* AES-192 Decryption
*/
BOTAN_FN_ISA_AES void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4(B0, B1, B2, B3, K9);
      dec4(B0, B1, B2, B3, K10);
      dec4_last(B0, B1, B2, B3, K11, K12);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);

      B = vaesimcq_u8(vaesdq_u8(B, K0));
      B = vaesimcq_u8(vaesdq_u8(B, K1));
      B = vaesimcq_u8(vaesdq_u8(B, K2));
      B = vaesimcq_u8(vaesdq_u8(B, K3));
      B = vaesimcq_u8(vaesdq_u8(B, K4));
      B = vaesimcq_u8(vaesdq_u8(B, K5));
      B = vaesimcq_u8(vaesdq_u8(B, K6));
      B = vaesimcq_u8(vaesdq_u8(B, K7));
      B = vaesimcq_u8(vaesdq_u8(B, K8));
      B = vaesimcq_u8(vaesdq_u8(B, K9));
      B = vaesimcq_u8(vaesdq_u8(B, K10));
      B = veorq_u8(vaesdq_u8(B, K11), K12);

      vst1q_u8(out + 16 * i, B);
   }
}
 
/*
* AES-256 Encryption
*/
BOTAN_FN_ISA_AES void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);
   const uint8x16_t K13 = vld1q_u8(skey + 13 * 16);
   const uint8x16_t K14 = vld1q_u8(skey + 14 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4(B0, B1, B2, B3, K9);
      enc4(B0, B1, B2, B3, K10);
      enc4(B0, B1, B2, B3, K11);
      enc4(B0, B1, B2, B3, K12);
      enc4_last(B0, B1, B2, B3, K13, K14);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);

      B = vaesmcq_u8(vaeseq_u8(B, K0));
      B = vaesmcq_u8(vaeseq_u8(B, K1));
      B = vaesmcq_u8(vaeseq_u8(B, K2));
      B = vaesmcq_u8(vaeseq_u8(B, K3));
      B = vaesmcq_u8(vaeseq_u8(B, K4));
      B = vaesmcq_u8(vaeseq_u8(B, K5));
      B = vaesmcq_u8(vaeseq_u8(B, K6));
      B = vaesmcq_u8(vaeseq_u8(B, K7));
      B = vaesmcq_u8(vaeseq_u8(B, K8));
      B = vaesmcq_u8(vaeseq_u8(B, K9));
      B = vaesmcq_u8(vaeseq_u8(B, K10));
      B = vaesmcq_u8(vaeseq_u8(B, K11));
      B = vaesmcq_u8(vaeseq_u8(B, K12));
      B = veorq_u8(vaeseq_u8(B, K13), K14);

      vst1q_u8(out + 16 * i, B);
   }
}
 
/*
* AES-256 Decryption
*/
BOTAN_FN_ISA_AES void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);
   const uint8x16_t K13 = vld1q_u8(skey + 13 * 16);
   const uint8x16_t K14 = vld1q_u8(skey + 14 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4(B0, B1, B2, B3, K9);
      dec4(B0, B1, B2, B3, K10);
      dec4(B0, B1, B2, B3, K11);
      dec4(B0, B1, B2, B3, K12);
      dec4_last(B0, B1, B2, B3, K13, K14);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);

      B = vaesimcq_u8(vaesdq_u8(B, K0));
      B = vaesimcq_u8(vaesdq_u8(B, K1));
      B = vaesimcq_u8(vaesdq_u8(B, K2));
      B = vaesimcq_u8(vaesdq_u8(B, K3));
      B = vaesimcq_u8(vaesdq_u8(B, K4));
      B = vaesimcq_u8(vaesdq_u8(B, K5));
      B = vaesimcq_u8(vaesdq_u8(B, K6));
      B = vaesimcq_u8(vaesdq_u8(B, K7));
      B = vaesimcq_u8(vaesdq_u8(B, K8));
      B = vaesimcq_u8(vaesdq_u8(B, K9));
      B = vaesimcq_u8(vaesdq_u8(B, K10));
      B = vaesimcq_u8(vaesdq_u8(B, K11));
      B = vaesimcq_u8(vaesdq_u8(B, K12));
      B = veorq_u8(vaesdq_u8(B, K13), K14);

      vst1q_u8(out + 16 * i, B);
   }
}

}  // namespace Botan
 