#include <botan/internal/aes.h>

#include <botan/internal/isa_extn.h>
#include <botan/internal/loadstor.h>
#include <arm_neon.h>

namespace Botan {

namespace AES_AARCH64 {
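// Helpers wrapping the ARMv8 Crypto Extension AES instructions. AESE/AESD
// (vaeseq_u8/vaesdq_u8) perform AddRoundKey followed by (Inv)SubBytes and
// (Inv)ShiftRows; AESMC/AESIMC (vaesmcq_u8/vaesimcq_u8) apply (Inv)MixColumns.
// The *_last variants omit MixColumns and XOR in the final round key, as the
// last AES round requires.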
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc(uint8x16_t& B, uint8x16_t K) {
   B = vaesmcq_u8(vaeseq_u8(B, K));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc4(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K) {
   B0 = vaesmcq_u8(vaeseq_u8(B0, K));
   B1 = vaesmcq_u8(vaeseq_u8(B1, K));
   B2 = vaesmcq_u8(vaeseq_u8(B2, K));
   B3 = vaesmcq_u8(vaeseq_u8(B3, K));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc_last(uint8x16_t& B, uint8x16_t K, uint8x16_t K2) {
   B = veorq_u8(vaeseq_u8(B, K), K2);
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc4_last(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K, uint8x16_t K2) {
   B0 = veorq_u8(vaeseq_u8(B0, K), K2);
   B1 = veorq_u8(vaeseq_u8(B1, K), K2);
   B2 = veorq_u8(vaeseq_u8(B2, K), K2);
   B3 = veorq_u8(vaeseq_u8(B3, K), K2);
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec(uint8x16_t& B, uint8x16_t K) {
   B = vaesimcq_u8(vaesdq_u8(B, K));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec4(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K) {
   B0 = vaesimcq_u8(vaesdq_u8(B0, K));
   B1 = vaesimcq_u8(vaesdq_u8(B1, K));
   B2 = vaesimcq_u8(vaesdq_u8(B2, K));
   B3 = vaesimcq_u8(vaesdq_u8(B3, K));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec_last(uint8x16_t& B, uint8x16_t K, uint8x16_t K2) {
   B = veorq_u8(vaesdq_u8(B, K), K2);
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec4_last(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K, uint8x16_t K2) {
   B0 = veorq_u8(vaesdq_u8(B0, K), K2);
   B1 = veorq_u8(vaesdq_u8(B1, K), K2);
   B2 = veorq_u8(vaesdq_u8(B2, K), K2);
   B3 = veorq_u8(vaesdq_u8(B3, K), K2);
}

}  // namespace AES_AARCH64

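/*
* AES-128 Encryption
*/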
BOTAN_FN_ISA_AES void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);

   using namespace AES_AARCH64;

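   // While at least four blocks remain, process four in parallel so that the
   // independent per-block AES instructions can overlap in the pipeline.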
   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4_last(B0, B1, B2, B3, K9, K10);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      B = vaesmcq_u8(vaeseq_u8(B, K0));
      B = vaesmcq_u8(vaeseq_u8(B, K1));
      B = vaesmcq_u8(vaeseq_u8(B, K2));
      B = vaesmcq_u8(vaeseq_u8(B, K3));
      B = vaesmcq_u8(vaeseq_u8(B, K4));
      B = vaesmcq_u8(vaeseq_u8(B, K5));
      B = vaesmcq_u8(vaeseq_u8(B, K6));
      B = vaesmcq_u8(vaeseq_u8(B, K7));
      B = vaesmcq_u8(vaeseq_u8(B, K8));
      B = veorq_u8(vaeseq_u8(B, K9), K10);
      vst1q_u8(out + 16 * i, B);
   }
}

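/*
* AES-128 Decryption
*/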
BOTAN_FN_ISA_AES void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4_last(B0, B1, B2, B3, K9, K10);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      B = vaesimcq_u8(vaesdq_u8(B, K0));
      B = vaesimcq_u8(vaesdq_u8(B, K1));
      B = vaesimcq_u8(vaesdq_u8(B, K2));
      B = vaesimcq_u8(vaesdq_u8(B, K3));
      B = vaesimcq_u8(vaesdq_u8(B, K4));
      B = vaesimcq_u8(vaesdq_u8(B, K5));
      B = vaesimcq_u8(vaesdq_u8(B, K6));
      B = vaesimcq_u8(vaesdq_u8(B, K7));
      B = vaesimcq_u8(vaesdq_u8(B, K8));
      B = veorq_u8(vaesdq_u8(B, K9), K10);
      vst1q_u8(out + 16 * i, B);
   }
}

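/*
* AES-192 Encryption
*/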
BOTAN_FN_ISA_AES void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4(B0, B1, B2, B3, K9);
      enc4(B0, B1, B2, B3, K10);
      enc4_last(B0, B1, B2, B3, K11, K12);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      B = vaesmcq_u8(vaeseq_u8(B, K0));
      B = vaesmcq_u8(vaeseq_u8(B, K1));
      B = vaesmcq_u8(vaeseq_u8(B, K2));
      B = vaesmcq_u8(vaeseq_u8(B, K3));
      B = vaesmcq_u8(vaeseq_u8(B, K4));
      B = vaesmcq_u8(vaeseq_u8(B, K5));
      B = vaesmcq_u8(vaeseq_u8(B, K6));
      B = vaesmcq_u8(vaeseq_u8(B, K7));
      B = vaesmcq_u8(vaeseq_u8(B, K8));
      B = vaesmcq_u8(vaeseq_u8(B, K9));
      B = vaesmcq_u8(vaeseq_u8(B, K10));
      B = veorq_u8(vaeseq_u8(B, K11), K12);
      vst1q_u8(out + 16 * i, B);
   }
}

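/*
* AES-192 Decryption
*/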
BOTAN_FN_ISA_AES void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4(B0, B1, B2, B3, K9);
      dec4(B0, B1, B2, B3, K10);
      dec4_last(B0, B1, B2, B3, K11, K12);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      B = vaesimcq_u8(vaesdq_u8(B, K0));
      B = vaesimcq_u8(vaesdq_u8(B, K1));
      B = vaesimcq_u8(vaesdq_u8(B, K2));
      B = vaesimcq_u8(vaesdq_u8(B, K3));
      B = vaesimcq_u8(vaesdq_u8(B, K4));
      B = vaesimcq_u8(vaesdq_u8(B, K5));
      B = vaesimcq_u8(vaesdq_u8(B, K6));
      B = vaesimcq_u8(vaesdq_u8(B, K7));
      B = vaesimcq_u8(vaesdq_u8(B, K8));
      B = vaesimcq_u8(vaesdq_u8(B, K9));
      B = vaesimcq_u8(vaesdq_u8(B, K10));
      B = veorq_u8(vaesdq_u8(B, K11), K12);
      vst1q_u8(out + 16 * i, B);
   }
}

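/*
* AES-256 Encryption
*/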
BOTAN_FN_ISA_AES void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);
   const uint8x16_t K13 = vld1q_u8(skey + 13 * 16);
   const uint8x16_t K14 = vld1q_u8(skey + 14 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4(B0, B1, B2, B3, K9);
      enc4(B0, B1, B2, B3, K10);
      enc4(B0, B1, B2, B3, K11);
      enc4(B0, B1, B2, B3, K12);
      enc4_last(B0, B1, B2, B3, K13, K14);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      B = vaesmcq_u8(vaeseq_u8(B, K0));
      B = vaesmcq_u8(vaeseq_u8(B, K1));
      B = vaesmcq_u8(vaeseq_u8(B, K2));
      B = vaesmcq_u8(vaeseq_u8(B, K3));
      B = vaesmcq_u8(vaeseq_u8(B, K4));
      B = vaesmcq_u8(vaeseq_u8(B, K5));
      B = vaesmcq_u8(vaeseq_u8(B, K6));
      B = vaesmcq_u8(vaeseq_u8(B, K7));
      B = vaesmcq_u8(vaeseq_u8(B, K8));
      B = vaesmcq_u8(vaeseq_u8(B, K9));
      B = vaesmcq_u8(vaeseq_u8(B, K10));
      B = vaesmcq_u8(vaeseq_u8(B, K11));
      B = vaesmcq_u8(vaeseq_u8(B, K12));
      B = veorq_u8(vaeseq_u8(B, K13), K14);
      vst1q_u8(out + 16 * i, B);
   }
}

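/*
* AES-256 Decryption
*/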
BOTAN_FN_ISA_AES void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);
   const uint8x16_t K13 = vld1q_u8(skey + 13 * 16);
   const uint8x16_t K14 = vld1q_u8(skey + 14 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4(B0, B1, B2, B3, K9);
      dec4(B0, B1, B2, B3, K10);
      dec4(B0, B1, B2, B3, K11);
      dec4(B0, B1, B2, B3, K12);
      dec4_last(B0, B1, B2, B3, K13, K14);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      B = vaesimcq_u8(vaesdq_u8(B, K0));
      B = vaesimcq_u8(vaesdq_u8(B, K1));
      B = vaesimcq_u8(vaesdq_u8(B, K2));
      B = vaesimcq_u8(vaesdq_u8(B, K3));
      B = vaesimcq_u8(vaesdq_u8(B, K4));
      B = vaesimcq_u8(vaesdq_u8(B, K5));
      B = vaesimcq_u8(vaesdq_u8(B, K6));
      B = vaesimcq_u8(vaesdq_u8(B, K7));
      B = vaesimcq_u8(vaesdq_u8(B, K8));
      B = vaesimcq_u8(vaesdq_u8(B, K9));
      B = vaesimcq_u8(vaesdq_u8(B, K10));
      B = vaesimcq_u8(vaesdq_u8(B, K11));
      B = vaesimcq_u8(vaesdq_u8(B, K12));
      B = veorq_u8(vaesdq_u8(B, K13), K14);
      vst1q_u8(out + 16 * i, B);
   }
}

}  // namespace Botan