#include <botan/internal/aes.h>

#include <botan/internal/isa_extn.h>
#include <botan/internal/loadstor.h>

#include <arm_neon.h>

namespace Botan {

namespace AES_AARCH64 {
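/*
* Single-block and 4-block AES round helpers built on the ARMv8 Crypto
* Extension intrinsics (AESE/AESMC for encryption, AESD/AESIMC for decryption).
*/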
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc(uint8x16_t& B, uint8x16_t K) {
   B = vaesmcq_u8(vaeseq_u8(B, K));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc4(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K) {
   B0 = vaesmcq_u8(vaeseq_u8(B0, K));
   B1 = vaesmcq_u8(vaeseq_u8(B1, K));
   B2 = vaesmcq_u8(vaeseq_u8(B2, K));
   B3 = vaesmcq_u8(vaeseq_u8(B3, K));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc_last(uint8x16_t& B, uint8x16_t K, uint8x16_t K2) {
   B = veorq_u8(vaeseq_u8(B, K), K2);
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc4_last(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K, uint8x16_t K2) {
   B0 = veorq_u8(vaeseq_u8(B0, K), K2);
   B1 = veorq_u8(vaeseq_u8(B1, K), K2);
   B2 = veorq_u8(vaeseq_u8(B2, K), K2);
   B3 = veorq_u8(vaeseq_u8(B3, K), K2);
}
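/*
* Decryption round helpers: AESD followed by AESIMC (inverse MixColumns).
*/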
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec(uint8x16_t& B, uint8x16_t K) {
   B = vaesimcq_u8(vaesdq_u8(B, K));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec4(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K) {
   B0 = vaesimcq_u8(vaesdq_u8(B0, K));
   B1 = vaesimcq_u8(vaesdq_u8(B1, K));
   B2 = vaesimcq_u8(vaesdq_u8(B2, K));
   B3 = vaesimcq_u8(vaesdq_u8(B3, K));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec_last(uint8x16_t& B, uint8x16_t K, uint8x16_t K2) {
   B = veorq_u8(vaesdq_u8(B, K), K2);
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec4_last(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K, uint8x16_t K2) {
   B0 = veorq_u8(vaesdq_u8(B0, K), K2);
   B1 = veorq_u8(vaesdq_u8(B1, K), K2);
   B2 = veorq_u8(vaesdq_u8(B2, K), K2);
   B3 = veorq_u8(vaesdq_u8(B3, K), K2);
}

}  // namespace AES_AARCH64
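/*
* AES-128 Encryption
*/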
BOTAN_FN_ISA_AES
void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   using namespace AES_AARCH64;

   // Process 4 blocks in parallel while enough input remains
   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4_last(B0, B1, B2, B3, K9, K10);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }
   // Handle any remaining blocks one at a time
   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      enc(B, K0);
      enc(B, K1);
      enc(B, K2);
      enc(B, K3);
      enc(B, K4);
      enc(B, K5);
      enc(B, K6);
      enc(B, K7);
      enc(B, K8);
      enc_last(B, K9, K10);
      vst1q_u8(out + 16 * i, B);
   }
}
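/*
* AES-128 Decryption
*/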
BOTAN_FN_ISA_AES
void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   using namespace AES_AARCH64;

   // Process 4 blocks in parallel while enough input remains
   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4_last(B0, B1, B2, B3, K9, K10);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }
   // Handle any remaining blocks one at a time
   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      dec(B, K0);
      dec(B, K1);
      dec(B, K2);
      dec(B, K3);
      dec(B, K4);
      dec(B, K5);
      dec(B, K6);
      dec(B, K7);
      dec(B, K8);
      dec_last(B, K9, K10);
      vst1q_u8(out + 16 * i, B);
   }
}
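/*
* AES-192 Encryption
*/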
BOTAN_FN_ISA_AES
void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);
   using namespace AES_AARCH64;

   // Process 4 blocks in parallel while enough input remains
   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4(B0, B1, B2, B3, K9);
      enc4(B0, B1, B2, B3, K10);
      enc4_last(B0, B1, B2, B3, K11, K12);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }
   // Handle any remaining blocks one at a time
   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      enc(B, K0);
      enc(B, K1);
      enc(B, K2);
      enc(B, K3);
      enc(B, K4);
      enc(B, K5);
      enc(B, K6);
      enc(B, K7);
      enc(B, K8);
      enc(B, K9);
      enc(B, K10);
      enc_last(B, K11, K12);
      vst1q_u8(out + 16 * i, B);
   }
}
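/*
* AES-192 Decryption
*/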
BOTAN_FN_ISA_AES
void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);
   using namespace AES_AARCH64;

   // Process 4 blocks in parallel while enough input remains
   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4(B0, B1, B2, B3, K9);
      dec4(B0, B1, B2, B3, K10);
      dec4_last(B0, B1, B2, B3, K11, K12);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }
   // Handle any remaining blocks one at a time
   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      dec(B, K0);
      dec(B, K1);
      dec(B, K2);
      dec(B, K3);
      dec(B, K4);
      dec(B, K5);
      dec(B, K6);
      dec(B, K7);
      dec(B, K8);
      dec(B, K9);
      dec(B, K10);
      dec_last(B, K11, K12);
      vst1q_u8(out + 16 * i, B);
   }
}
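/*
* AES-256 Encryption
*/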
BOTAN_FN_ISA_AES
void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);
   const uint8x16_t K13 = vld1q_u8(skey + 13 * 16);
   const uint8x16_t K14 = vld1q_u8(skey + 14 * 16);
   using namespace AES_AARCH64;

   // Process 4 blocks in parallel while enough input remains
   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4(B0, B1, B2, B3, K9);
      enc4(B0, B1, B2, B3, K10);
      enc4(B0, B1, B2, B3, K11);
      enc4(B0, B1, B2, B3, K12);
      enc4_last(B0, B1, B2, B3, K13, K14);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }
   // Handle any remaining blocks one at a time
   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      enc(B, K0);
      enc(B, K1);
      enc(B, K2);
      enc(B, K3);
      enc(B, K4);
      enc(B, K5);
      enc(B, K6);
      enc(B, K7);
      enc(B, K8);
      enc(B, K9);
      enc(B, K10);
      enc(B, K11);
      enc(B, K12);
      enc_last(B, K13, K14);
      vst1q_u8(out + 16 * i, B);
   }
}
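/*
* AES-256 Decryption
*/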
BOTAN_FN_ISA_AES
void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);
   const uint8x16_t K13 = vld1q_u8(skey + 13 * 16);
   const uint8x16_t K14 = vld1q_u8(skey + 14 * 16);
   using namespace AES_AARCH64;

   // Process 4 blocks in parallel while enough input remains
   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4(B0, B1, B2, B3, K9);
      dec4(B0, B1, B2, B3, K10);
      dec4(B0, B1, B2, B3, K11);
      dec4(B0, B1, B2, B3, K12);
      dec4_last(B0, B1, B2, B3, K13, K14);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }
   // Handle any remaining blocks one at a time
   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      dec(B, K0);
      dec(B, K1);
      dec(B, K2);
      dec(B, K3);
      dec(B, K4);
      dec(B, K5);
      dec(B, K6);
      dec(B, K7);
      dec(B, K8);
      dec(B, K9);
      dec(B, K10);
      dec(B, K11);
      dec(B, K12);
      dec_last(B, K13, K14);
      vst1q_u8(out + 16 * i, B);
   }
}

}  // namespace Botan