#include <botan/loadstor.h>
#include <arm_neon.h> // NEON / ARMv8 Crypto intrinsics (vld1q_u8, vaeseq_u8, ...)
#define AES_ENC_4_ROUNDS(K)                \
   do                                      \
      {                                    \
      B0 = vaesmcq_u8(vaeseq_u8(B0, K));   \
      B1 = vaesmcq_u8(vaeseq_u8(B1, K));   \
      B2 = vaesmcq_u8(vaeseq_u8(B2, K));   \
      B3 = vaesmcq_u8(vaeseq_u8(B3, K));   \
      } while(0)

#define AES_ENC_4_LAST_ROUNDS(K, K2)       \
   do                                      \
      {                                    \
      B0 = veorq_u8(vaeseq_u8(B0, K), K2); \
      B1 = veorq_u8(vaeseq_u8(B1, K), K2); \
      B2 = veorq_u8(vaeseq_u8(B2, K), K2); \
      B3 = veorq_u8(vaeseq_u8(B3, K), K2); \
      } while(0)

#define AES_DEC_4_ROUNDS(K)                \
   do                                      \
      {                                    \
      B0 = vaesimcq_u8(vaesdq_u8(B0, K));  \
      B1 = vaesimcq_u8(vaesdq_u8(B1, K));  \
      B2 = vaesimcq_u8(vaesdq_u8(B2, K));  \
      B3 = vaesimcq_u8(vaesdq_u8(B3, K));  \
      } while(0)

#define AES_DEC_4_LAST_ROUNDS(K, K2)       \
   do                                      \
      {                                    \
      B0 = veorq_u8(vaesdq_u8(B0, K), K2); \
      B1 = veorq_u8(vaesdq_u8(B1, K), K2); \
      B2 = veorq_u8(vaesdq_u8(B2, K), K2); \
      B3 = veorq_u8(vaesdq_u8(B3, K), K2); \
      } while(0)
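
/*
* Note on the 4-block macros above: keeping four independent AES states
* in flight hides the latency of the AES instructions on out-of-order
* cores, and emitting each vaeseq_u8/vaesmcq_u8 (resp. vaesdq_u8/
* vaesimcq_u8) pair back to back lets many ARMv8 cores fuse the pair
* into a single macro-op.
*/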

/*
* AES-128 Encryption
*/
BOTAN_FUNC_ISA("+crypto")
void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0*16);
   const uint8x16_t K1 = vld1q_u8(skey + 1*16);
   const uint8x16_t K2 = vld1q_u8(skey + 2*16);
   const uint8x16_t K3 = vld1q_u8(skey + 3*16);
   const uint8x16_t K4 = vld1q_u8(skey + 4*16);
   const uint8x16_t K5 = vld1q_u8(skey + 5*16);
   const uint8x16_t K6 = vld1q_u8(skey + 6*16);
   const uint8x16_t K7 = vld1q_u8(skey + 7*16);
   const uint8x16_t K8 = vld1q_u8(skey + 8*16);
   const uint8x16_t K9 = vld1q_u8(skey + 9*16);
   const uint8x16_t K10 = vld1q_u8(skey + 10*16);

   // Process 4 blocks in parallel
   while(blocks >= 4)
      {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in+16);
      uint8x16_t B2 = vld1q_u8(in+32);
      uint8x16_t B3 = vld1q_u8(in+48);

      AES_ENC_4_ROUNDS(K0);
      AES_ENC_4_ROUNDS(K1);
      AES_ENC_4_ROUNDS(K2);
      AES_ENC_4_ROUNDS(K3);
      AES_ENC_4_ROUNDS(K4);
      AES_ENC_4_ROUNDS(K5);
      AES_ENC_4_ROUNDS(K6);
      AES_ENC_4_ROUNDS(K7);
      AES_ENC_4_ROUNDS(K8);
      AES_ENC_4_LAST_ROUNDS(K9, K10);

      vst1q_u8(out, B0);
      vst1q_u8(out+16, B1);
      vst1q_u8(out+32, B2);
      vst1q_u8(out+48, B3);

      in += 16*4;
      out += 16*4;
      blocks -= 4;
      }

   // Remaining blocks, one at a time
   for(size_t i = 0; i != blocks; ++i)
      {
      uint8x16_t B = vld1q_u8(in+16*i);
      B = vaesmcq_u8(vaeseq_u8(B, K0));
      B = vaesmcq_u8(vaeseq_u8(B, K1));
      B = vaesmcq_u8(vaeseq_u8(B, K2));
      B = vaesmcq_u8(vaeseq_u8(B, K3));
      B = vaesmcq_u8(vaeseq_u8(B, K4));
      B = vaesmcq_u8(vaeseq_u8(B, K5));
      B = vaesmcq_u8(vaeseq_u8(B, K6));
      B = vaesmcq_u8(vaeseq_u8(B, K7));
      B = vaesmcq_u8(vaeseq_u8(B, K8));
      B = veorq_u8(vaeseq_u8(B, K9), K10);
      vst1q_u8(out+16*i, B);
      }
   }

/*
* AES-128 Decryption
*/
BOTAN_FUNC_ISA("+crypto")
void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0*16);
   const uint8x16_t K1 = vld1q_u8(skey + 1*16);
   const uint8x16_t K2 = vld1q_u8(skey + 2*16);
   const uint8x16_t K3 = vld1q_u8(skey + 3*16);
   const uint8x16_t K4 = vld1q_u8(skey + 4*16);
   const uint8x16_t K5 = vld1q_u8(skey + 5*16);
   const uint8x16_t K6 = vld1q_u8(skey + 6*16);
   const uint8x16_t K7 = vld1q_u8(skey + 7*16);
   const uint8x16_t K8 = vld1q_u8(skey + 8*16);
   const uint8x16_t K9 = vld1q_u8(skey + 9*16);
   const uint8x16_t K10 = vld1q_u8(skey + 10*16);

   // Process 4 blocks in parallel
   while(blocks >= 4)
      {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in+16);
      uint8x16_t B2 = vld1q_u8(in+32);
      uint8x16_t B3 = vld1q_u8(in+48);

      AES_DEC_4_ROUNDS(K0);
      AES_DEC_4_ROUNDS(K1);
      AES_DEC_4_ROUNDS(K2);
      AES_DEC_4_ROUNDS(K3);
      AES_DEC_4_ROUNDS(K4);
      AES_DEC_4_ROUNDS(K5);
      AES_DEC_4_ROUNDS(K6);
      AES_DEC_4_ROUNDS(K7);
      AES_DEC_4_ROUNDS(K8);
      AES_DEC_4_LAST_ROUNDS(K9, K10);

      vst1q_u8(out, B0);
      vst1q_u8(out+16, B1);
      vst1q_u8(out+32, B2);
      vst1q_u8(out+48, B3);

      in += 16*4;
      out += 16*4;
      blocks -= 4;
      }

   // Remaining blocks, one at a time
   for(size_t i = 0; i != blocks; ++i)
      {
      uint8x16_t B = vld1q_u8(in+16*i);
      B = vaesimcq_u8(vaesdq_u8(B, K0));
      B = vaesimcq_u8(vaesdq_u8(B, K1));
      B = vaesimcq_u8(vaesdq_u8(B, K2));
      B = vaesimcq_u8(vaesdq_u8(B, K3));
      B = vaesimcq_u8(vaesdq_u8(B, K4));
      B = vaesimcq_u8(vaesdq_u8(B, K5));
      B = vaesimcq_u8(vaesdq_u8(B, K6));
      B = vaesimcq_u8(vaesdq_u8(B, K7));
      B = vaesimcq_u8(vaesdq_u8(B, K8));
      B = veorq_u8(vaesdq_u8(B, K9), K10);
      vst1q_u8(out+16*i, B);
      }
   }

/*
* AES-192 Encryption
*/
BOTAN_FUNC_ISA("+crypto")
void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0*16);
   const uint8x16_t K1 = vld1q_u8(skey + 1*16);
   const uint8x16_t K2 = vld1q_u8(skey + 2*16);
   const uint8x16_t K3 = vld1q_u8(skey + 3*16);
   const uint8x16_t K4 = vld1q_u8(skey + 4*16);
   const uint8x16_t K5 = vld1q_u8(skey + 5*16);
   const uint8x16_t K6 = vld1q_u8(skey + 6*16);
   const uint8x16_t K7 = vld1q_u8(skey + 7*16);
   const uint8x16_t K8 = vld1q_u8(skey + 8*16);
   const uint8x16_t K9 = vld1q_u8(skey + 9*16);
   const uint8x16_t K10 = vld1q_u8(skey + 10*16);
   const uint8x16_t K11 = vld1q_u8(skey + 11*16);
   const uint8x16_t K12 = vld1q_u8(skey + 12*16);

   // Process 4 blocks in parallel
   while(blocks >= 4)
      {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in+16);
      uint8x16_t B2 = vld1q_u8(in+32);
      uint8x16_t B3 = vld1q_u8(in+48);

      AES_ENC_4_ROUNDS(K0);
      AES_ENC_4_ROUNDS(K1);
      AES_ENC_4_ROUNDS(K2);
      AES_ENC_4_ROUNDS(K3);
      AES_ENC_4_ROUNDS(K4);
      AES_ENC_4_ROUNDS(K5);
      AES_ENC_4_ROUNDS(K6);
      AES_ENC_4_ROUNDS(K7);
      AES_ENC_4_ROUNDS(K8);
      AES_ENC_4_ROUNDS(K9);
      AES_ENC_4_ROUNDS(K10);
      AES_ENC_4_LAST_ROUNDS(K11, K12);

      vst1q_u8(out, B0);
      vst1q_u8(out+16, B1);
      vst1q_u8(out+32, B2);
      vst1q_u8(out+48, B3);

      in += 16*4;
      out += 16*4;
      blocks -= 4;
      }

   // Remaining blocks, one at a time
   for(size_t i = 0; i != blocks; ++i)
      {
      uint8x16_t B = vld1q_u8(in+16*i);
      B = vaesmcq_u8(vaeseq_u8(B, K0));
      B = vaesmcq_u8(vaeseq_u8(B, K1));
      B = vaesmcq_u8(vaeseq_u8(B, K2));
      B = vaesmcq_u8(vaeseq_u8(B, K3));
      B = vaesmcq_u8(vaeseq_u8(B, K4));
      B = vaesmcq_u8(vaeseq_u8(B, K5));
      B = vaesmcq_u8(vaeseq_u8(B, K6));
      B = vaesmcq_u8(vaeseq_u8(B, K7));
      B = vaesmcq_u8(vaeseq_u8(B, K8));
      B = vaesmcq_u8(vaeseq_u8(B, K9));
      B = vaesmcq_u8(vaeseq_u8(B, K10));
      B = veorq_u8(vaeseq_u8(B, K11), K12);
      vst1q_u8(out+16*i, B);
      }
   }

/*
* AES-192 Decryption
*/
BOTAN_FUNC_ISA("+crypto")
void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0*16);
   const uint8x16_t K1 = vld1q_u8(skey + 1*16);
   const uint8x16_t K2 = vld1q_u8(skey + 2*16);
   const uint8x16_t K3 = vld1q_u8(skey + 3*16);
   const uint8x16_t K4 = vld1q_u8(skey + 4*16);
   const uint8x16_t K5 = vld1q_u8(skey + 5*16);
   const uint8x16_t K6 = vld1q_u8(skey + 6*16);
   const uint8x16_t K7 = vld1q_u8(skey + 7*16);
   const uint8x16_t K8 = vld1q_u8(skey + 8*16);
   const uint8x16_t K9 = vld1q_u8(skey + 9*16);
   const uint8x16_t K10 = vld1q_u8(skey + 10*16);
   const uint8x16_t K11 = vld1q_u8(skey + 11*16);
   const uint8x16_t K12 = vld1q_u8(skey + 12*16);

   // Process 4 blocks in parallel
   while(blocks >= 4)
      {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in+16);
      uint8x16_t B2 = vld1q_u8(in+32);
      uint8x16_t B3 = vld1q_u8(in+48);

      AES_DEC_4_ROUNDS(K0);
      AES_DEC_4_ROUNDS(K1);
      AES_DEC_4_ROUNDS(K2);
      AES_DEC_4_ROUNDS(K3);
      AES_DEC_4_ROUNDS(K4);
      AES_DEC_4_ROUNDS(K5);
      AES_DEC_4_ROUNDS(K6);
      AES_DEC_4_ROUNDS(K7);
      AES_DEC_4_ROUNDS(K8);
      AES_DEC_4_ROUNDS(K9);
      AES_DEC_4_ROUNDS(K10);
      AES_DEC_4_LAST_ROUNDS(K11, K12);

      vst1q_u8(out, B0);
      vst1q_u8(out+16, B1);
      vst1q_u8(out+32, B2);
      vst1q_u8(out+48, B3);

      in += 16*4;
      out += 16*4;
      blocks -= 4;
      }

   // Remaining blocks, one at a time
   for(size_t i = 0; i != blocks; ++i)
      {
      uint8x16_t B = vld1q_u8(in+16*i);
      B = vaesimcq_u8(vaesdq_u8(B, K0));
      B = vaesimcq_u8(vaesdq_u8(B, K1));
      B = vaesimcq_u8(vaesdq_u8(B, K2));
      B = vaesimcq_u8(vaesdq_u8(B, K3));
      B = vaesimcq_u8(vaesdq_u8(B, K4));
      B = vaesimcq_u8(vaesdq_u8(B, K5));
      B = vaesimcq_u8(vaesdq_u8(B, K6));
      B = vaesimcq_u8(vaesdq_u8(B, K7));
      B = vaesimcq_u8(vaesdq_u8(B, K8));
      B = vaesimcq_u8(vaesdq_u8(B, K9));
      B = vaesimcq_u8(vaesdq_u8(B, K10));
      B = veorq_u8(vaesdq_u8(B, K11), K12);
      vst1q_u8(out+16*i, B);
      }
   }

/*
* AES-256 Encryption
*/
BOTAN_FUNC_ISA("+crypto")
void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0*16);
   const uint8x16_t K1 = vld1q_u8(skey + 1*16);
   const uint8x16_t K2 = vld1q_u8(skey + 2*16);
   const uint8x16_t K3 = vld1q_u8(skey + 3*16);
   const uint8x16_t K4 = vld1q_u8(skey + 4*16);
   const uint8x16_t K5 = vld1q_u8(skey + 5*16);
   const uint8x16_t K6 = vld1q_u8(skey + 6*16);
   const uint8x16_t K7 = vld1q_u8(skey + 7*16);
   const uint8x16_t K8 = vld1q_u8(skey + 8*16);
   const uint8x16_t K9 = vld1q_u8(skey + 9*16);
   const uint8x16_t K10 = vld1q_u8(skey + 10*16);
   const uint8x16_t K11 = vld1q_u8(skey + 11*16);
   const uint8x16_t K12 = vld1q_u8(skey + 12*16);
   const uint8x16_t K13 = vld1q_u8(skey + 13*16);
   const uint8x16_t K14 = vld1q_u8(skey + 14*16);

   // Process 4 blocks in parallel
   while(blocks >= 4)
      {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in+16);
      uint8x16_t B2 = vld1q_u8(in+32);
      uint8x16_t B3 = vld1q_u8(in+48);

      AES_ENC_4_ROUNDS(K0);
      AES_ENC_4_ROUNDS(K1);
      AES_ENC_4_ROUNDS(K2);
      AES_ENC_4_ROUNDS(K3);
      AES_ENC_4_ROUNDS(K4);
      AES_ENC_4_ROUNDS(K5);
      AES_ENC_4_ROUNDS(K6);
      AES_ENC_4_ROUNDS(K7);
      AES_ENC_4_ROUNDS(K8);
      AES_ENC_4_ROUNDS(K9);
      AES_ENC_4_ROUNDS(K10);
      AES_ENC_4_ROUNDS(K11);
      AES_ENC_4_ROUNDS(K12);
      AES_ENC_4_LAST_ROUNDS(K13, K14);

      vst1q_u8(out, B0);
      vst1q_u8(out+16, B1);
      vst1q_u8(out+32, B2);
      vst1q_u8(out+48, B3);

      in += 16*4;
      out += 16*4;
      blocks -= 4;
      }

   // Remaining blocks, one at a time
   for(size_t i = 0; i != blocks; ++i)
      {
      uint8x16_t B = vld1q_u8(in+16*i);
      B = vaesmcq_u8(vaeseq_u8(B, K0));
      B = vaesmcq_u8(vaeseq_u8(B, K1));
      B = vaesmcq_u8(vaeseq_u8(B, K2));
      B = vaesmcq_u8(vaeseq_u8(B, K3));
      B = vaesmcq_u8(vaeseq_u8(B, K4));
      B = vaesmcq_u8(vaeseq_u8(B, K5));
      B = vaesmcq_u8(vaeseq_u8(B, K6));
      B = vaesmcq_u8(vaeseq_u8(B, K7));
      B = vaesmcq_u8(vaeseq_u8(B, K8));
      B = vaesmcq_u8(vaeseq_u8(B, K9));
      B = vaesmcq_u8(vaeseq_u8(B, K10));
      B = vaesmcq_u8(vaeseq_u8(B, K11));
      B = vaesmcq_u8(vaeseq_u8(B, K12));
      B = veorq_u8(vaeseq_u8(B, K13), K14);
      vst1q_u8(out+16*i, B);
      }
   }

/*
* AES-256 Decryption
*/
BOTAN_FUNC_ISA("+crypto")
void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0*16);
   const uint8x16_t K1 = vld1q_u8(skey + 1*16);
   const uint8x16_t K2 = vld1q_u8(skey + 2*16);
   const uint8x16_t K3 = vld1q_u8(skey + 3*16);
   const uint8x16_t K4 = vld1q_u8(skey + 4*16);
   const uint8x16_t K5 = vld1q_u8(skey + 5*16);
   const uint8x16_t K6 = vld1q_u8(skey + 6*16);
   const uint8x16_t K7 = vld1q_u8(skey + 7*16);
   const uint8x16_t K8 = vld1q_u8(skey + 8*16);
   const uint8x16_t K9 = vld1q_u8(skey + 9*16);
   const uint8x16_t K10 = vld1q_u8(skey + 10*16);
   const uint8x16_t K11 = vld1q_u8(skey + 11*16);
   const uint8x16_t K12 = vld1q_u8(skey + 12*16);
   const uint8x16_t K13 = vld1q_u8(skey + 13*16);
   const uint8x16_t K14 = vld1q_u8(skey + 14*16);

   // Process 4 blocks in parallel
   while(blocks >= 4)
      {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in+16);
      uint8x16_t B2 = vld1q_u8(in+32);
      uint8x16_t B3 = vld1q_u8(in+48);

      AES_DEC_4_ROUNDS(K0);
      AES_DEC_4_ROUNDS(K1);
      AES_DEC_4_ROUNDS(K2);
      AES_DEC_4_ROUNDS(K3);
      AES_DEC_4_ROUNDS(K4);
      AES_DEC_4_ROUNDS(K5);
      AES_DEC_4_ROUNDS(K6);
      AES_DEC_4_ROUNDS(K7);
      AES_DEC_4_ROUNDS(K8);
      AES_DEC_4_ROUNDS(K9);
      AES_DEC_4_ROUNDS(K10);
      AES_DEC_4_ROUNDS(K11);
      AES_DEC_4_ROUNDS(K12);
      AES_DEC_4_LAST_ROUNDS(K13, K14);

      vst1q_u8(out, B0);
      vst1q_u8(out+16, B1);
      vst1q_u8(out+32, B2);
      vst1q_u8(out+48, B3);

      in += 16*4;
      out += 16*4;
      blocks -= 4;
      }

   // Remaining blocks, one at a time
   for(size_t i = 0; i != blocks; ++i)
      {
      uint8x16_t B = vld1q_u8(in+16*i);
      B = vaesimcq_u8(vaesdq_u8(B, K0));
      B = vaesimcq_u8(vaesdq_u8(B, K1));
      B = vaesimcq_u8(vaesdq_u8(B, K2));
      B = vaesimcq_u8(vaesdq_u8(B, K3));
      B = vaesimcq_u8(vaesdq_u8(B, K4));
      B = vaesimcq_u8(vaesdq_u8(B, K5));
      B = vaesimcq_u8(vaesdq_u8(B, K6));
      B = vaesimcq_u8(vaesdq_u8(B, K7));
      B = vaesimcq_u8(vaesdq_u8(B, K8));
      B = vaesimcq_u8(vaesdq_u8(B, K9));
      B = vaesimcq_u8(vaesdq_u8(B, K10));
      B = vaesimcq_u8(vaesdq_u8(B, K11));
      B = vaesimcq_u8(vaesdq_u8(B, K12));
      B = veorq_u8(vaesdq_u8(B, K13), K14);
      vst1q_u8(out+16*i, B);
      }
   }

#undef AES_ENC_4_ROUNDS
#undef AES_ENC_4_LAST_ROUNDS
#undef AES_DEC_4_ROUNDS
#undef AES_DEC_4_LAST_ROUNDS
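
/*
* These hw_aes_*_n routines are not called directly; the surrounding AES
* implementation selects them at runtime when CPU feature detection
* reports the ARMv8 AES extension, and otherwise falls back to the
* portable code path.
*/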