Botan 2.19.1
Crypto and TLS for C++11
aes_armv8.cpp
/*
* AES using ARMv8
* Contributed by Jeffrey Walton
*
* Further changes
* (C) 2017,2018 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/

#include <botan/aes.h>
#include <botan/loadstor.h>
#include <arm_neon.h>

namespace Botan {

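/*
* On ARMv8, vaeseq_u8 (AESE) performs AddRoundKey + SubBytes + ShiftRows and
* vaesmcq_u8 (AESMC) performs MixColumns, so the chained pair computes one
* full AES encryption round; vaesdq_u8/vaesimcq_u8 are the inverse-cipher
* counterparts. The macros below apply one round to four blocks at a time,
* giving the CPU four independent instruction chains to overlap and thereby
* hiding the latency of the AES instructions.
*/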

#define AES_ENC_4_ROUNDS(K)                \
   do                                      \
      {                                    \
      B0 = vaesmcq_u8(vaeseq_u8(B0, K));   \
      B1 = vaesmcq_u8(vaeseq_u8(B1, K));   \
      B2 = vaesmcq_u8(vaeseq_u8(B2, K));   \
      B3 = vaesmcq_u8(vaeseq_u8(B3, K));   \
      } while(0)

#define AES_ENC_4_LAST_ROUNDS(K, K2)       \
   do                                      \
      {                                    \
      B0 = veorq_u8(vaeseq_u8(B0, K), K2); \
      B1 = veorq_u8(vaeseq_u8(B1, K), K2); \
      B2 = veorq_u8(vaeseq_u8(B2, K), K2); \
      B3 = veorq_u8(vaeseq_u8(B3, K), K2); \
      } while(0)

#define AES_DEC_4_ROUNDS(K)                \
   do                                      \
      {                                    \
      B0 = vaesimcq_u8(vaesdq_u8(B0, K));  \
      B1 = vaesimcq_u8(vaesdq_u8(B1, K));  \
      B2 = vaesimcq_u8(vaesdq_u8(B2, K));  \
      B3 = vaesimcq_u8(vaesdq_u8(B3, K));  \
      } while(0)

#define AES_DEC_4_LAST_ROUNDS(K, K2)       \
   do                                      \
      {                                    \
      B0 = veorq_u8(vaesdq_u8(B0, K), K2); \
      B1 = veorq_u8(vaesdq_u8(B1, K), K2); \
      B2 = veorq_u8(vaesdq_u8(B2, K), K2); \
      B3 = veorq_u8(vaesdq_u8(B3, K), K2); \
      } while(0)

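/*
* The final AES round omits MixColumns, so the *_LAST_ROUNDS macros replace
* vaesmcq_u8/vaesimcq_u8 with a plain XOR (veorq_u8) of the final round key.
*/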

/*
* AES-128 Encryption
*/
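// BOTAN_FUNC_ISA("+crypto") tags each function with a target attribute
// (on GCC/Clang, __attribute__((target("+crypto")))) so the AES instructions
// can be used without compiling the whole translation unit with the crypto
// extension enabled.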
BOTAN_FUNC_ISA("+crypto")
void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0*16);
   const uint8x16_t K1 = vld1q_u8(skey + 1*16);
   const uint8x16_t K2 = vld1q_u8(skey + 2*16);
   const uint8x16_t K3 = vld1q_u8(skey + 3*16);
   const uint8x16_t K4 = vld1q_u8(skey + 4*16);
   const uint8x16_t K5 = vld1q_u8(skey + 5*16);
   const uint8x16_t K6 = vld1q_u8(skey + 6*16);
   const uint8x16_t K7 = vld1q_u8(skey + 7*16);
   const uint8x16_t K8 = vld1q_u8(skey + 8*16);
   const uint8x16_t K9 = vld1q_u8(skey + 9*16);
   const uint8x16_t K10 = vld1q_u8(skey + 10*16);

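   // Process four blocks per iteration so the four AESE/AESMC dependency
   // chains can execute in parallel.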
   while(blocks >= 4)
      {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in+16);
      uint8x16_t B2 = vld1q_u8(in+32);
      uint8x16_t B3 = vld1q_u8(in+48);

      AES_ENC_4_ROUNDS(K0);
      AES_ENC_4_ROUNDS(K1);
      AES_ENC_4_ROUNDS(K2);
      AES_ENC_4_ROUNDS(K3);
      AES_ENC_4_ROUNDS(K4);
      AES_ENC_4_ROUNDS(K5);
      AES_ENC_4_ROUNDS(K6);
      AES_ENC_4_ROUNDS(K7);
      AES_ENC_4_ROUNDS(K8);
      AES_ENC_4_LAST_ROUNDS(K9, K10);

      vst1q_u8(out, B0);
      vst1q_u8(out+16, B1);
      vst1q_u8(out+32, B2);
      vst1q_u8(out+48, B3);

      in += 16*4;
      out += 16*4;
      blocks -= 4;
      }

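   // Handle any remaining (<4) blocks one at a time.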
   for(size_t i = 0; i != blocks; ++i)
      {
      uint8x16_t B = vld1q_u8(in+16*i);
      B = vaesmcq_u8(vaeseq_u8(B, K0));
      B = vaesmcq_u8(vaeseq_u8(B, K1));
      B = vaesmcq_u8(vaeseq_u8(B, K2));
      B = vaesmcq_u8(vaeseq_u8(B, K3));
      B = vaesmcq_u8(vaeseq_u8(B, K4));
      B = vaesmcq_u8(vaeseq_u8(B, K5));
      B = vaesmcq_u8(vaeseq_u8(B, K6));
      B = vaesmcq_u8(vaeseq_u8(B, K7));
      B = vaesmcq_u8(vaeseq_u8(B, K8));
      B = veorq_u8(vaeseq_u8(B, K9), K10);
      vst1q_u8(out+16*i, B);
      }
   }
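
/*
* A minimal usage sketch (not part of this file): on AArch64 builds this
* routine is reached through Botan's public BlockCipher interface. Assuming
* the standard Botan 2 API, the FIPS-197 Appendix C.1 vector can be checked
* like so:
*
*   #include <botan/block_cipher.h>
*   #include <botan/hex.h>
*
*   auto aes = Botan::BlockCipher::create_or_throw("AES-128");
*   aes->set_key(Botan::hex_decode("000102030405060708090A0B0C0D0E0F"));
*   std::vector<uint8_t> block =
*      Botan::hex_decode("00112233445566778899AABBCCDDEEFF");
*   aes->encrypt(block);
*   // block now holds 69C4E0D86A7B0430D8CDB78070B4C55A
*/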

/*
* AES-128 Decryption
*/
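// Note: m_DK is the "equivalent inverse cipher" schedule: the round keys are
// stored in the reverse of encryption order, with InvMixColumns already
// applied to the interior keys, as the AESD/AESIMC sequence below expects.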
BOTAN_FUNC_ISA("+crypto")
void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0*16);
   const uint8x16_t K1 = vld1q_u8(skey + 1*16);
   const uint8x16_t K2 = vld1q_u8(skey + 2*16);
   const uint8x16_t K3 = vld1q_u8(skey + 3*16);
   const uint8x16_t K4 = vld1q_u8(skey + 4*16);
   const uint8x16_t K5 = vld1q_u8(skey + 5*16);
   const uint8x16_t K6 = vld1q_u8(skey + 6*16);
   const uint8x16_t K7 = vld1q_u8(skey + 7*16);
   const uint8x16_t K8 = vld1q_u8(skey + 8*16);
   const uint8x16_t K9 = vld1q_u8(skey + 9*16);
   const uint8x16_t K10 = vld1q_u8(skey + 10*16);

   while(blocks >= 4)
      {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in+16);
      uint8x16_t B2 = vld1q_u8(in+32);
      uint8x16_t B3 = vld1q_u8(in+48);

      AES_DEC_4_ROUNDS(K0);
      AES_DEC_4_ROUNDS(K1);
      AES_DEC_4_ROUNDS(K2);
      AES_DEC_4_ROUNDS(K3);
      AES_DEC_4_ROUNDS(K4);
      AES_DEC_4_ROUNDS(K5);
      AES_DEC_4_ROUNDS(K6);
      AES_DEC_4_ROUNDS(K7);
      AES_DEC_4_ROUNDS(K8);
      AES_DEC_4_LAST_ROUNDS(K9, K10);

      vst1q_u8(out, B0);
      vst1q_u8(out+16, B1);
      vst1q_u8(out+32, B2);
      vst1q_u8(out+48, B3);

      in += 16*4;
      out += 16*4;
      blocks -= 4;
      }

   for(size_t i = 0; i != blocks; ++i)
      {
      uint8x16_t B = vld1q_u8(in+16*i);
      B = vaesimcq_u8(vaesdq_u8(B, K0));
      B = vaesimcq_u8(vaesdq_u8(B, K1));
      B = vaesimcq_u8(vaesdq_u8(B, K2));
      B = vaesimcq_u8(vaesdq_u8(B, K3));
      B = vaesimcq_u8(vaesdq_u8(B, K4));
      B = vaesimcq_u8(vaesdq_u8(B, K5));
      B = vaesimcq_u8(vaesdq_u8(B, K6));
      B = vaesimcq_u8(vaesdq_u8(B, K7));
      B = vaesimcq_u8(vaesdq_u8(B, K8));
      B = veorq_u8(vaesdq_u8(B, K9), K10);
      vst1q_u8(out+16*i, B);
      }
   }

/*
* AES-192 Encryption
*/
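// Same structure as AES-128 above, but with 12 rounds and 13 subkeys (K0..K12).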
BOTAN_FUNC_ISA("+crypto")
void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0*16);
   const uint8x16_t K1 = vld1q_u8(skey + 1*16);
   const uint8x16_t K2 = vld1q_u8(skey + 2*16);
   const uint8x16_t K3 = vld1q_u8(skey + 3*16);
   const uint8x16_t K4 = vld1q_u8(skey + 4*16);
   const uint8x16_t K5 = vld1q_u8(skey + 5*16);
   const uint8x16_t K6 = vld1q_u8(skey + 6*16);
   const uint8x16_t K7 = vld1q_u8(skey + 7*16);
   const uint8x16_t K8 = vld1q_u8(skey + 8*16);
   const uint8x16_t K9 = vld1q_u8(skey + 9*16);
   const uint8x16_t K10 = vld1q_u8(skey + 10*16);
   const uint8x16_t K11 = vld1q_u8(skey + 11*16);
   const uint8x16_t K12 = vld1q_u8(skey + 12*16);

   while(blocks >= 4)
      {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in+16);
      uint8x16_t B2 = vld1q_u8(in+32);
      uint8x16_t B3 = vld1q_u8(in+48);

      AES_ENC_4_ROUNDS(K0);
      AES_ENC_4_ROUNDS(K1);
      AES_ENC_4_ROUNDS(K2);
      AES_ENC_4_ROUNDS(K3);
      AES_ENC_4_ROUNDS(K4);
      AES_ENC_4_ROUNDS(K5);
      AES_ENC_4_ROUNDS(K6);
      AES_ENC_4_ROUNDS(K7);
      AES_ENC_4_ROUNDS(K8);
      AES_ENC_4_ROUNDS(K9);
      AES_ENC_4_ROUNDS(K10);
      AES_ENC_4_LAST_ROUNDS(K11, K12);

      vst1q_u8(out, B0);
      vst1q_u8(out+16, B1);
      vst1q_u8(out+32, B2);
      vst1q_u8(out+48, B3);

      in += 16*4;
      out += 16*4;
      blocks -= 4;
      }

   for(size_t i = 0; i != blocks; ++i)
      {
      uint8x16_t B = vld1q_u8(in+16*i);
      B = vaesmcq_u8(vaeseq_u8(B, K0));
      B = vaesmcq_u8(vaeseq_u8(B, K1));
      B = vaesmcq_u8(vaeseq_u8(B, K2));
      B = vaesmcq_u8(vaeseq_u8(B, K3));
      B = vaesmcq_u8(vaeseq_u8(B, K4));
      B = vaesmcq_u8(vaeseq_u8(B, K5));
      B = vaesmcq_u8(vaeseq_u8(B, K6));
      B = vaesmcq_u8(vaeseq_u8(B, K7));
      B = vaesmcq_u8(vaeseq_u8(B, K8));
      B = vaesmcq_u8(vaeseq_u8(B, K9));
      B = vaesmcq_u8(vaeseq_u8(B, K10));
      B = veorq_u8(vaeseq_u8(B, K11), K12);
      vst1q_u8(out+16*i, B);
      }
   }

/*
* AES-192 Decryption
*/
BOTAN_FUNC_ISA("+crypto")
void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0*16);
   const uint8x16_t K1 = vld1q_u8(skey + 1*16);
   const uint8x16_t K2 = vld1q_u8(skey + 2*16);
   const uint8x16_t K3 = vld1q_u8(skey + 3*16);
   const uint8x16_t K4 = vld1q_u8(skey + 4*16);
   const uint8x16_t K5 = vld1q_u8(skey + 5*16);
   const uint8x16_t K6 = vld1q_u8(skey + 6*16);
   const uint8x16_t K7 = vld1q_u8(skey + 7*16);
   const uint8x16_t K8 = vld1q_u8(skey + 8*16);
   const uint8x16_t K9 = vld1q_u8(skey + 9*16);
   const uint8x16_t K10 = vld1q_u8(skey + 10*16);
   const uint8x16_t K11 = vld1q_u8(skey + 11*16);
   const uint8x16_t K12 = vld1q_u8(skey + 12*16);

   while(blocks >= 4)
      {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in+16);
      uint8x16_t B2 = vld1q_u8(in+32);
      uint8x16_t B3 = vld1q_u8(in+48);

      AES_DEC_4_ROUNDS(K0);
      AES_DEC_4_ROUNDS(K1);
      AES_DEC_4_ROUNDS(K2);
      AES_DEC_4_ROUNDS(K3);
      AES_DEC_4_ROUNDS(K4);
      AES_DEC_4_ROUNDS(K5);
      AES_DEC_4_ROUNDS(K6);
      AES_DEC_4_ROUNDS(K7);
      AES_DEC_4_ROUNDS(K8);
      AES_DEC_4_ROUNDS(K9);
      AES_DEC_4_ROUNDS(K10);
      AES_DEC_4_LAST_ROUNDS(K11, K12);

      vst1q_u8(out, B0);
      vst1q_u8(out+16, B1);
      vst1q_u8(out+32, B2);
      vst1q_u8(out+48, B3);

      in += 16*4;
      out += 16*4;
      blocks -= 4;
      }

   for(size_t i = 0; i != blocks; ++i)
      {
      uint8x16_t B = vld1q_u8(in+16*i);
      B = vaesimcq_u8(vaesdq_u8(B, K0));
      B = vaesimcq_u8(vaesdq_u8(B, K1));
      B = vaesimcq_u8(vaesdq_u8(B, K2));
      B = vaesimcq_u8(vaesdq_u8(B, K3));
      B = vaesimcq_u8(vaesdq_u8(B, K4));
      B = vaesimcq_u8(vaesdq_u8(B, K5));
      B = vaesimcq_u8(vaesdq_u8(B, K6));
      B = vaesimcq_u8(vaesdq_u8(B, K7));
      B = vaesimcq_u8(vaesdq_u8(B, K8));
      B = vaesimcq_u8(vaesdq_u8(B, K9));
      B = vaesimcq_u8(vaesdq_u8(B, K10));
      B = veorq_u8(vaesdq_u8(B, K11), K12);
      vst1q_u8(out+16*i, B);
      }
   }

/*
* AES-256 Encryption
*/
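// As above, but with 14 rounds and 15 subkeys (K0..K14).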
BOTAN_FUNC_ISA("+crypto")
void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0*16);
   const uint8x16_t K1 = vld1q_u8(skey + 1*16);
   const uint8x16_t K2 = vld1q_u8(skey + 2*16);
   const uint8x16_t K3 = vld1q_u8(skey + 3*16);
   const uint8x16_t K4 = vld1q_u8(skey + 4*16);
   const uint8x16_t K5 = vld1q_u8(skey + 5*16);
   const uint8x16_t K6 = vld1q_u8(skey + 6*16);
   const uint8x16_t K7 = vld1q_u8(skey + 7*16);
   const uint8x16_t K8 = vld1q_u8(skey + 8*16);
   const uint8x16_t K9 = vld1q_u8(skey + 9*16);
   const uint8x16_t K10 = vld1q_u8(skey + 10*16);
   const uint8x16_t K11 = vld1q_u8(skey + 11*16);
   const uint8x16_t K12 = vld1q_u8(skey + 12*16);
   const uint8x16_t K13 = vld1q_u8(skey + 13*16);
   const uint8x16_t K14 = vld1q_u8(skey + 14*16);

   while(blocks >= 4)
      {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in+16);
      uint8x16_t B2 = vld1q_u8(in+32);
      uint8x16_t B3 = vld1q_u8(in+48);

      AES_ENC_4_ROUNDS(K0);
      AES_ENC_4_ROUNDS(K1);
      AES_ENC_4_ROUNDS(K2);
      AES_ENC_4_ROUNDS(K3);
      AES_ENC_4_ROUNDS(K4);
      AES_ENC_4_ROUNDS(K5);
      AES_ENC_4_ROUNDS(K6);
      AES_ENC_4_ROUNDS(K7);
      AES_ENC_4_ROUNDS(K8);
      AES_ENC_4_ROUNDS(K9);
      AES_ENC_4_ROUNDS(K10);
      AES_ENC_4_ROUNDS(K11);
      AES_ENC_4_ROUNDS(K12);
      AES_ENC_4_LAST_ROUNDS(K13, K14);

      vst1q_u8(out, B0);
      vst1q_u8(out+16, B1);
      vst1q_u8(out+32, B2);
      vst1q_u8(out+48, B3);

      in += 16*4;
      out += 16*4;
      blocks -= 4;
      }

   for(size_t i = 0; i != blocks; ++i)
      {
      uint8x16_t B = vld1q_u8(in+16*i);
      B = vaesmcq_u8(vaeseq_u8(B, K0));
      B = vaesmcq_u8(vaeseq_u8(B, K1));
      B = vaesmcq_u8(vaeseq_u8(B, K2));
      B = vaesmcq_u8(vaeseq_u8(B, K3));
      B = vaesmcq_u8(vaeseq_u8(B, K4));
      B = vaesmcq_u8(vaeseq_u8(B, K5));
      B = vaesmcq_u8(vaeseq_u8(B, K6));
      B = vaesmcq_u8(vaeseq_u8(B, K7));
      B = vaesmcq_u8(vaeseq_u8(B, K8));
      B = vaesmcq_u8(vaeseq_u8(B, K9));
      B = vaesmcq_u8(vaeseq_u8(B, K10));
      B = vaesmcq_u8(vaeseq_u8(B, K11));
      B = vaesmcq_u8(vaeseq_u8(B, K12));
      B = veorq_u8(vaeseq_u8(B, K13), K14);
      vst1q_u8(out+16*i, B);
      }
   }

/*
* AES-256 Decryption
*/
BOTAN_FUNC_ISA("+crypto")
void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0*16);
   const uint8x16_t K1 = vld1q_u8(skey + 1*16);
   const uint8x16_t K2 = vld1q_u8(skey + 2*16);
   const uint8x16_t K3 = vld1q_u8(skey + 3*16);
   const uint8x16_t K4 = vld1q_u8(skey + 4*16);
   const uint8x16_t K5 = vld1q_u8(skey + 5*16);
   const uint8x16_t K6 = vld1q_u8(skey + 6*16);
   const uint8x16_t K7 = vld1q_u8(skey + 7*16);
   const uint8x16_t K8 = vld1q_u8(skey + 8*16);
   const uint8x16_t K9 = vld1q_u8(skey + 9*16);
   const uint8x16_t K10 = vld1q_u8(skey + 10*16);
   const uint8x16_t K11 = vld1q_u8(skey + 11*16);
   const uint8x16_t K12 = vld1q_u8(skey + 12*16);
   const uint8x16_t K13 = vld1q_u8(skey + 13*16);
   const uint8x16_t K14 = vld1q_u8(skey + 14*16);

   while(blocks >= 4)
      {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in+16);
      uint8x16_t B2 = vld1q_u8(in+32);
      uint8x16_t B3 = vld1q_u8(in+48);

      AES_DEC_4_ROUNDS(K0);
      AES_DEC_4_ROUNDS(K1);
      AES_DEC_4_ROUNDS(K2);
      AES_DEC_4_ROUNDS(K3);
      AES_DEC_4_ROUNDS(K4);
      AES_DEC_4_ROUNDS(K5);
      AES_DEC_4_ROUNDS(K6);
      AES_DEC_4_ROUNDS(K7);
      AES_DEC_4_ROUNDS(K8);
      AES_DEC_4_ROUNDS(K9);
      AES_DEC_4_ROUNDS(K10);
      AES_DEC_4_ROUNDS(K11);
      AES_DEC_4_ROUNDS(K12);
      AES_DEC_4_LAST_ROUNDS(K13, K14);

      vst1q_u8(out, B0);
      vst1q_u8(out+16, B1);
      vst1q_u8(out+32, B2);
      vst1q_u8(out+48, B3);

      in += 16*4;
      out += 16*4;
      blocks -= 4;
      }

   for(size_t i = 0; i != blocks; ++i)
      {
      uint8x16_t B = vld1q_u8(in+16*i);
      B = vaesimcq_u8(vaesdq_u8(B, K0));
      B = vaesimcq_u8(vaesdq_u8(B, K1));
      B = vaesimcq_u8(vaesdq_u8(B, K2));
      B = vaesimcq_u8(vaesdq_u8(B, K3));
      B = vaesimcq_u8(vaesdq_u8(B, K4));
      B = vaesimcq_u8(vaesdq_u8(B, K5));
      B = vaesimcq_u8(vaesdq_u8(B, K6));
      B = vaesimcq_u8(vaesdq_u8(B, K7));
      B = vaesimcq_u8(vaesdq_u8(B, K8));
      B = vaesimcq_u8(vaesdq_u8(B, K9));
      B = vaesimcq_u8(vaesdq_u8(B, K10));
      B = vaesimcq_u8(vaesdq_u8(B, K11));
      B = vaesimcq_u8(vaesdq_u8(B, K12));
      B = veorq_u8(vaesdq_u8(B, K13), K14);
      vst1q_u8(out+16*i, B);
      }
   }

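// Undefine the helper macros so they cannot leak into other source files,
// e.g. in Botan's amalgamation build where sources are concatenated.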
#undef AES_ENC_4_ROUNDS
#undef AES_ENC_4_LAST_ROUNDS
#undef AES_DEC_4_ROUNDS
#undef AES_DEC_4_LAST_ROUNDS

}