/*
* AES using ARMv8
* Contributed by Jeffrey Walton
*
* Further changes
* (C) 2017,2018 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/

#include <botan/internal/aes.h>

#include <botan/internal/isa_extn.h>
#include <botan/internal/loadstor.h>
#include <arm_neon.h>

namespace Botan {

namespace AES_AARCH64 {

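// One full (middle) AES round: AESE performs AddRoundKey, then SubBytes and
// ShiftRows; AESMC applies MixColumns. Chaining the two gives one round per call.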
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc(uint8x16_t& B, uint8x16_t K) {
   B = vaesmcq_u8(vaeseq_u8(B, K));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc4(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K) {
   B0 = vaesmcq_u8(vaeseq_u8(B0, K));
   B1 = vaesmcq_u8(vaeseq_u8(B1, K));
   B2 = vaesmcq_u8(vaeseq_u8(B2, K));
   B3 = vaesmcq_u8(vaeseq_u8(B3, K));
}

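// The final AES round omits MixColumns, so it is AESE with the penultimate
// round key followed by a plain XOR with the last round key.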
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc_last(uint8x16_t& B, uint8x16_t K, uint8x16_t K2) {
   B = veorq_u8(vaeseq_u8(B, K), K2);
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc4_last(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K, uint8x16_t K2) {
   B0 = veorq_u8(vaeseq_u8(B0, K), K2);
   B1 = veorq_u8(vaeseq_u8(B1, K), K2);
   B2 = veorq_u8(vaeseq_u8(B2, K), K2);
   B3 = veorq_u8(vaeseq_u8(B3, K), K2);
}

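// Decryption mirrors this using AESD (AddRoundKey, then InvSubBytes and
// InvShiftRows) and AESIMC (InvMixColumns). This "equivalent inverse cipher"
// form expects decryption round keys that have already been passed through
// InvMixColumns, which the m_DK schedule loaded below is expected to provide.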
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec(uint8x16_t& B, uint8x16_t K) {
   B = vaesimcq_u8(vaesdq_u8(B, K));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec4(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K) {
   B0 = vaesimcq_u8(vaesdq_u8(B0, K));
   B1 = vaesimcq_u8(vaesdq_u8(B1, K));
   B2 = vaesimcq_u8(vaesdq_u8(B2, K));
   B3 = vaesimcq_u8(vaesdq_u8(B3, K));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec_last(uint8x16_t& B, uint8x16_t K, uint8x16_t K2) {
   B = veorq_u8(vaesdq_u8(B, K), K2);
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec4_last(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K, uint8x16_t K2) {
   B0 = veorq_u8(vaesdq_u8(B0, K), K2);
   B1 = veorq_u8(vaesdq_u8(B1, K), K2);
   B2 = veorq_u8(vaesdq_u8(B2, K), K2);
   B3 = veorq_u8(vaesdq_u8(B3, K), K2);
}

} // namespace AES_AARCH64

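// AES-128, AES-192 and AES-256 use 10, 12 and 14 rounds respectively, so the
// expanded key schedule holds 11, 13 or 15 round keys of 16 bytes each
// (K0..K10, K0..K12 or K0..K14 in the routines below).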
/*
* AES-128 Encryption
*/
BOTAN_FN_ISA_AES void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);

   using namespace AES_AARCH64;

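   // Process four blocks at a time (each routine below uses the same
   // structure): the four AESE/AESMC dependency chains are independent,
   // which lets the core's AES pipeline overlap work on all of them.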
   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4_last(B0, B1, B2, B3, K9, K10);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

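   // Handle the remaining (at most three) blocks one at a time.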
   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      enc(B, K0);
      enc(B, K1);
      enc(B, K2);
      enc(B, K3);
      enc(B, K4);
      enc(B, K5);
      enc(B, K6);
      enc(B, K7);
      enc(B, K8);
      enc_last(B, K9, K10);
      vst1q_u8(out + 16 * i, B);
   }
}

/*
* AES-128 Decryption
*/
BOTAN_FN_ISA_AES void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4_last(B0, B1, B2, B3, K9, K10);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      dec(B, K0);
      dec(B, K1);
      dec(B, K2);
      dec(B, K3);
      dec(B, K4);
      dec(B, K5);
      dec(B, K6);
      dec(B, K7);
      dec(B, K8);
      dec_last(B, K9, K10);
      vst1q_u8(out + 16 * i, B);
   }
}

/*
* AES-192 Encryption
*/
BOTAN_FN_ISA_AES void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4(B0, B1, B2, B3, K9);
      enc4(B0, B1, B2, B3, K10);
      enc4_last(B0, B1, B2, B3, K11, K12);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      enc(B, K0);
      enc(B, K1);
      enc(B, K2);
      enc(B, K3);
      enc(B, K4);
      enc(B, K5);
      enc(B, K6);
      enc(B, K7);
      enc(B, K8);
      enc(B, K9);
      enc(B, K10);
      enc_last(B, K11, K12);
      vst1q_u8(out + 16 * i, B);
   }
}

/*
* AES-192 Decryption
*/
BOTAN_FN_ISA_AES void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4(B0, B1, B2, B3, K9);
      dec4(B0, B1, B2, B3, K10);
      dec4_last(B0, B1, B2, B3, K11, K12);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      dec(B, K0);
      dec(B, K1);
      dec(B, K2);
      dec(B, K3);
      dec(B, K4);
      dec(B, K5);
      dec(B, K6);
      dec(B, K7);
      dec(B, K8);
      dec(B, K9);
      dec(B, K10);
      dec_last(B, K11, K12);
      vst1q_u8(out + 16 * i, B);
   }
}

/*
* AES-256 Encryption
*/
BOTAN_FN_ISA_AES void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);
   const uint8x16_t K13 = vld1q_u8(skey + 13 * 16);
   const uint8x16_t K14 = vld1q_u8(skey + 14 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4(B0, B1, B2, B3, K9);
      enc4(B0, B1, B2, B3, K10);
      enc4(B0, B1, B2, B3, K11);
      enc4(B0, B1, B2, B3, K12);
      enc4_last(B0, B1, B2, B3, K13, K14);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      enc(B, K0);
      enc(B, K1);
      enc(B, K2);
      enc(B, K3);
      enc(B, K4);
      enc(B, K5);
      enc(B, K6);
      enc(B, K7);
      enc(B, K8);
      enc(B, K9);
      enc(B, K10);
      enc(B, K11);
      enc(B, K12);
      enc_last(B, K13, K14);
      vst1q_u8(out + 16 * i, B);
   }
}

/*
* AES-256 Decryption
*/
BOTAN_FN_ISA_AES void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);
   const uint8x16_t K13 = vld1q_u8(skey + 13 * 16);
   const uint8x16_t K14 = vld1q_u8(skey + 14 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4(B0, B1, B2, B3, K9);
      dec4(B0, B1, B2, B3, K10);
      dec4(B0, B1, B2, B3, K11);
      dec4(B0, B1, B2, B3, K12);
      dec4_last(B0, B1, B2, B3, K13, K14);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      dec(B, K0);
      dec(B, K1);
      dec(B, K2);
      dec(B, K3);
      dec(B, K4);
      dec(B, K5);
      dec(B, K6);
      dec(B, K7);
      dec(B, K8);
      dec(B, K9);
      dec(B, K10);
      dec(B, K11);
      dec(B, K12);
      dec_last(B, K13, K14);
      vst1q_u8(out + 16 * i, B);
   }
}

} // namespace Botan
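
/*
* Usage note: these hw_aes_* routines are internal; Botan selects them at
* runtime via CPU feature detection. A caller reaches them through the public
* BlockCipher interface. A minimal sketch (key_bytes is assumed to be the
* caller's 16-byte key buffer; error handling omitted):
*
*    auto aes = Botan::BlockCipher::create_or_throw("AES-128");
*    aes->set_key(key_bytes, 16);
*    aes->encrypt_n(in, out, blocks);  // dispatches here on ARMv8 with AES
*/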