aes_armv8.cpp
/*
* AES using ARMv8
* Contributed by Jeffrey Walton
*
* Further changes
* (C) 2017,2018 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/

#include <botan/internal/aes.h>

#include <botan/internal/isa_extn.h>
#include <botan/internal/loadstor.h>
#include <arm_neon.h>

namespace Botan {

namespace AES_AARCH64 {

namespace {

// One inner encryption round: AESE applies AddRoundKey, SubBytes and
// ShiftRows, then AESMC applies MixColumns
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc(uint8x16_t& B, uint8x16_t K) {
   B = vaesmcq_u8(vaeseq_u8(B, K));
}

// The same round over four independent blocks, which keeps the
// pipelined AES units busy
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc4(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K) {
   B0 = vaesmcq_u8(vaeseq_u8(B0, K));
   B1 = vaesmcq_u8(vaeseq_u8(B1, K));
   B2 = vaesmcq_u8(vaeseq_u8(B2, K));
   B3 = vaesmcq_u8(vaeseq_u8(B3, K));
}

// Final round: MixColumns is omitted, so AESE is followed by a plain
// XOR with the last round key
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc_last(uint8x16_t& B, uint8x16_t K, uint8x16_t K2) {
   B = veorq_u8(vaeseq_u8(B, K), K2);
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void enc4_last(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K, uint8x16_t K2) {
   B0 = veorq_u8(vaeseq_u8(B0, K), K2);
   B1 = veorq_u8(vaeseq_u8(B1, K), K2);
   B2 = veorq_u8(vaeseq_u8(B2, K), K2);
   B3 = veorq_u8(vaeseq_u8(B3, K), K2);
}

// Decryption rounds mirror the above, using AESD (AddRoundKey plus
// inverse ShiftRows/SubBytes) and AESIMC (inverse MixColumns)
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec(uint8x16_t& B, uint8x16_t K) {
   B = vaesimcq_u8(vaesdq_u8(B, K));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec4(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K) {
   B0 = vaesimcq_u8(vaesdq_u8(B0, K));
   B1 = vaesimcq_u8(vaesdq_u8(B1, K));
   B2 = vaesimcq_u8(vaesdq_u8(B2, K));
   B3 = vaesimcq_u8(vaesdq_u8(B3, K));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec_last(uint8x16_t& B, uint8x16_t K, uint8x16_t K2) {
   B = veorq_u8(vaesdq_u8(B, K), K2);
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AES void dec4_last(
   uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K, uint8x16_t K2) {
   B0 = veorq_u8(vaesdq_u8(B0, K), K2);
   B1 = veorq_u8(vaesdq_u8(B1, K), K2);
   B2 = veorq_u8(vaesdq_u8(B2, K), K2);
   B3 = veorq_u8(vaesdq_u8(B3, K), K2);
}

} // namespace

} // namespace AES_AARCH64

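/*
* Background on the intrinsics used above (FIPS-197 terms); the short
* routine in this comment is an illustrative sketch, not part of this file:
*
*   vaeseq_u8(B, K)  : AddRoundKey(B, K), then SubBytes, then ShiftRows
*   vaesmcq_u8(B)    : MixColumns(B)
*   vaesdq_u8 / vaesimcq_u8 : the inverse-cipher counterparts
*
* Because AESE folds AddRoundKey in before SubBytes/ShiftRows, every round
* key except the last is consumed by an AESE call, and the last key is
* applied as a plain XOR (enc_last / enc4_last). AES-128/192/256 run
* 10/12/14 rounds and so use 11/13/15 round keys.
*
*   // Encrypt one block given an 11-entry AES-128 round key schedule:
*   BOTAN_FN_ISA_AES void encrypt_one(uint8x16_t& B, const uint8x16_t K[11]) {
*      for(size_t i = 0; i != 9; ++i)
*         enc(B, K[i]);           // nine inner rounds: AESE + AESMC
*      enc_last(B, K[9], K[10]);  // final round: AESE, then XOR with K[10]
*   }
*/
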
/*
* AES-128 Encryption
*/
BOTAN_FN_ISA_AES void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);

   using namespace AES_AARCH64;

   // Process four blocks in parallel while enough input remains
   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4_last(B0, B1, B2, B3, K9, K10);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   // Any remaining blocks are handled one at a time
   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      enc(B, K0);
      enc(B, K1);
      enc(B, K2);
      enc(B, K3);
      enc(B, K4);
      enc(B, K5);
      enc(B, K6);
      enc(B, K7);
      enc(B, K8);
      enc_last(B, K9, K10);
      vst1q_u8(out + 16 * i, B);
   }
}

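/*
* Usage sketch (illustrative): this routine is not called directly, but is
* reached through Botan's public BlockCipher interface when runtime CPU
* feature detection selects the hardware AES path:
*
*   #include <botan/block_cipher.h>
*
*   void encrypt_two_blocks(const uint8_t key[16], const uint8_t in[32], uint8_t out[32]) {
*      auto aes = Botan::BlockCipher::create_or_throw("AES-128");
*      aes->set_key(key, 16);
*      aes->encrypt_n(in, out, 2);  // two 16-byte blocks
*   }
*/
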
/*
* AES-128 Decryption
*/
BOTAN_FN_ISA_AES void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4_last(B0, B1, B2, B3, K9, K10);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      dec(B, K0);
      dec(B, K1);
      dec(B, K2);
      dec(B, K3);
      dec(B, K4);
      dec(B, K5);
      dec(B, K6);
      dec(B, K7);
      dec(B, K8);
      dec_last(B, K9, K10);
      vst1q_u8(out + 16 * i, B);
   }
}

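/*
* Note on the schedules: m_DK is prepared during set_key (in aes.cpp), not
* here. The construction is the standard "equivalent inverse cipher": the
* encryption round keys are reversed and InvMixColumns is applied to the
* inner ones. A hypothetical sketch for AES-128, assuming both schedules
* are available as arrays of vectors:
*
*   BOTAN_FN_ISA_AES void make_dec_schedule(const uint8x16_t EK[11], uint8x16_t DK[11]) {
*      DK[0] = EK[10];                      // last encryption key comes first
*      for(size_t i = 1; i != 10; ++i)
*         DK[i] = vaesimcq_u8(EK[10 - i]);  // AESIMC = InvMixColumns
*      DK[10] = EK[0];
*   }
*/
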
/*
* AES-192 Encryption
*/
BOTAN_FN_ISA_AES void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4(B0, B1, B2, B3, K9);
      enc4(B0, B1, B2, B3, K10);
      enc4_last(B0, B1, B2, B3, K11, K12);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      enc(B, K0);
      enc(B, K1);
      enc(B, K2);
      enc(B, K3);
      enc(B, K4);
      enc(B, K5);
      enc(B, K6);
      enc(B, K7);
      enc(B, K8);
      enc(B, K9);
      enc(B, K10);
      enc_last(B, K11, K12);
      vst1q_u8(out + 16 * i, B);
   }
}

/*
* AES-192 Decryption
*/
BOTAN_FN_ISA_AES void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4(B0, B1, B2, B3, K9);
      dec4(B0, B1, B2, B3, K10);
      dec4_last(B0, B1, B2, B3, K11, K12);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      dec(B, K0);
      dec(B, K1);
      dec(B, K2);
      dec(B, K3);
      dec(B, K4);
      dec(B, K5);
      dec(B, K6);
      dec(B, K7);
      dec(B, K8);
      dec(B, K9);
      dec(B, K10);
      dec_last(B, K11, K12);
      vst1q_u8(out + 16 * i, B);
   }
}

/*
* AES-256 Encryption
*/
BOTAN_FN_ISA_AES void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);
   const uint8x16_t K13 = vld1q_u8(skey + 13 * 16);
   const uint8x16_t K14 = vld1q_u8(skey + 14 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4(B0, B1, B2, B3, K9);
      enc4(B0, B1, B2, B3, K10);
      enc4(B0, B1, B2, B3, K11);
      enc4(B0, B1, B2, B3, K12);
      enc4_last(B0, B1, B2, B3, K13, K14);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      enc(B, K0);
      enc(B, K1);
      enc(B, K2);
      enc(B, K3);
      enc(B, K4);
      enc(B, K5);
      enc(B, K6);
      enc(B, K7);
      enc(B, K8);
      enc(B, K9);
      enc(B, K10);
      enc(B, K11);
      enc(B, K12);
      enc_last(B, K13, K14);
      vst1q_u8(out + 16 * i, B);
   }
}

/*
* AES-256 Decryption
*/
BOTAN_FN_ISA_AES void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);
   const uint8x16_t K13 = vld1q_u8(skey + 13 * 16);
   const uint8x16_t K14 = vld1q_u8(skey + 14 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4(B0, B1, B2, B3, K9);
      dec4(B0, B1, B2, B3, K10);
      dec4(B0, B1, B2, B3, K11);
      dec4(B0, B1, B2, B3, K12);
      dec4_last(B0, B1, B2, B3, K13, K14);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      dec(B, K0);
      dec(B, K1);
      dec(B, K2);
      dec(B, K3);
      dec(B, K4);
      dec(B, K5);
      dec(B, K6);
      dec(B, K7);
      dec(B, K8);
      dec(B, K9);
      dec(B, K10);
      dec(B, K11);
      dec(B, K12);
      dec_last(B, K13, K14);
      vst1q_u8(out + 16 * i, B);
   }
}
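
/*
* Note: AES_128/192/256::encrypt_n and decrypt_n (in aes.cpp) dispatch to
* the hw_aes_* routines above at runtime, when CPU feature detection
* reports the ARMv8 AES extension; otherwise a constant-time software
* implementation is used instead.
*/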

} // namespace Botan