Botan 3.4.0
Crypto and TLS for C++
aes_armv8.cpp
/*
* AES using ARMv8
* Contributed by Jeffrey Walton
*
* Further changes
* (C) 2017,2018 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/

#include <botan/internal/aes.h>

#include <botan/internal/loadstor.h>
#include <arm_neon.h>

namespace Botan {

namespace AES_AARCH64 {

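// On AArch64, vaeseq_u8(B, K) performs AddRoundKey (B ^ K) followed by
// SubBytes and ShiftRows, and vaesmcq_u8 performs MixColumns, so each enc
// helper below computes one full AES encryption round.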
BOTAN_FUNC_ISA_INLINE("+crypto+aes") void enc(uint8x16_t& B, uint8x16_t K) {
   B = vaesmcq_u8(vaeseq_u8(B, K));
}

BOTAN_FUNC_ISA_INLINE("+crypto+aes")
void enc4(uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K) {
   B0 = vaesmcq_u8(vaeseq_u8(B0, K));
   B1 = vaesmcq_u8(vaeseq_u8(B1, K));
   B2 = vaesmcq_u8(vaeseq_u8(B2, K));
   B3 = vaesmcq_u8(vaeseq_u8(B3, K));
}

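// The final AES round omits MixColumns, so the last vaeseq_u8 is instead
// followed by a plain XOR with the final round key K2.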
BOTAN_FUNC_ISA_INLINE("+crypto+aes") void enc_last(uint8x16_t& B, uint8x16_t K, uint8x16_t K2) {
   B = veorq_u8(vaeseq_u8(B, K), K2);
}

BOTAN_FUNC_ISA_INLINE("+crypto+aes")
void enc4_last(uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K, uint8x16_t K2) {
   B0 = veorq_u8(vaeseq_u8(B0, K), K2);
   B1 = veorq_u8(vaeseq_u8(B1, K), K2);
   B2 = veorq_u8(vaeseq_u8(B2, K), K2);
   B3 = veorq_u8(vaeseq_u8(B3, K), K2);
}

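// Decryption mirrors this using the inverse instructions: vaesdq_u8 applies
// AddRoundKey then InvSubBytes and InvShiftRows, and vaesimcq_u8 applies
// InvMixColumns. This "equivalent inverse cipher" form assumes the decryption
// round keys in m_DK already have InvMixColumns applied by the key schedule.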
BOTAN_FUNC_ISA_INLINE("+crypto+aes") void dec(uint8x16_t& B, uint8x16_t K) {
   B = vaesimcq_u8(vaesdq_u8(B, K));
}

BOTAN_FUNC_ISA_INLINE("+crypto+aes")
void dec4(uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K) {
   B0 = vaesimcq_u8(vaesdq_u8(B0, K));
   B1 = vaesimcq_u8(vaesdq_u8(B1, K));
   B2 = vaesimcq_u8(vaesdq_u8(B2, K));
   B3 = vaesimcq_u8(vaesdq_u8(B3, K));
}

BOTAN_FUNC_ISA_INLINE("+crypto+aes") void dec_last(uint8x16_t& B, uint8x16_t K, uint8x16_t K2) {
   B = veorq_u8(vaesdq_u8(B, K), K2);
}

BOTAN_FUNC_ISA_INLINE("+crypto+aes")
void dec4_last(uint8x16_t& B0, uint8x16_t& B1, uint8x16_t& B2, uint8x16_t& B3, uint8x16_t K, uint8x16_t K2) {
   B0 = veorq_u8(vaesdq_u8(B0, K), K2);
   B1 = veorq_u8(vaesdq_u8(B1, K), K2);
   B2 = veorq_u8(vaesdq_u8(B2, K), K2);
   B3 = veorq_u8(vaesdq_u8(B3, K), K2);
}

}  // namespace AES_AARCH64

/*
* AES-128 Encryption
*/
BOTAN_FUNC_ISA("+crypto+aes") void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
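   // AES-128: 10 rounds, 11 round keys. vaeseq_u8 folds each AddRoundKey into
   // the next round's SubBytes/ShiftRows, so K10 is applied with a final XOR.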

   using namespace AES_AARCH64;

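   // Process four blocks at a time: the AESE/AESMC pairs for different blocks
   // are independent, so the CPU can pipeline them and hide the latency of the
   // AES instructions.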
   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4_last(B0, B1, B2, B3, K9, K10);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      enc(B, K0);
      enc(B, K1);
      enc(B, K2);
      enc(B, K3);
      enc(B, K4);
      enc(B, K5);
      enc(B, K6);
      enc(B, K7);
      enc(B, K8);
      enc_last(B, K9, K10);
      vst1q_u8(out + 16 * i, B);
   }
}

/*
* AES-128 Decryption
*/
BOTAN_FUNC_ISA("+crypto+aes") void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4_last(B0, B1, B2, B3, K9, K10);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      dec(B, K0);
      dec(B, K1);
      dec(B, K2);
      dec(B, K3);
      dec(B, K4);
      dec(B, K5);
      dec(B, K6);
      dec(B, K7);
      dec(B, K8);
      dec_last(B, K9, K10);
      vst1q_u8(out + 16 * i, B);
   }
}

/*
* AES-192 Encryption
*/
BOTAN_FUNC_ISA("+crypto+aes") void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);
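   // AES-192: 12 rounds, 13 round keys; K12 is applied as the final XOR.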

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4(B0, B1, B2, B3, K9);
      enc4(B0, B1, B2, B3, K10);
      enc4_last(B0, B1, B2, B3, K11, K12);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      enc(B, K0);
      enc(B, K1);
      enc(B, K2);
      enc(B, K3);
      enc(B, K4);
      enc(B, K5);
      enc(B, K6);
      enc(B, K7);
      enc(B, K8);
      enc(B, K9);
      enc(B, K10);
      enc_last(B, K11, K12);
      vst1q_u8(out + 16 * i, B);
   }
}

/*
* AES-192 Decryption
*/
BOTAN_FUNC_ISA("+crypto+aes") void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4(B0, B1, B2, B3, K9);
      dec4(B0, B1, B2, B3, K10);
      dec4_last(B0, B1, B2, B3, K11, K12);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      dec(B, K0);
      dec(B, K1);
      dec(B, K2);
      dec(B, K3);
      dec(B, K4);
      dec(B, K5);
      dec(B, K6);
      dec(B, K7);
      dec(B, K8);
      dec(B, K9);
      dec(B, K10);
      dec_last(B, K11, K12);
      vst1q_u8(out + 16 * i, B);
   }
}

/*
* AES-256 Encryption
*/
BOTAN_FUNC_ISA("+crypto+aes") void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);
   const uint8x16_t K13 = vld1q_u8(skey + 13 * 16);
   const uint8x16_t K14 = vld1q_u8(skey + 14 * 16);
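   // AES-256: 14 rounds, 15 round keys; K14 is applied as the final XOR.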

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      enc4(B0, B1, B2, B3, K0);
      enc4(B0, B1, B2, B3, K1);
      enc4(B0, B1, B2, B3, K2);
      enc4(B0, B1, B2, B3, K3);
      enc4(B0, B1, B2, B3, K4);
      enc4(B0, B1, B2, B3, K5);
      enc4(B0, B1, B2, B3, K6);
      enc4(B0, B1, B2, B3, K7);
      enc4(B0, B1, B2, B3, K8);
      enc4(B0, B1, B2, B3, K9);
      enc4(B0, B1, B2, B3, K10);
      enc4(B0, B1, B2, B3, K11);
      enc4(B0, B1, B2, B3, K12);
      enc4_last(B0, B1, B2, B3, K13, K14);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      enc(B, K0);
      enc(B, K1);
      enc(B, K2);
      enc(B, K3);
      enc(B, K4);
      enc(B, K5);
      enc(B, K6);
      enc(B, K7);
      enc(B, K8);
      enc(B, K9);
      enc(B, K10);
      enc(B, K11);
      enc(B, K12);
      enc_last(B, K13, K14);
      vst1q_u8(out + 16 * i, B);
   }
}

/*
* AES-256 Decryption
*/
BOTAN_FUNC_ISA("+crypto+aes") void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_DK.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0 * 16);
   const uint8x16_t K1 = vld1q_u8(skey + 1 * 16);
   const uint8x16_t K2 = vld1q_u8(skey + 2 * 16);
   const uint8x16_t K3 = vld1q_u8(skey + 3 * 16);
   const uint8x16_t K4 = vld1q_u8(skey + 4 * 16);
   const uint8x16_t K5 = vld1q_u8(skey + 5 * 16);
   const uint8x16_t K6 = vld1q_u8(skey + 6 * 16);
   const uint8x16_t K7 = vld1q_u8(skey + 7 * 16);
   const uint8x16_t K8 = vld1q_u8(skey + 8 * 16);
   const uint8x16_t K9 = vld1q_u8(skey + 9 * 16);
   const uint8x16_t K10 = vld1q_u8(skey + 10 * 16);
   const uint8x16_t K11 = vld1q_u8(skey + 11 * 16);
   const uint8x16_t K12 = vld1q_u8(skey + 12 * 16);
   const uint8x16_t K13 = vld1q_u8(skey + 13 * 16);
   const uint8x16_t K14 = vld1q_u8(skey + 14 * 16);

   using namespace AES_AARCH64;

   while(blocks >= 4) {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      dec4(B0, B1, B2, B3, K0);
      dec4(B0, B1, B2, B3, K1);
      dec4(B0, B1, B2, B3, K2);
      dec4(B0, B1, B2, B3, K3);
      dec4(B0, B1, B2, B3, K4);
      dec4(B0, B1, B2, B3, K5);
      dec4(B0, B1, B2, B3, K6);
      dec4(B0, B1, B2, B3, K7);
      dec4(B0, B1, B2, B3, K8);
      dec4(B0, B1, B2, B3, K9);
      dec4(B0, B1, B2, B3, K10);
      dec4(B0, B1, B2, B3, K11);
      dec4(B0, B1, B2, B3, K12);
      dec4_last(B0, B1, B2, B3, K13, K14);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 16 * 4;
      out += 16 * 4;
      blocks -= 4;
   }

   for(size_t i = 0; i != blocks; ++i) {
      uint8x16_t B = vld1q_u8(in + 16 * i);
      dec(B, K0);
      dec(B, K1);
      dec(B, K2);
      dec(B, K3);
      dec(B, K4);
      dec(B, K5);
      dec(B, K6);
      dec(B, K7);
      dec(B, K8);
      dec(B, K9);
      dec(B, K10);
      dec(B, K11);
      dec(B, K12);
      dec_last(B, K13, K14);
      vst1q_u8(out + 16 * i, B);
   }
}

}  // namespace Botan
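
Usage note (illustrative, not part of this file): the hw_aes_* routines above are selected at runtime when CPU feature detection reports ARMv8 AES support; applications reach them through Botan's generic BlockCipher interface. A minimal sketch, assuming a caller-chosen example key:

   #include <botan/block_cipher.h>

   #include <cstdint>
   #include <vector>

   int main() {
      const std::vector<uint8_t> key(16, 0xAB);  // example 128-bit key material
      uint8_t block[16] = {0};                   // one 16-byte AES block

      auto aes = Botan::BlockCipher::create_or_throw("AES-128");
      aes->set_key(key);
      aes->encrypt_n(block, block, 1);  // in-place, single block
      aes->decrypt_n(block, block, 1);  // restores the original block
      return 0;
   }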