Botan 3.5.0
Crypto and TLS for C++
aes_ni.cpp
Go to the documentation of this file.
1/*
2* AES using AES-NI instructions
3* (C) 2009,2012 Jack Lloyd
4*
5* Botan is released under the Simplified BSD License (see license.txt)
6*/
7
8#include <botan/internal/aes.h>
9
10#include <botan/internal/loadstor.h>
11#include <botan/internal/simd_32.h>
12#include <wmmintrin.h>
13
14namespace Botan {
15
16namespace {
17
18template <uint8_t RC>
19BOTAN_FUNC_ISA("ssse3,aes")
20inline __m128i aes_128_key_expansion(__m128i key, __m128i key_getting_rcon) {
21 __m128i key_with_rcon = _mm_aeskeygenassist_si128(key_getting_rcon, RC);
22 key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3, 3, 3, 3));
23 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
24 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
25 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
26 return _mm_xor_si128(key, key_with_rcon);
27}
28
29BOTAN_FUNC_ISA("ssse3")
30void aes_192_key_expansion(
31 __m128i* K1, __m128i* K2, __m128i key2_with_rcon, secure_vector<uint32_t>& out, size_t offset) {
32 __m128i key1 = *K1;
33 __m128i key2 = *K2;
34
35 key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1, 1, 1, 1));
36 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
37 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
38 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
39 key1 = _mm_xor_si128(key1, key2_with_rcon);
40
41 *K1 = key1;
42 _mm_storeu_si128(reinterpret_cast<__m128i*>(&out[offset]), key1);
43
44 if(offset == 48) { // last key
45 return;
46 }
47
48 key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
49 key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3, 3, 3, 3)));
50
51 *K2 = key2;
52 out[offset + 4] = _mm_cvtsi128_si32(key2);
53 out[offset + 5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
54}
55
56/*
57* The second half of the AES-256 key expansion (other half same as AES-128)
58*/
59BOTAN_FUNC_ISA("ssse3,aes") __m128i aes_256_key_expansion(__m128i key, __m128i key2) {
60 __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
61 key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2, 2, 2, 2));
62
63 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
64 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
65 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
66 return _mm_xor_si128(key, key_with_rcon);
67}
68
69BOTAN_FORCE_INLINE void keyxor(SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
70 B0 ^= K;
71 B1 ^= K;
72 B2 ^= K;
73 B3 ^= K;
74}
75
76BOTAN_FUNC_ISA_INLINE("aes") void aesenc(SIMD_4x32 K, SIMD_4x32& B) {
77 B = SIMD_4x32(_mm_aesenc_si128(B.raw(), K.raw()));
78}
79
80BOTAN_FUNC_ISA_INLINE("aes") void aesenc(SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
81 B0 = SIMD_4x32(_mm_aesenc_si128(B0.raw(), K.raw()));
82 B1 = SIMD_4x32(_mm_aesenc_si128(B1.raw(), K.raw()));
83 B2 = SIMD_4x32(_mm_aesenc_si128(B2.raw(), K.raw()));
84 B3 = SIMD_4x32(_mm_aesenc_si128(B3.raw(), K.raw()));
85}
86
87BOTAN_FUNC_ISA_INLINE("aes") void aesenclast(SIMD_4x32 K, SIMD_4x32& B) {
88 B = SIMD_4x32(_mm_aesenclast_si128(B.raw(), K.raw()));
89}
90
91BOTAN_FUNC_ISA_INLINE("aes") void aesenclast(SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
92 B0 = SIMD_4x32(_mm_aesenclast_si128(B0.raw(), K.raw()));
93 B1 = SIMD_4x32(_mm_aesenclast_si128(B1.raw(), K.raw()));
94 B2 = SIMD_4x32(_mm_aesenclast_si128(B2.raw(), K.raw()));
95 B3 = SIMD_4x32(_mm_aesenclast_si128(B3.raw(), K.raw()));
96}
97
98BOTAN_FUNC_ISA_INLINE("aes") void aesdec(SIMD_4x32 K, SIMD_4x32& B) {
99 B = SIMD_4x32(_mm_aesdec_si128(B.raw(), K.raw()));
100}
101
102BOTAN_FUNC_ISA_INLINE("aes") void aesdec(SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
103 B0 = SIMD_4x32(_mm_aesdec_si128(B0.raw(), K.raw()));
104 B1 = SIMD_4x32(_mm_aesdec_si128(B1.raw(), K.raw()));
105 B2 = SIMD_4x32(_mm_aesdec_si128(B2.raw(), K.raw()));
106 B3 = SIMD_4x32(_mm_aesdec_si128(B3.raw(), K.raw()));
107}
108
109BOTAN_FUNC_ISA_INLINE("aes") void aesdeclast(SIMD_4x32 K, SIMD_4x32& B) {
110 B = SIMD_4x32(_mm_aesdeclast_si128(B.raw(), K.raw()));
111}
112
113BOTAN_FUNC_ISA_INLINE("aes") void aesdeclast(SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
114 B0 = SIMD_4x32(_mm_aesdeclast_si128(B0.raw(), K.raw()));
115 B1 = SIMD_4x32(_mm_aesdeclast_si128(B1.raw(), K.raw()));
116 B2 = SIMD_4x32(_mm_aesdeclast_si128(B2.raw(), K.raw()));
117 B3 = SIMD_4x32(_mm_aesdeclast_si128(B3.raw(), K.raw()));
118}
119
120} // namespace
121
122/*
123* AES-128 Encryption
124*/
125BOTAN_FUNC_ISA("ssse3,aes") void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
126 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4 * 0]);
127 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4 * 1]);
128 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4 * 2]);
129 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4 * 3]);
130 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4 * 4]);
131 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4 * 5]);
132 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4 * 6]);
133 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4 * 7]);
134 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4 * 8]);
135 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4 * 9]);
136 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4 * 10]);
137
138 while(blocks >= 4) {
139 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
140 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
141 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
142 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
143
144 keyxor(K0, B0, B1, B2, B3);
145 aesenc(K1, B0, B1, B2, B3);
146 aesenc(K2, B0, B1, B2, B3);
147 aesenc(K3, B0, B1, B2, B3);
148 aesenc(K4, B0, B1, B2, B3);
149 aesenc(K5, B0, B1, B2, B3);
150 aesenc(K6, B0, B1, B2, B3);
151 aesenc(K7, B0, B1, B2, B3);
152 aesenc(K8, B0, B1, B2, B3);
153 aesenc(K9, B0, B1, B2, B3);
154 aesenclast(K10, B0, B1, B2, B3);
155
156 B0.store_le(out + 16 * 0);
157 B1.store_le(out + 16 * 1);
158 B2.store_le(out + 16 * 2);
159 B3.store_le(out + 16 * 3);
160
161 blocks -= 4;
162 in += 4 * 16;
163 out += 4 * 16;
164 }
165
166 for(size_t i = 0; i != blocks; ++i) {
167 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
168
169 B0 ^= K0;
170 aesenc(K1, B0);
171 aesenc(K2, B0);
172 aesenc(K3, B0);
173 aesenc(K4, B0);
174 aesenc(K5, B0);
175 aesenc(K6, B0);
176 aesenc(K7, B0);
177 aesenc(K8, B0);
178 aesenc(K9, B0);
179 aesenclast(K10, B0);
180
181 B0.store_le(out + 16 * i);
182 }
183}
184
185/*
186* AES-128 Decryption
187*/
188BOTAN_FUNC_ISA("ssse3,aes") void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
189 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4 * 0]);
190 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4 * 1]);
191 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4 * 2]);
192 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4 * 3]);
193 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4 * 4]);
194 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4 * 5]);
195 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4 * 6]);
196 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4 * 7]);
197 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4 * 8]);
198 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4 * 9]);
199 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4 * 10]);
200
201 while(blocks >= 4) {
202 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
203 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
204 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
205 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
206
207 keyxor(K0, B0, B1, B2, B3);
208 aesdec(K1, B0, B1, B2, B3);
209 aesdec(K2, B0, B1, B2, B3);
210 aesdec(K3, B0, B1, B2, B3);
211 aesdec(K4, B0, B1, B2, B3);
212 aesdec(K5, B0, B1, B2, B3);
213 aesdec(K6, B0, B1, B2, B3);
214 aesdec(K7, B0, B1, B2, B3);
215 aesdec(K8, B0, B1, B2, B3);
216 aesdec(K9, B0, B1, B2, B3);
217 aesdeclast(K10, B0, B1, B2, B3);
218
219 B0.store_le(out + 16 * 0);
220 B1.store_le(out + 16 * 1);
221 B2.store_le(out + 16 * 2);
222 B3.store_le(out + 16 * 3);
223
224 blocks -= 4;
225 in += 4 * 16;
226 out += 4 * 16;
227 }
228
229 for(size_t i = 0; i != blocks; ++i) {
230 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
231
232 B0 ^= K0;
233 aesdec(K1, B0);
234 aesdec(K2, B0);
235 aesdec(K3, B0);
236 aesdec(K4, B0);
237 aesdec(K5, B0);
238 aesdec(K6, B0);
239 aesdec(K7, B0);
240 aesdec(K8, B0);
241 aesdec(K9, B0);
242 aesdeclast(K10, B0);
243
244 B0.store_le(out + 16 * i);
245 }
246}
247
248/*
249* AES-128 Key Schedule
250*/
251BOTAN_FUNC_ISA("ssse3,aes") void AES_128::aesni_key_schedule(const uint8_t key[], size_t /*length*/) {
252 m_EK.resize(44);
253 m_DK.resize(44);
254
255 const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
256 const __m128i K1 = aes_128_key_expansion<0x01>(K0, K0);
257 const __m128i K2 = aes_128_key_expansion<0x02>(K1, K1);
258 const __m128i K3 = aes_128_key_expansion<0x04>(K2, K2);
259 const __m128i K4 = aes_128_key_expansion<0x08>(K3, K3);
260 const __m128i K5 = aes_128_key_expansion<0x10>(K4, K4);
261 const __m128i K6 = aes_128_key_expansion<0x20>(K5, K5);
262 const __m128i K7 = aes_128_key_expansion<0x40>(K6, K6);
263 const __m128i K8 = aes_128_key_expansion<0x80>(K7, K7);
264 const __m128i K9 = aes_128_key_expansion<0x1B>(K8, K8);
265 const __m128i K10 = aes_128_key_expansion<0x36>(K9, K9);
266
267 __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
268 _mm_storeu_si128(EK_mm, K0);
269 _mm_storeu_si128(EK_mm + 1, K1);
270 _mm_storeu_si128(EK_mm + 2, K2);
271 _mm_storeu_si128(EK_mm + 3, K3);
272 _mm_storeu_si128(EK_mm + 4, K4);
273 _mm_storeu_si128(EK_mm + 5, K5);
274 _mm_storeu_si128(EK_mm + 6, K6);
275 _mm_storeu_si128(EK_mm + 7, K7);
276 _mm_storeu_si128(EK_mm + 8, K8);
277 _mm_storeu_si128(EK_mm + 9, K9);
278 _mm_storeu_si128(EK_mm + 10, K10);
279
280 // Now generate decryption keys
281
282 __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
283 _mm_storeu_si128(DK_mm, K10);
284 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9));
285 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8));
286 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7));
287 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6));
288 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5));
289 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4));
290 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3));
291 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2));
292 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1));
293 _mm_storeu_si128(DK_mm + 10, K0);
294}
295
296/*
297* AES-192 Encryption
298*/
299BOTAN_FUNC_ISA("ssse3,aes") void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
300 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4 * 0]);
301 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4 * 1]);
302 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4 * 2]);
303 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4 * 3]);
304 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4 * 4]);
305 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4 * 5]);
306 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4 * 6]);
307 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4 * 7]);
308 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4 * 8]);
309 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4 * 9]);
310 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4 * 10]);
311 const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_EK[4 * 11]);
312 const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_EK[4 * 12]);
313
314 while(blocks >= 4) {
315 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
316 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
317 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
318 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
319
320 keyxor(K0, B0, B1, B2, B3);
321 aesenc(K1, B0, B1, B2, B3);
322 aesenc(K2, B0, B1, B2, B3);
323 aesenc(K3, B0, B1, B2, B3);
324 aesenc(K4, B0, B1, B2, B3);
325 aesenc(K5, B0, B1, B2, B3);
326 aesenc(K6, B0, B1, B2, B3);
327 aesenc(K7, B0, B1, B2, B3);
328 aesenc(K8, B0, B1, B2, B3);
329 aesenc(K9, B0, B1, B2, B3);
330 aesenc(K10, B0, B1, B2, B3);
331 aesenc(K11, B0, B1, B2, B3);
332 aesenclast(K12, B0, B1, B2, B3);
333
334 B0.store_le(out + 16 * 0);
335 B1.store_le(out + 16 * 1);
336 B2.store_le(out + 16 * 2);
337 B3.store_le(out + 16 * 3);
338
339 blocks -= 4;
340 in += 4 * 16;
341 out += 4 * 16;
342 }
343
344 for(size_t i = 0; i != blocks; ++i) {
345 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
346
347 B0 ^= K0;
348
349 aesenc(K1, B0);
350 aesenc(K2, B0);
351 aesenc(K3, B0);
352 aesenc(K4, B0);
353 aesenc(K5, B0);
354 aesenc(K6, B0);
355 aesenc(K7, B0);
356 aesenc(K8, B0);
357 aesenc(K9, B0);
358 aesenc(K10, B0);
359 aesenc(K11, B0);
360 aesenclast(K12, B0);
361
362 B0.store_le(out + 16 * i);
363 }
364}
365
366/*
367* AES-192 Decryption
368*/
369BOTAN_FUNC_ISA("ssse3,aes") void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
370 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4 * 0]);
371 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4 * 1]);
372 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4 * 2]);
373 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4 * 3]);
374 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4 * 4]);
375 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4 * 5]);
376 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4 * 6]);
377 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4 * 7]);
378 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4 * 8]);
379 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4 * 9]);
380 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4 * 10]);
381 const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_DK[4 * 11]);
382 const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_DK[4 * 12]);
383
384 while(blocks >= 4) {
385 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
386 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
387 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
388 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
389
390 keyxor(K0, B0, B1, B2, B3);
391 aesdec(K1, B0, B1, B2, B3);
392 aesdec(K2, B0, B1, B2, B3);
393 aesdec(K3, B0, B1, B2, B3);
394 aesdec(K4, B0, B1, B2, B3);
395 aesdec(K5, B0, B1, B2, B3);
396 aesdec(K6, B0, B1, B2, B3);
397 aesdec(K7, B0, B1, B2, B3);
398 aesdec(K8, B0, B1, B2, B3);
399 aesdec(K9, B0, B1, B2, B3);
400 aesdec(K10, B0, B1, B2, B3);
401 aesdec(K11, B0, B1, B2, B3);
402 aesdeclast(K12, B0, B1, B2, B3);
403
404 B0.store_le(out + 16 * 0);
405 B1.store_le(out + 16 * 1);
406 B2.store_le(out + 16 * 2);
407 B3.store_le(out + 16 * 3);
408
409 blocks -= 4;
410 in += 4 * 16;
411 out += 4 * 16;
412 }
413
414 for(size_t i = 0; i != blocks; ++i) {
415 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
416
417 B0 ^= K0;
418
419 aesdec(K1, B0);
420 aesdec(K2, B0);
421 aesdec(K3, B0);
422 aesdec(K4, B0);
423 aesdec(K5, B0);
424 aesdec(K6, B0);
425 aesdec(K7, B0);
426 aesdec(K8, B0);
427 aesdec(K9, B0);
428 aesdec(K10, B0);
429 aesdec(K11, B0);
430 aesdeclast(K12, B0);
431
432 B0.store_le(out + 16 * i);
433 }
434}
435
436/*
437* AES-192 Key Schedule
438*/
439BOTAN_FUNC_ISA("ssse3,aes") void AES_192::aesni_key_schedule(const uint8_t key[], size_t /*length*/) {
440 m_EK.resize(52);
441 m_DK.resize(52);
442
443 __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
444 __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8));
445 K1 = _mm_srli_si128(K1, 8);
446
447 load_le(m_EK.data(), key, 6);
448
449 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x01), m_EK, 6);
450 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x02), m_EK, 12);
451 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x04), m_EK, 18);
452 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x08), m_EK, 24);
453 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x10), m_EK, 30);
454 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x20), m_EK, 36);
455 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x40), m_EK, 42);
456 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x80), m_EK, 48);
457
458 // Now generate decryption keys
459 const __m128i* EK_mm = reinterpret_cast<const __m128i*>(m_EK.data());
460
461 __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
462 _mm_storeu_si128(DK_mm, _mm_loadu_si128(EK_mm + 12));
463 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
464 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
465 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
466 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
467 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
468 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
469 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
470 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
471 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
472 _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
473 _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
474 _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
475}
476
477/*
478* AES-256 Encryption
479*/
480BOTAN_FUNC_ISA("ssse3,aes") void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
481 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4 * 0]);
482 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4 * 1]);
483 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4 * 2]);
484 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4 * 3]);
485 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4 * 4]);
486 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4 * 5]);
487 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4 * 6]);
488 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4 * 7]);
489 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4 * 8]);
490 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4 * 9]);
491 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4 * 10]);
492 const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_EK[4 * 11]);
493 const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_EK[4 * 12]);
494 const SIMD_4x32 K13 = SIMD_4x32::load_le(&m_EK[4 * 13]);
495 const SIMD_4x32 K14 = SIMD_4x32::load_le(&m_EK[4 * 14]);
496
497 while(blocks >= 4) {
498 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
499 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
500 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
501 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
502
503 keyxor(K0, B0, B1, B2, B3);
504 aesenc(K1, B0, B1, B2, B3);
505 aesenc(K2, B0, B1, B2, B3);
506 aesenc(K3, B0, B1, B2, B3);
507 aesenc(K4, B0, B1, B2, B3);
508 aesenc(K5, B0, B1, B2, B3);
509 aesenc(K6, B0, B1, B2, B3);
510 aesenc(K7, B0, B1, B2, B3);
511 aesenc(K8, B0, B1, B2, B3);
512 aesenc(K9, B0, B1, B2, B3);
513 aesenc(K10, B0, B1, B2, B3);
514 aesenc(K11, B0, B1, B2, B3);
515 aesenc(K12, B0, B1, B2, B3);
516 aesenc(K13, B0, B1, B2, B3);
517 aesenclast(K14, B0, B1, B2, B3);
518
519 B0.store_le(out + 16 * 0);
520 B1.store_le(out + 16 * 1);
521 B2.store_le(out + 16 * 2);
522 B3.store_le(out + 16 * 3);
523
524 blocks -= 4;
525 in += 4 * 16;
526 out += 4 * 16;
527 }
528
529 for(size_t i = 0; i != blocks; ++i) {
530 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
531
532 B0 ^= K0;
533
534 aesenc(K1, B0);
535 aesenc(K2, B0);
536 aesenc(K3, B0);
537 aesenc(K4, B0);
538 aesenc(K5, B0);
539 aesenc(K6, B0);
540 aesenc(K7, B0);
541 aesenc(K8, B0);
542 aesenc(K9, B0);
543 aesenc(K10, B0);
544 aesenc(K11, B0);
545 aesenc(K12, B0);
546 aesenc(K13, B0);
547 aesenclast(K14, B0);
548
549 B0.store_le(out + 16 * i);
550 }
551}
552
553/*
554* AES-256 Decryption
555*/
556BOTAN_FUNC_ISA("ssse3,aes") void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
557 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4 * 0]);
558 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4 * 1]);
559 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4 * 2]);
560 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4 * 3]);
561 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4 * 4]);
562 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4 * 5]);
563 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4 * 6]);
564 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4 * 7]);
565 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4 * 8]);
566 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4 * 9]);
567 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4 * 10]);
568 const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_DK[4 * 11]);
569 const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_DK[4 * 12]);
570 const SIMD_4x32 K13 = SIMD_4x32::load_le(&m_DK[4 * 13]);
571 const SIMD_4x32 K14 = SIMD_4x32::load_le(&m_DK[4 * 14]);
572
573 while(blocks >= 4) {
574 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
575 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
576 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
577 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
578
579 keyxor(K0, B0, B1, B2, B3);
580 aesdec(K1, B0, B1, B2, B3);
581 aesdec(K2, B0, B1, B2, B3);
582 aesdec(K3, B0, B1, B2, B3);
583 aesdec(K4, B0, B1, B2, B3);
584 aesdec(K5, B0, B1, B2, B3);
585 aesdec(K6, B0, B1, B2, B3);
586 aesdec(K7, B0, B1, B2, B3);
587 aesdec(K8, B0, B1, B2, B3);
588 aesdec(K9, B0, B1, B2, B3);
589 aesdec(K10, B0, B1, B2, B3);
590 aesdec(K11, B0, B1, B2, B3);
591 aesdec(K12, B0, B1, B2, B3);
592 aesdec(K13, B0, B1, B2, B3);
593 aesdeclast(K14, B0, B1, B2, B3);
594
595 B0.store_le(out + 16 * 0);
596 B1.store_le(out + 16 * 1);
597 B2.store_le(out + 16 * 2);
598 B3.store_le(out + 16 * 3);
599
600 blocks -= 4;
601 in += 4 * 16;
602 out += 4 * 16;
603 }
604
605 for(size_t i = 0; i != blocks; ++i) {
606 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
607
608 B0 ^= K0;
609
610 aesdec(K1, B0);
611 aesdec(K2, B0);
612 aesdec(K3, B0);
613 aesdec(K4, B0);
614 aesdec(K5, B0);
615 aesdec(K6, B0);
616 aesdec(K7, B0);
617 aesdec(K8, B0);
618 aesdec(K9, B0);
619 aesdec(K10, B0);
620 aesdec(K11, B0);
621 aesdec(K12, B0);
622 aesdec(K13, B0);
623 aesdeclast(K14, B0);
624
625 B0.store_le(out + 16 * i);
626 }
627}
628
629/*
630* AES-256 Key Schedule
631*/
632BOTAN_FUNC_ISA("ssse3,aes") void AES_256::aesni_key_schedule(const uint8_t key[], size_t /*length*/) {
633 m_EK.resize(60);
634 m_DK.resize(60);
635
636 const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
637 const __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16));
638
639 const __m128i K2 = aes_128_key_expansion<0x01>(K0, K1);
640 const __m128i K3 = aes_256_key_expansion(K1, K2);
641
642 const __m128i K4 = aes_128_key_expansion<0x02>(K2, K3);
643 const __m128i K5 = aes_256_key_expansion(K3, K4);
644
645 const __m128i K6 = aes_128_key_expansion<0x04>(K4, K5);
646 const __m128i K7 = aes_256_key_expansion(K5, K6);
647
648 const __m128i K8 = aes_128_key_expansion<0x08>(K6, K7);
649 const __m128i K9 = aes_256_key_expansion(K7, K8);
650
651 const __m128i K10 = aes_128_key_expansion<0x10>(K8, K9);
652 const __m128i K11 = aes_256_key_expansion(K9, K10);
653
654 const __m128i K12 = aes_128_key_expansion<0x20>(K10, K11);
655 const __m128i K13 = aes_256_key_expansion(K11, K12);
656
657 const __m128i K14 = aes_128_key_expansion<0x40>(K12, K13);
658
659 __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
660 _mm_storeu_si128(EK_mm, K0);
661 _mm_storeu_si128(EK_mm + 1, K1);
662 _mm_storeu_si128(EK_mm + 2, K2);
663 _mm_storeu_si128(EK_mm + 3, K3);
664 _mm_storeu_si128(EK_mm + 4, K4);
665 _mm_storeu_si128(EK_mm + 5, K5);
666 _mm_storeu_si128(EK_mm + 6, K6);
667 _mm_storeu_si128(EK_mm + 7, K7);
668 _mm_storeu_si128(EK_mm + 8, K8);
669 _mm_storeu_si128(EK_mm + 9, K9);
670 _mm_storeu_si128(EK_mm + 10, K10);
671 _mm_storeu_si128(EK_mm + 11, K11);
672 _mm_storeu_si128(EK_mm + 12, K12);
673 _mm_storeu_si128(EK_mm + 13, K13);
674 _mm_storeu_si128(EK_mm + 14, K14);
675
676 // Now generate decryption keys
677 __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
678 _mm_storeu_si128(DK_mm, K14);
679 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13));
680 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12));
681 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11));
682 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10));
683 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9));
684 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8));
685 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7));
686 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6));
687 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5));
688 _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
689 _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
690 _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
691 _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
692 _mm_storeu_si128(DK_mm + 14, K0);
693}
694
695} // namespace Botan
void store_le(uint32_t out[4]) const noexcept
Definition simd_32.h:189
static SIMD_4x32 load_le(const void *in) noexcept
Definition simd_32.h:158
#define BOTAN_FUNC_ISA(isa)
Definition compiler.h:92
#define BOTAN_FORCE_INLINE
Definition compiler.h:165
#define BOTAN_FUNC_ISA_INLINE(isa)
Definition compiler.h:98
uint8x16_t uint8x16_t K2
Definition aes_armv8.cpp:32
constexpr auto load_le(ParamTs &&... params)
Definition loadstor.h:458
std::vector< T, secure_allocator< T > > secure_vector
Definition secmem.h:61