Botan 3.9.0
Crypto and TLS for C++
aes_ni.cpp
Go to the documentation of this file.
1/*
2* AES using AES-NI instructions
3* (C) 2009,2012 Jack Lloyd
4*
5* Botan is released under the Simplified BSD License (see license.txt)
6*/
7
8#include <botan/internal/aes.h>
9
10#include <botan/internal/isa_extn.h>
11#include <botan/internal/loadstor.h>
12#include <botan/internal/simd_4x32.h>
13#include <wmmintrin.h>
14
15namespace Botan {
16
17namespace {
18
19template <uint8_t RC>
20BOTAN_FN_ISA_AESNI inline __m128i aes_128_key_expansion(__m128i key, __m128i key_getting_rcon) {
21 __m128i key_with_rcon = _mm_aeskeygenassist_si128(key_getting_rcon, RC);
22 key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3, 3, 3, 3));
23 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
24 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
25 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
26 return _mm_xor_si128(key, key_with_rcon);
27}
28
29BOTAN_FN_ISA_AESNI
30void aes_192_key_expansion(
31 __m128i* K1, __m128i* K2, __m128i key2_with_rcon, secure_vector<uint32_t>& out, size_t offset) {
32 __m128i key1 = *K1;
33 __m128i key2 = *K2;
34
35 key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1, 1, 1, 1));
36 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
37 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
38 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
39 key1 = _mm_xor_si128(key1, key2_with_rcon);
40
41 *K1 = key1;
42 _mm_storeu_si128(reinterpret_cast<__m128i*>(&out[offset]), key1);
43
44 if(offset == 48) { // last key
45 return;
46 }
47
48 key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
49 key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3, 3, 3, 3)));
50
51 *K2 = key2;
52 out[offset + 4] = _mm_cvtsi128_si32(key2);
53 out[offset + 5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
54}
55
56/*
57* The second half of the AES-256 key expansion (other half same as AES-128)
58*/
59BOTAN_FN_ISA_AESNI __m128i aes_256_key_expansion(__m128i key, __m128i key2) {
60 __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
61 key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2, 2, 2, 2));
62
63 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
64 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
65 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
66 return _mm_xor_si128(key, key_with_rcon);
67}
68
69BOTAN_FORCE_INLINE void keyxor(SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
70 B0 ^= K;
71 B1 ^= K;
72 B2 ^= K;
73 B3 ^= K;
74}
75
76BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesenc(SIMD_4x32 K, SIMD_4x32& B) {
77 B = SIMD_4x32(_mm_aesenc_si128(B.raw(), K.raw()));
78}
79
80BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesenc(
81 SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
82 B0 = SIMD_4x32(_mm_aesenc_si128(B0.raw(), K.raw()));
83 B1 = SIMD_4x32(_mm_aesenc_si128(B1.raw(), K.raw()));
84 B2 = SIMD_4x32(_mm_aesenc_si128(B2.raw(), K.raw()));
85 B3 = SIMD_4x32(_mm_aesenc_si128(B3.raw(), K.raw()));
86}
87
88BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesenclast(SIMD_4x32 K, SIMD_4x32& B) {
89 B = SIMD_4x32(_mm_aesenclast_si128(B.raw(), K.raw()));
90}
91
92BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesenclast(
93 SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
94 B0 = SIMD_4x32(_mm_aesenclast_si128(B0.raw(), K.raw()));
95 B1 = SIMD_4x32(_mm_aesenclast_si128(B1.raw(), K.raw()));
96 B2 = SIMD_4x32(_mm_aesenclast_si128(B2.raw(), K.raw()));
97 B3 = SIMD_4x32(_mm_aesenclast_si128(B3.raw(), K.raw()));
98}
99
100BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesdec(SIMD_4x32 K, SIMD_4x32& B) {
101 B = SIMD_4x32(_mm_aesdec_si128(B.raw(), K.raw()));
102}
103
104BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesdec(
105 SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
106 B0 = SIMD_4x32(_mm_aesdec_si128(B0.raw(), K.raw()));
107 B1 = SIMD_4x32(_mm_aesdec_si128(B1.raw(), K.raw()));
108 B2 = SIMD_4x32(_mm_aesdec_si128(B2.raw(), K.raw()));
109 B3 = SIMD_4x32(_mm_aesdec_si128(B3.raw(), K.raw()));
110}
111
112BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesdeclast(SIMD_4x32 K, SIMD_4x32& B) {
113 B = SIMD_4x32(_mm_aesdeclast_si128(B.raw(), K.raw()));
114}
115
116BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesdeclast(
117 SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
118 B0 = SIMD_4x32(_mm_aesdeclast_si128(B0.raw(), K.raw()));
119 B1 = SIMD_4x32(_mm_aesdeclast_si128(B1.raw(), K.raw()));
120 B2 = SIMD_4x32(_mm_aesdeclast_si128(B2.raw(), K.raw()));
121 B3 = SIMD_4x32(_mm_aesdeclast_si128(B3.raw(), K.raw()));
122}
123
124} // namespace
125
126/*
127* AES-128 Encryption
128*/
129BOTAN_FN_ISA_AESNI void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
130 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4 * 0]);
131 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4 * 1]);
132 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4 * 2]);
133 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4 * 3]);
134 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4 * 4]);
135 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4 * 5]);
136 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4 * 6]);
137 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4 * 7]);
138 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4 * 8]);
139 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4 * 9]);
140 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4 * 10]);
141
142 while(blocks >= 4) {
143 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
144 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
145 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
146 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
147
148 keyxor(K0, B0, B1, B2, B3);
149 aesenc(K1, B0, B1, B2, B3);
150 aesenc(K2, B0, B1, B2, B3);
151 aesenc(K3, B0, B1, B2, B3);
152 aesenc(K4, B0, B1, B2, B3);
153 aesenc(K5, B0, B1, B2, B3);
154 aesenc(K6, B0, B1, B2, B3);
155 aesenc(K7, B0, B1, B2, B3);
156 aesenc(K8, B0, B1, B2, B3);
157 aesenc(K9, B0, B1, B2, B3);
158 aesenclast(K10, B0, B1, B2, B3);
159
160 B0.store_le(out + 16 * 0);
161 B1.store_le(out + 16 * 1);
162 B2.store_le(out + 16 * 2);
163 B3.store_le(out + 16 * 3);
164
165 blocks -= 4;
166 in += 4 * 16;
167 out += 4 * 16;
168 }
169
170 for(size_t i = 0; i != blocks; ++i) {
171 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
172
173 B0 ^= K0;
174 aesenc(K1, B0);
175 aesenc(K2, B0);
176 aesenc(K3, B0);
177 aesenc(K4, B0);
178 aesenc(K5, B0);
179 aesenc(K6, B0);
180 aesenc(K7, B0);
181 aesenc(K8, B0);
182 aesenc(K9, B0);
183 aesenclast(K10, B0);
184
185 B0.store_le(out + 16 * i);
186 }
187}
188
189/*
190* AES-128 Decryption
191*/
192BOTAN_FN_ISA_AESNI void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
193 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4 * 0]);
194 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4 * 1]);
195 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4 * 2]);
196 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4 * 3]);
197 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4 * 4]);
198 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4 * 5]);
199 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4 * 6]);
200 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4 * 7]);
201 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4 * 8]);
202 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4 * 9]);
203 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4 * 10]);
204
205 while(blocks >= 4) {
206 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
207 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
208 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
209 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
210
211 keyxor(K0, B0, B1, B2, B3);
212 aesdec(K1, B0, B1, B2, B3);
213 aesdec(K2, B0, B1, B2, B3);
214 aesdec(K3, B0, B1, B2, B3);
215 aesdec(K4, B0, B1, B2, B3);
216 aesdec(K5, B0, B1, B2, B3);
217 aesdec(K6, B0, B1, B2, B3);
218 aesdec(K7, B0, B1, B2, B3);
219 aesdec(K8, B0, B1, B2, B3);
220 aesdec(K9, B0, B1, B2, B3);
221 aesdeclast(K10, B0, B1, B2, B3);
222
223 B0.store_le(out + 16 * 0);
224 B1.store_le(out + 16 * 1);
225 B2.store_le(out + 16 * 2);
226 B3.store_le(out + 16 * 3);
227
228 blocks -= 4;
229 in += 4 * 16;
230 out += 4 * 16;
231 }
232
233 for(size_t i = 0; i != blocks; ++i) {
234 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
235
236 B0 ^= K0;
237 aesdec(K1, B0);
238 aesdec(K2, B0);
239 aesdec(K3, B0);
240 aesdec(K4, B0);
241 aesdec(K5, B0);
242 aesdec(K6, B0);
243 aesdec(K7, B0);
244 aesdec(K8, B0);
245 aesdec(K9, B0);
246 aesdeclast(K10, B0);
247
248 B0.store_le(out + 16 * i);
249 }
250}
251
252/*
253* AES-128 Key Schedule
254*/
255BOTAN_FN_ISA_AESNI void AES_128::aesni_key_schedule(const uint8_t key[], size_t /*length*/) {
256 m_EK.resize(44);
257 m_DK.resize(44);
258
259 const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
260 const __m128i K1 = aes_128_key_expansion<0x01>(K0, K0);
261 const __m128i K2 = aes_128_key_expansion<0x02>(K1, K1);
262 const __m128i K3 = aes_128_key_expansion<0x04>(K2, K2);
263 const __m128i K4 = aes_128_key_expansion<0x08>(K3, K3);
264 const __m128i K5 = aes_128_key_expansion<0x10>(K4, K4);
265 const __m128i K6 = aes_128_key_expansion<0x20>(K5, K5);
266 const __m128i K7 = aes_128_key_expansion<0x40>(K6, K6);
267 const __m128i K8 = aes_128_key_expansion<0x80>(K7, K7);
268 const __m128i K9 = aes_128_key_expansion<0x1B>(K8, K8);
269 const __m128i K10 = aes_128_key_expansion<0x36>(K9, K9);
270
271 __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
272 _mm_storeu_si128(EK_mm, K0);
273 _mm_storeu_si128(EK_mm + 1, K1);
274 _mm_storeu_si128(EK_mm + 2, K2);
275 _mm_storeu_si128(EK_mm + 3, K3);
276 _mm_storeu_si128(EK_mm + 4, K4);
277 _mm_storeu_si128(EK_mm + 5, K5);
278 _mm_storeu_si128(EK_mm + 6, K6);
279 _mm_storeu_si128(EK_mm + 7, K7);
280 _mm_storeu_si128(EK_mm + 8, K8);
281 _mm_storeu_si128(EK_mm + 9, K9);
282 _mm_storeu_si128(EK_mm + 10, K10);
283
284 // Now generate decryption keys
285
286 __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
287 _mm_storeu_si128(DK_mm, K10);
288 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9));
289 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8));
290 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7));
291 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6));
292 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5));
293 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4));
294 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3));
295 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2));
296 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1));
297 _mm_storeu_si128(DK_mm + 10, K0);
298}
299
300/*
301* AES-192 Encryption
302*/
303BOTAN_FN_ISA_AESNI void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
304 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4 * 0]);
305 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4 * 1]);
306 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4 * 2]);
307 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4 * 3]);
308 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4 * 4]);
309 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4 * 5]);
310 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4 * 6]);
311 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4 * 7]);
312 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4 * 8]);
313 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4 * 9]);
314 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4 * 10]);
315 const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_EK[4 * 11]);
316 const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_EK[4 * 12]);
317
318 while(blocks >= 4) {
319 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
320 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
321 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
322 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
323
324 keyxor(K0, B0, B1, B2, B3);
325 aesenc(K1, B0, B1, B2, B3);
326 aesenc(K2, B0, B1, B2, B3);
327 aesenc(K3, B0, B1, B2, B3);
328 aesenc(K4, B0, B1, B2, B3);
329 aesenc(K5, B0, B1, B2, B3);
330 aesenc(K6, B0, B1, B2, B3);
331 aesenc(K7, B0, B1, B2, B3);
332 aesenc(K8, B0, B1, B2, B3);
333 aesenc(K9, B0, B1, B2, B3);
334 aesenc(K10, B0, B1, B2, B3);
335 aesenc(K11, B0, B1, B2, B3);
336 aesenclast(K12, B0, B1, B2, B3);
337
338 B0.store_le(out + 16 * 0);
339 B1.store_le(out + 16 * 1);
340 B2.store_le(out + 16 * 2);
341 B3.store_le(out + 16 * 3);
342
343 blocks -= 4;
344 in += 4 * 16;
345 out += 4 * 16;
346 }
347
348 for(size_t i = 0; i != blocks; ++i) {
349 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
350
351 B0 ^= K0;
352
353 aesenc(K1, B0);
354 aesenc(K2, B0);
355 aesenc(K3, B0);
356 aesenc(K4, B0);
357 aesenc(K5, B0);
358 aesenc(K6, B0);
359 aesenc(K7, B0);
360 aesenc(K8, B0);
361 aesenc(K9, B0);
362 aesenc(K10, B0);
363 aesenc(K11, B0);
364 aesenclast(K12, B0);
365
366 B0.store_le(out + 16 * i);
367 }
368}
369
370/*
371* AES-192 Decryption
372*/
373BOTAN_FN_ISA_AESNI void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
374 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4 * 0]);
375 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4 * 1]);
376 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4 * 2]);
377 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4 * 3]);
378 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4 * 4]);
379 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4 * 5]);
380 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4 * 6]);
381 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4 * 7]);
382 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4 * 8]);
383 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4 * 9]);
384 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4 * 10]);
385 const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_DK[4 * 11]);
386 const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_DK[4 * 12]);
387
388 while(blocks >= 4) {
389 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
390 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
391 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
392 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
393
394 keyxor(K0, B0, B1, B2, B3);
395 aesdec(K1, B0, B1, B2, B3);
396 aesdec(K2, B0, B1, B2, B3);
397 aesdec(K3, B0, B1, B2, B3);
398 aesdec(K4, B0, B1, B2, B3);
399 aesdec(K5, B0, B1, B2, B3);
400 aesdec(K6, B0, B1, B2, B3);
401 aesdec(K7, B0, B1, B2, B3);
402 aesdec(K8, B0, B1, B2, B3);
403 aesdec(K9, B0, B1, B2, B3);
404 aesdec(K10, B0, B1, B2, B3);
405 aesdec(K11, B0, B1, B2, B3);
406 aesdeclast(K12, B0, B1, B2, B3);
407
408 B0.store_le(out + 16 * 0);
409 B1.store_le(out + 16 * 1);
410 B2.store_le(out + 16 * 2);
411 B3.store_le(out + 16 * 3);
412
413 blocks -= 4;
414 in += 4 * 16;
415 out += 4 * 16;
416 }
417
418 for(size_t i = 0; i != blocks; ++i) {
419 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
420
421 B0 ^= K0;
422
423 aesdec(K1, B0);
424 aesdec(K2, B0);
425 aesdec(K3, B0);
426 aesdec(K4, B0);
427 aesdec(K5, B0);
428 aesdec(K6, B0);
429 aesdec(K7, B0);
430 aesdec(K8, B0);
431 aesdec(K9, B0);
432 aesdec(K10, B0);
433 aesdec(K11, B0);
434 aesdeclast(K12, B0);
435
436 B0.store_le(out + 16 * i);
437 }
438}
439
440/*
441* AES-192 Key Schedule
442*/
443BOTAN_FN_ISA_AESNI void AES_192::aesni_key_schedule(const uint8_t key[], size_t /*length*/) {
444 m_EK.resize(52);
445 m_DK.resize(52);
446
447 __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
448 __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8));
449 K1 = _mm_srli_si128(K1, 8);
450
451 load_le(m_EK.data(), key, 6);
452
453 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x01), m_EK, 6);
454 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x02), m_EK, 12);
455 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x04), m_EK, 18);
456 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x08), m_EK, 24);
457 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x10), m_EK, 30);
458 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x20), m_EK, 36);
459 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x40), m_EK, 42);
460 aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x80), m_EK, 48);
461
462 // Now generate decryption keys
463 const __m128i* EK_mm = reinterpret_cast<const __m128i*>(m_EK.data());
464
465 __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
466 _mm_storeu_si128(DK_mm, _mm_loadu_si128(EK_mm + 12));
467 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
468 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
469 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
470 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
471 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
472 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
473 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
474 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
475 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
476 _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
477 _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
478 _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
479}
480
481/*
482* AES-256 Encryption
483*/
484BOTAN_FN_ISA_AESNI void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
485 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4 * 0]);
486 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4 * 1]);
487 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4 * 2]);
488 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4 * 3]);
489 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4 * 4]);
490 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4 * 5]);
491 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4 * 6]);
492 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4 * 7]);
493 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4 * 8]);
494 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4 * 9]);
495 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4 * 10]);
496 const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_EK[4 * 11]);
497 const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_EK[4 * 12]);
498 const SIMD_4x32 K13 = SIMD_4x32::load_le(&m_EK[4 * 13]);
499 const SIMD_4x32 K14 = SIMD_4x32::load_le(&m_EK[4 * 14]);
500
501 while(blocks >= 4) {
502 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
503 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
504 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
505 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
506
507 keyxor(K0, B0, B1, B2, B3);
508 aesenc(K1, B0, B1, B2, B3);
509 aesenc(K2, B0, B1, B2, B3);
510 aesenc(K3, B0, B1, B2, B3);
511 aesenc(K4, B0, B1, B2, B3);
512 aesenc(K5, B0, B1, B2, B3);
513 aesenc(K6, B0, B1, B2, B3);
514 aesenc(K7, B0, B1, B2, B3);
515 aesenc(K8, B0, B1, B2, B3);
516 aesenc(K9, B0, B1, B2, B3);
517 aesenc(K10, B0, B1, B2, B3);
518 aesenc(K11, B0, B1, B2, B3);
519 aesenc(K12, B0, B1, B2, B3);
520 aesenc(K13, B0, B1, B2, B3);
521 aesenclast(K14, B0, B1, B2, B3);
522
523 B0.store_le(out + 16 * 0);
524 B1.store_le(out + 16 * 1);
525 B2.store_le(out + 16 * 2);
526 B3.store_le(out + 16 * 3);
527
528 blocks -= 4;
529 in += 4 * 16;
530 out += 4 * 16;
531 }
532
533 for(size_t i = 0; i != blocks; ++i) {
534 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
535
536 B0 ^= K0;
537
538 aesenc(K1, B0);
539 aesenc(K2, B0);
540 aesenc(K3, B0);
541 aesenc(K4, B0);
542 aesenc(K5, B0);
543 aesenc(K6, B0);
544 aesenc(K7, B0);
545 aesenc(K8, B0);
546 aesenc(K9, B0);
547 aesenc(K10, B0);
548 aesenc(K11, B0);
549 aesenc(K12, B0);
550 aesenc(K13, B0);
551 aesenclast(K14, B0);
552
553 B0.store_le(out + 16 * i);
554 }
555}
556
557/*
558* AES-256 Decryption
559*/
560BOTAN_FN_ISA_AESNI void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
561 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4 * 0]);
562 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4 * 1]);
563 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4 * 2]);
564 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4 * 3]);
565 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4 * 4]);
566 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4 * 5]);
567 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4 * 6]);
568 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4 * 7]);
569 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4 * 8]);
570 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4 * 9]);
571 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4 * 10]);
572 const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_DK[4 * 11]);
573 const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_DK[4 * 12]);
574 const SIMD_4x32 K13 = SIMD_4x32::load_le(&m_DK[4 * 13]);
575 const SIMD_4x32 K14 = SIMD_4x32::load_le(&m_DK[4 * 14]);
576
577 while(blocks >= 4) {
578 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
579 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
580 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
581 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
582
583 keyxor(K0, B0, B1, B2, B3);
584 aesdec(K1, B0, B1, B2, B3);
585 aesdec(K2, B0, B1, B2, B3);
586 aesdec(K3, B0, B1, B2, B3);
587 aesdec(K4, B0, B1, B2, B3);
588 aesdec(K5, B0, B1, B2, B3);
589 aesdec(K6, B0, B1, B2, B3);
590 aesdec(K7, B0, B1, B2, B3);
591 aesdec(K8, B0, B1, B2, B3);
592 aesdec(K9, B0, B1, B2, B3);
593 aesdec(K10, B0, B1, B2, B3);
594 aesdec(K11, B0, B1, B2, B3);
595 aesdec(K12, B0, B1, B2, B3);
596 aesdec(K13, B0, B1, B2, B3);
597 aesdeclast(K14, B0, B1, B2, B3);
598
599 B0.store_le(out + 16 * 0);
600 B1.store_le(out + 16 * 1);
601 B2.store_le(out + 16 * 2);
602 B3.store_le(out + 16 * 3);
603
604 blocks -= 4;
605 in += 4 * 16;
606 out += 4 * 16;
607 }
608
609 for(size_t i = 0; i != blocks; ++i) {
610 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
611
612 B0 ^= K0;
613
614 aesdec(K1, B0);
615 aesdec(K2, B0);
616 aesdec(K3, B0);
617 aesdec(K4, B0);
618 aesdec(K5, B0);
619 aesdec(K6, B0);
620 aesdec(K7, B0);
621 aesdec(K8, B0);
622 aesdec(K9, B0);
623 aesdec(K10, B0);
624 aesdec(K11, B0);
625 aesdec(K12, B0);
626 aesdec(K13, B0);
627 aesdeclast(K14, B0);
628
629 B0.store_le(out + 16 * i);
630 }
631}
632
633/*
634* AES-256 Key Schedule
635*/
636BOTAN_FN_ISA_AESNI void AES_256::aesni_key_schedule(const uint8_t key[], size_t /*length*/) {
637 m_EK.resize(60);
638 m_DK.resize(60);
639
640 const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
641 const __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16));
642
643 const __m128i K2 = aes_128_key_expansion<0x01>(K0, K1);
644 const __m128i K3 = aes_256_key_expansion(K1, K2);
645
646 const __m128i K4 = aes_128_key_expansion<0x02>(K2, K3);
647 const __m128i K5 = aes_256_key_expansion(K3, K4);
648
649 const __m128i K6 = aes_128_key_expansion<0x04>(K4, K5);
650 const __m128i K7 = aes_256_key_expansion(K5, K6);
651
652 const __m128i K8 = aes_128_key_expansion<0x08>(K6, K7);
653 const __m128i K9 = aes_256_key_expansion(K7, K8);
654
655 const __m128i K10 = aes_128_key_expansion<0x10>(K8, K9);
656 const __m128i K11 = aes_256_key_expansion(K9, K10);
657
658 const __m128i K12 = aes_128_key_expansion<0x20>(K10, K11);
659 const __m128i K13 = aes_256_key_expansion(K11, K12);
660
661 const __m128i K14 = aes_128_key_expansion<0x40>(K12, K13);
662
663 __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
664 _mm_storeu_si128(EK_mm, K0);
665 _mm_storeu_si128(EK_mm + 1, K1);
666 _mm_storeu_si128(EK_mm + 2, K2);
667 _mm_storeu_si128(EK_mm + 3, K3);
668 _mm_storeu_si128(EK_mm + 4, K4);
669 _mm_storeu_si128(EK_mm + 5, K5);
670 _mm_storeu_si128(EK_mm + 6, K6);
671 _mm_storeu_si128(EK_mm + 7, K7);
672 _mm_storeu_si128(EK_mm + 8, K8);
673 _mm_storeu_si128(EK_mm + 9, K9);
674 _mm_storeu_si128(EK_mm + 10, K10);
675 _mm_storeu_si128(EK_mm + 11, K11);
676 _mm_storeu_si128(EK_mm + 12, K12);
677 _mm_storeu_si128(EK_mm + 13, K13);
678 _mm_storeu_si128(EK_mm + 14, K14);
679
680 // Now generate decryption keys
681 __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
682 _mm_storeu_si128(DK_mm, K14);
683 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13));
684 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12));
685 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11));
686 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10));
687 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9));
688 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8));
689 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7));
690 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6));
691 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5));
692 _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
693 _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
694 _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
695 _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
696 _mm_storeu_si128(DK_mm + 14, K0);
697}
698
699} // namespace Botan
static SIMD_4x32 load_le(const void *in) noexcept
Definition simd_4x32.h:149
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
constexpr uint32_t K1
Definition sha1_f.h:16
constexpr uint32_t K4
Definition sha1_f.h:19
constexpr uint32_t K3
Definition sha1_f.h:18
constexpr uint32_t K2
Definition sha1_f.h:17
constexpr auto load_le(ParamTs &&... params)
Definition loadstor.h:495
std::vector< T, secure_allocator< T > > secure_vector
Definition secmem.h:69