Botan 3.10.0
Crypto and TLS for C++
aes_ni.cpp
/*
* AES using AES-NI instructions
* (C) 2009,2012 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/

#include <botan/internal/aes.h>

#include <botan/internal/isa_extn.h>
#include <botan/internal/loadstor.h>
#include <botan/internal/simd_4x32.h>
#include <wmmintrin.h>

namespace Botan {

namespace {

// NOLINTBEGIN(portability-simd-intrinsics)

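// One round of the AES-128 key schedule: AESKEYGENASSIST applies SubWord/RotWord
// and XORs in the round constant RC, the shuffle broadcasts that word to all four
// lanes, and the slli/xor ladder forms the running XOR of the previous round key's
// words, so the final XOR yields the next 128-bit round key.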
template <uint8_t RC>
BOTAN_FN_ISA_AESNI inline __m128i aes_128_key_expansion(__m128i key, __m128i key_getting_rcon) {
   __m128i key_with_rcon = _mm_aeskeygenassist_si128(key_getting_rcon, RC);
   key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3, 3, 3, 3));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   return _mm_xor_si128(key, key_with_rcon);
}

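// One step of the AES-192 key schedule, which yields six 32-bit words per call:
// the full 128-bit key1 block is written at out[offset], and (except for the
// final step at offset 48) the first two words of the updated key2 follow it.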
BOTAN_FN_ISA_AESNI
void aes_192_key_expansion(
   __m128i* K1, __m128i* K2, __m128i key2_with_rcon, secure_vector<uint32_t>& out, size_t offset) {
   __m128i key1 = *K1;
   __m128i key2 = *K2;

   key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1, 1, 1, 1));
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, key2_with_rcon);

   *K1 = key1;
   _mm_storeu_si128(reinterpret_cast<__m128i*>(&out[offset]), key1);

   if(offset == 48) { // last key
      return;
   }

   key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
   key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3, 3, 3, 3)));

   *K2 = key2;
   out[offset + 4] = _mm_cvtsi128_si32(key2);
   out[offset + 5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
}

/*
* The second half of the AES-256 key expansion (other half same as AES-128)
*/
BOTAN_FN_ISA_AESNI __m128i aes_256_key_expansion(__m128i key, __m128i key2) {
   __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
   key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2, 2, 2, 2));

   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   return _mm_xor_si128(key, key_with_rcon);
}

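// Thin wrappers over the AES-NI round instructions operating on SIMD_4x32.
// The four-block overloads apply the same round key to four independent blocks,
// which lets the pipelined AES unit overlap work across blocks.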
BOTAN_FORCE_INLINE void keyxor(SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
   B0 ^= K;
   B1 ^= K;
   B2 ^= K;
   B3 ^= K;
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesenc(SIMD_4x32 K, SIMD_4x32& B) {
   B = SIMD_4x32(_mm_aesenc_si128(B.raw(), K.raw()));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesenc(
   SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
   B0 = SIMD_4x32(_mm_aesenc_si128(B0.raw(), K.raw()));
   B1 = SIMD_4x32(_mm_aesenc_si128(B1.raw(), K.raw()));
   B2 = SIMD_4x32(_mm_aesenc_si128(B2.raw(), K.raw()));
   B3 = SIMD_4x32(_mm_aesenc_si128(B3.raw(), K.raw()));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesenclast(SIMD_4x32 K, SIMD_4x32& B) {
   B = SIMD_4x32(_mm_aesenclast_si128(B.raw(), K.raw()));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesenclast(
   SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
   B0 = SIMD_4x32(_mm_aesenclast_si128(B0.raw(), K.raw()));
   B1 = SIMD_4x32(_mm_aesenclast_si128(B1.raw(), K.raw()));
   B2 = SIMD_4x32(_mm_aesenclast_si128(B2.raw(), K.raw()));
   B3 = SIMD_4x32(_mm_aesenclast_si128(B3.raw(), K.raw()));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesdec(SIMD_4x32 K, SIMD_4x32& B) {
   B = SIMD_4x32(_mm_aesdec_si128(B.raw(), K.raw()));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesdec(
   SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
   B0 = SIMD_4x32(_mm_aesdec_si128(B0.raw(), K.raw()));
   B1 = SIMD_4x32(_mm_aesdec_si128(B1.raw(), K.raw()));
   B2 = SIMD_4x32(_mm_aesdec_si128(B2.raw(), K.raw()));
   B3 = SIMD_4x32(_mm_aesdec_si128(B3.raw(), K.raw()));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesdeclast(SIMD_4x32 K, SIMD_4x32& B) {
   B = SIMD_4x32(_mm_aesdeclast_si128(B.raw(), K.raw()));
}

BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesdeclast(
   SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
   B0 = SIMD_4x32(_mm_aesdeclast_si128(B0.raw(), K.raw()));
   B1 = SIMD_4x32(_mm_aesdeclast_si128(B1.raw(), K.raw()));
   B2 = SIMD_4x32(_mm_aesdeclast_si128(B2.raw(), K.raw()));
   B3 = SIMD_4x32(_mm_aesdeclast_si128(B3.raw(), K.raw()));
}

// NOLINTEND(portability-simd-intrinsics)

} // namespace

/*
* AES-128 Encryption
*/
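// The round keys are loaded into SIMD registers once; the main loop then encrypts
// four blocks per iteration and the scalar tail loop handles any remaining blocks.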
BOTAN_FN_ISA_AESNI void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4 * 0]);
   const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4 * 1]);
   const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4 * 2]);
   const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4 * 3]);
   const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4 * 4]);
   const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4 * 5]);
   const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4 * 6]);
   const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4 * 7]);
   const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4 * 8]);
   const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4 * 9]);
   const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4 * 10]);

   while(blocks >= 4) {
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
      SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
      SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);

      keyxor(K0, B0, B1, B2, B3);
      aesenc(K1, B0, B1, B2, B3);
      aesenc(K2, B0, B1, B2, B3);
      aesenc(K3, B0, B1, B2, B3);
      aesenc(K4, B0, B1, B2, B3);
      aesenc(K5, B0, B1, B2, B3);
      aesenc(K6, B0, B1, B2, B3);
      aesenc(K7, B0, B1, B2, B3);
      aesenc(K8, B0, B1, B2, B3);
      aesenc(K9, B0, B1, B2, B3);
      aesenclast(K10, B0, B1, B2, B3);

      B0.store_le(out + 16 * 0);
      B1.store_le(out + 16 * 1);
      B2.store_le(out + 16 * 2);
      B3.store_le(out + 16 * 3);

      blocks -= 4;
      in += 4 * 16;
      out += 4 * 16;
   }

   for(size_t i = 0; i != blocks; ++i) {
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);

      B0 ^= K0;
      aesenc(K1, B0);
      aesenc(K2, B0);
      aesenc(K3, B0);
      aesenc(K4, B0);
      aesenc(K5, B0);
      aesenc(K6, B0);
      aesenc(K7, B0);
      aesenc(K8, B0);
      aesenc(K9, B0);
      aesenclast(K10, B0);

      B0.store_le(out + 16 * i);
   }
}

/*
* AES-128 Decryption
*/
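// Decryption uses the "equivalent inverse cipher" round keys in m_DK, which the
// key schedule below produces by running the encryption keys through AESIMC
// (InvMixColumns), so the aesdec/aesdeclast sequence mirrors encryption.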
BOTAN_FN_ISA_AESNI void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4 * 0]);
   const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4 * 1]);
   const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4 * 2]);
   const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4 * 3]);
   const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4 * 4]);
   const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4 * 5]);
   const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4 * 6]);
   const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4 * 7]);
   const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4 * 8]);
   const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4 * 9]);
   const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4 * 10]);

   while(blocks >= 4) {
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
      SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
      SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);

      keyxor(K0, B0, B1, B2, B3);
      aesdec(K1, B0, B1, B2, B3);
      aesdec(K2, B0, B1, B2, B3);
      aesdec(K3, B0, B1, B2, B3);
      aesdec(K4, B0, B1, B2, B3);
      aesdec(K5, B0, B1, B2, B3);
      aesdec(K6, B0, B1, B2, B3);
      aesdec(K7, B0, B1, B2, B3);
      aesdec(K8, B0, B1, B2, B3);
      aesdec(K9, B0, B1, B2, B3);
      aesdeclast(K10, B0, B1, B2, B3);

      B0.store_le(out + 16 * 0);
      B1.store_le(out + 16 * 1);
      B2.store_le(out + 16 * 2);
      B3.store_le(out + 16 * 3);

      blocks -= 4;
      in += 4 * 16;
      out += 4 * 16;
   }

   for(size_t i = 0; i != blocks; ++i) {
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);

      B0 ^= K0;
      aesdec(K1, B0);
      aesdec(K2, B0);
      aesdec(K3, B0);
      aesdec(K4, B0);
      aesdec(K5, B0);
      aesdec(K6, B0);
      aesdec(K7, B0);
      aesdec(K8, B0);
      aesdec(K9, B0);
      aesdeclast(K10, B0);

      B0.store_le(out + 16 * i);
   }
}

/*
* AES-128 Key Schedule
*/
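// Round constants 0x01 through 0x36 follow the standard AES-128 schedule. The
// decryption array holds the round keys in reverse order, with _mm_aesimc_si128
// applied to every key except the first and last.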
BOTAN_FN_ISA_AESNI void AES_128::aesni_key_schedule(const uint8_t key[], size_t /*length*/) {
   m_EK.resize(44);
   m_DK.resize(44);

   // NOLINTBEGIN(portability-simd-intrinsics) TODO convert to using SIMD_4x32

   const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
   const __m128i K1 = aes_128_key_expansion<0x01>(K0, K0);
   const __m128i K2 = aes_128_key_expansion<0x02>(K1, K1);
   const __m128i K3 = aes_128_key_expansion<0x04>(K2, K2);
   const __m128i K4 = aes_128_key_expansion<0x08>(K3, K3);
   const __m128i K5 = aes_128_key_expansion<0x10>(K4, K4);
   const __m128i K6 = aes_128_key_expansion<0x20>(K5, K5);
   const __m128i K7 = aes_128_key_expansion<0x40>(K6, K6);
   const __m128i K8 = aes_128_key_expansion<0x80>(K7, K7);
   const __m128i K9 = aes_128_key_expansion<0x1B>(K8, K8);
   const __m128i K10 = aes_128_key_expansion<0x36>(K9, K9);

   __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
   _mm_storeu_si128(EK_mm, K0);
   _mm_storeu_si128(EK_mm + 1, K1);
   _mm_storeu_si128(EK_mm + 2, K2);
   _mm_storeu_si128(EK_mm + 3, K3);
   _mm_storeu_si128(EK_mm + 4, K4);
   _mm_storeu_si128(EK_mm + 5, K5);
   _mm_storeu_si128(EK_mm + 6, K6);
   _mm_storeu_si128(EK_mm + 7, K7);
   _mm_storeu_si128(EK_mm + 8, K8);
   _mm_storeu_si128(EK_mm + 9, K9);
   _mm_storeu_si128(EK_mm + 10, K10);

   // Now generate decryption keys

   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
   _mm_storeu_si128(DK_mm, K10);
   _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9));
   _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8));
   _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7));
   _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6));
   _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5));
   _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4));
   _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3));
   _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2));
   _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1));
   _mm_storeu_si128(DK_mm + 10, K0);

   // NOLINTEND(portability-simd-intrinsics)
}

/*
* AES-192 Encryption
*/
BOTAN_FN_ISA_AESNI void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4 * 0]);
   const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4 * 1]);
   const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4 * 2]);
   const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4 * 3]);
   const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4 * 4]);
   const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4 * 5]);
   const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4 * 6]);
   const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4 * 7]);
   const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4 * 8]);
   const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4 * 9]);
   const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4 * 10]);
   const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_EK[4 * 11]);
   const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_EK[4 * 12]);

   while(blocks >= 4) {
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
      SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
      SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);

      keyxor(K0, B0, B1, B2, B3);
      aesenc(K1, B0, B1, B2, B3);
      aesenc(K2, B0, B1, B2, B3);
      aesenc(K3, B0, B1, B2, B3);
      aesenc(K4, B0, B1, B2, B3);
      aesenc(K5, B0, B1, B2, B3);
      aesenc(K6, B0, B1, B2, B3);
      aesenc(K7, B0, B1, B2, B3);
      aesenc(K8, B0, B1, B2, B3);
      aesenc(K9, B0, B1, B2, B3);
      aesenc(K10, B0, B1, B2, B3);
      aesenc(K11, B0, B1, B2, B3);
      aesenclast(K12, B0, B1, B2, B3);

      B0.store_le(out + 16 * 0);
      B1.store_le(out + 16 * 1);
      B2.store_le(out + 16 * 2);
      B3.store_le(out + 16 * 3);

      blocks -= 4;
      in += 4 * 16;
      out += 4 * 16;
   }

   for(size_t i = 0; i != blocks; ++i) {
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);

      B0 ^= K0;

      aesenc(K1, B0);
      aesenc(K2, B0);
      aesenc(K3, B0);
      aesenc(K4, B0);
      aesenc(K5, B0);
      aesenc(K6, B0);
      aesenc(K7, B0);
      aesenc(K8, B0);
      aesenc(K9, B0);
      aesenc(K10, B0);
      aesenc(K11, B0);
      aesenclast(K12, B0);

      B0.store_le(out + 16 * i);
   }
}

/*
* AES-192 Decryption
*/
BOTAN_FN_ISA_AESNI void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4 * 0]);
   const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4 * 1]);
   const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4 * 2]);
   const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4 * 3]);
   const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4 * 4]);
   const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4 * 5]);
   const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4 * 6]);
   const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4 * 7]);
   const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4 * 8]);
   const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4 * 9]);
   const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4 * 10]);
   const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_DK[4 * 11]);
   const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_DK[4 * 12]);

   while(blocks >= 4) {
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
      SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
      SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);

      keyxor(K0, B0, B1, B2, B3);
      aesdec(K1, B0, B1, B2, B3);
      aesdec(K2, B0, B1, B2, B3);
      aesdec(K3, B0, B1, B2, B3);
      aesdec(K4, B0, B1, B2, B3);
      aesdec(K5, B0, B1, B2, B3);
      aesdec(K6, B0, B1, B2, B3);
      aesdec(K7, B0, B1, B2, B3);
      aesdec(K8, B0, B1, B2, B3);
      aesdec(K9, B0, B1, B2, B3);
      aesdec(K10, B0, B1, B2, B3);
      aesdec(K11, B0, B1, B2, B3);
      aesdeclast(K12, B0, B1, B2, B3);

      B0.store_le(out + 16 * 0);
      B1.store_le(out + 16 * 1);
      B2.store_le(out + 16 * 2);
      B3.store_le(out + 16 * 3);

      blocks -= 4;
      in += 4 * 16;
      out += 4 * 16;
   }

   for(size_t i = 0; i != blocks; ++i) {
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);

      B0 ^= K0;

      aesdec(K1, B0);
      aesdec(K2, B0);
      aesdec(K3, B0);
      aesdec(K4, B0);
      aesdec(K5, B0);
      aesdec(K6, B0);
      aesdec(K7, B0);
      aesdec(K8, B0);
      aesdec(K9, B0);
      aesdec(K10, B0);
      aesdec(K11, B0);
      aesdeclast(K12, B0);

      B0.store_le(out + 16 * i);
   }
}

/*
* AES-192 Key Schedule
*/
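// The first six 32-bit words of the schedule are the 192-bit key itself (copied
// via load_le); each aes_192_key_expansion call then fills the next six words,
// giving the 52 words (13 round keys) AES-192 requires. Decryption keys are the
// reversed round keys, passed through AESIMC except for the outermost two.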
BOTAN_FN_ISA_AESNI void AES_192::aesni_key_schedule(const uint8_t key[], size_t /*length*/) {
   m_EK.resize(52);
   m_DK.resize(52);

   // NOLINTBEGIN(portability-simd-intrinsics) TODO convert to using SIMD_4x32

   __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
   __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8));
   K1 = _mm_srli_si128(K1, 8);

   load_le(m_EK.data(), key, 6);

   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x01), m_EK, 6);
   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x02), m_EK, 12);
   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x04), m_EK, 18);
   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x08), m_EK, 24);
   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x10), m_EK, 30);
   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x20), m_EK, 36);
   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x40), m_EK, 42);
   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x80), m_EK, 48);

   // Now generate decryption keys
   const __m128i* EK_mm = reinterpret_cast<const __m128i*>(m_EK.data());

   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
   _mm_storeu_si128(DK_mm, _mm_loadu_si128(EK_mm + 12));
   _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
   _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
   _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
   _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
   _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
   _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
   _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
   _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
   _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
   _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
   _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
   _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));

   // NOLINTEND(portability-simd-intrinsics)
}

/*
* AES-256 Encryption
*/
BOTAN_FN_ISA_AESNI void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4 * 0]);
   const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4 * 1]);
   const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4 * 2]);
   const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4 * 3]);
   const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4 * 4]);
   const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4 * 5]);
   const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4 * 6]);
   const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4 * 7]);
   const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4 * 8]);
   const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4 * 9]);
   const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4 * 10]);
   const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_EK[4 * 11]);
   const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_EK[4 * 12]);
   const SIMD_4x32 K13 = SIMD_4x32::load_le(&m_EK[4 * 13]);
   const SIMD_4x32 K14 = SIMD_4x32::load_le(&m_EK[4 * 14]);

   while(blocks >= 4) {
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
      SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
      SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);

      keyxor(K0, B0, B1, B2, B3);
      aesenc(K1, B0, B1, B2, B3);
      aesenc(K2, B0, B1, B2, B3);
      aesenc(K3, B0, B1, B2, B3);
      aesenc(K4, B0, B1, B2, B3);
      aesenc(K5, B0, B1, B2, B3);
      aesenc(K6, B0, B1, B2, B3);
      aesenc(K7, B0, B1, B2, B3);
      aesenc(K8, B0, B1, B2, B3);
      aesenc(K9, B0, B1, B2, B3);
      aesenc(K10, B0, B1, B2, B3);
      aesenc(K11, B0, B1, B2, B3);
      aesenc(K12, B0, B1, B2, B3);
      aesenc(K13, B0, B1, B2, B3);
      aesenclast(K14, B0, B1, B2, B3);

      B0.store_le(out + 16 * 0);
      B1.store_le(out + 16 * 1);
      B2.store_le(out + 16 * 2);
      B3.store_le(out + 16 * 3);

      blocks -= 4;
      in += 4 * 16;
      out += 4 * 16;
   }

   for(size_t i = 0; i != blocks; ++i) {
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);

      B0 ^= K0;

      aesenc(K1, B0);
      aesenc(K2, B0);
      aesenc(K3, B0);
      aesenc(K4, B0);
      aesenc(K5, B0);
      aesenc(K6, B0);
      aesenc(K7, B0);
      aesenc(K8, B0);
      aesenc(K9, B0);
      aesenc(K10, B0);
      aesenc(K11, B0);
      aesenc(K12, B0);
      aesenc(K13, B0);
      aesenclast(K14, B0);

      B0.store_le(out + 16 * i);
   }
}

/*
* AES-256 Decryption
*/
BOTAN_FN_ISA_AESNI void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
   const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4 * 0]);
   const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4 * 1]);
   const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4 * 2]);
   const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4 * 3]);
   const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4 * 4]);
   const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4 * 5]);
   const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4 * 6]);
   const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4 * 7]);
   const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4 * 8]);
   const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4 * 9]);
   const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4 * 10]);
   const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_DK[4 * 11]);
   const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_DK[4 * 12]);
   const SIMD_4x32 K13 = SIMD_4x32::load_le(&m_DK[4 * 13]);
   const SIMD_4x32 K14 = SIMD_4x32::load_le(&m_DK[4 * 14]);

   while(blocks >= 4) {
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
      SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
      SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
      SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);

      keyxor(K0, B0, B1, B2, B3);
      aesdec(K1, B0, B1, B2, B3);
      aesdec(K2, B0, B1, B2, B3);
      aesdec(K3, B0, B1, B2, B3);
      aesdec(K4, B0, B1, B2, B3);
      aesdec(K5, B0, B1, B2, B3);
      aesdec(K6, B0, B1, B2, B3);
      aesdec(K7, B0, B1, B2, B3);
      aesdec(K8, B0, B1, B2, B3);
      aesdec(K9, B0, B1, B2, B3);
      aesdec(K10, B0, B1, B2, B3);
      aesdec(K11, B0, B1, B2, B3);
      aesdec(K12, B0, B1, B2, B3);
      aesdec(K13, B0, B1, B2, B3);
      aesdeclast(K14, B0, B1, B2, B3);

      B0.store_le(out + 16 * 0);
      B1.store_le(out + 16 * 1);
      B2.store_le(out + 16 * 2);
      B3.store_le(out + 16 * 3);

      blocks -= 4;
      in += 4 * 16;
      out += 4 * 16;
   }

   for(size_t i = 0; i != blocks; ++i) {
      SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);

      B0 ^= K0;

      aesdec(K1, B0);
      aesdec(K2, B0);
      aesdec(K3, B0);
      aesdec(K4, B0);
      aesdec(K5, B0);
      aesdec(K6, B0);
      aesdec(K7, B0);
      aesdec(K8, B0);
      aesdec(K9, B0);
      aesdec(K10, B0);
      aesdec(K11, B0);
      aesdec(K12, B0);
      aesdec(K13, B0);
      aesdeclast(K14, B0);

      B0.store_le(out + 16 * i);
   }
}

/*
* AES-256 Key Schedule
*/
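// AES-256 alternates aes_128_key_expansion (RotWord + SubWord plus a round
// constant) with aes_256_key_expansion (SubWord only, rcon 0x00), producing the
// 15 round keys of the standard schedule; decryption keys are again the reversed
// round keys run through AESIMC.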
BOTAN_FN_ISA_AESNI void AES_256::aesni_key_schedule(const uint8_t key[], size_t /*length*/) {
   m_EK.resize(60);
   m_DK.resize(60);

   // NOLINTBEGIN(portability-simd-intrinsics) TODO convert to using SIMD_4x32

   const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
   const __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16));

   const __m128i K2 = aes_128_key_expansion<0x01>(K0, K1);
   const __m128i K3 = aes_256_key_expansion(K1, K2);

   const __m128i K4 = aes_128_key_expansion<0x02>(K2, K3);
   const __m128i K5 = aes_256_key_expansion(K3, K4);

   const __m128i K6 = aes_128_key_expansion<0x04>(K4, K5);
   const __m128i K7 = aes_256_key_expansion(K5, K6);

   const __m128i K8 = aes_128_key_expansion<0x08>(K6, K7);
   const __m128i K9 = aes_256_key_expansion(K7, K8);

   const __m128i K10 = aes_128_key_expansion<0x10>(K8, K9);
   const __m128i K11 = aes_256_key_expansion(K9, K10);

   const __m128i K12 = aes_128_key_expansion<0x20>(K10, K11);
   const __m128i K13 = aes_256_key_expansion(K11, K12);

   const __m128i K14 = aes_128_key_expansion<0x40>(K12, K13);

   __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
   _mm_storeu_si128(EK_mm, K0);
   _mm_storeu_si128(EK_mm + 1, K1);
   _mm_storeu_si128(EK_mm + 2, K2);
   _mm_storeu_si128(EK_mm + 3, K3);
   _mm_storeu_si128(EK_mm + 4, K4);
   _mm_storeu_si128(EK_mm + 5, K5);
   _mm_storeu_si128(EK_mm + 6, K6);
   _mm_storeu_si128(EK_mm + 7, K7);
   _mm_storeu_si128(EK_mm + 8, K8);
   _mm_storeu_si128(EK_mm + 9, K9);
   _mm_storeu_si128(EK_mm + 10, K10);
   _mm_storeu_si128(EK_mm + 11, K11);
   _mm_storeu_si128(EK_mm + 12, K12);
   _mm_storeu_si128(EK_mm + 13, K13);
   _mm_storeu_si128(EK_mm + 14, K14);

   // Now generate decryption keys
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
   _mm_storeu_si128(DK_mm, K14);
   _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13));
   _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12));
   _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11));
   _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10));
   _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9));
   _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8));
   _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7));
   _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6));
   _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5));
   _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
   _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
   _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
   _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
   _mm_storeu_si128(DK_mm + 14, K0);

   // NOLINTEND(portability-simd-intrinsics)
}

} // namespace Botan
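
These hw_aes_* and aesni_key_schedule routines are internal; applications normally reach them through Botan's generic BlockCipher interface, which selects the AES-NI implementation at runtime when the CPU supports it. A minimal sketch of that path, assuming an all-zero key and block purely for illustration:

#include <botan/block_cipher.h>

#include <array>
#include <cstdint>

int main() {
   // Hypothetical all-zero key and plaintext block, used only to exercise the API
   std::array<uint8_t, 16> key{};
   std::array<uint8_t, 16> block{};

   auto aes = Botan::BlockCipher::create_or_throw("AES-128");
   aes->set_key(key.data(), key.size());           // runs the key schedule
   aes->encrypt_n(block.data(), block.data(), 1);  // encrypts one 16-byte block in place
   return 0;
}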