Botan 3.11.0
Crypto and TLS for C++
aes_ni.cpp
Go to the documentation of this file.
1/*
2* AES using AES-NI instructions
3* (C) 2009,2012 Jack Lloyd
4*
5* Botan is released under the Simplified BSD License (see license.txt)
6*/
7
8#include <botan/internal/aes.h>
9
10#include <botan/internal/isa_extn.h>
11#include <botan/internal/loadstor.h>
12#include <botan/internal/simd_4x32.h>
13#include <wmmintrin.h>
14
15namespace Botan {
16
17namespace {
18
19// NOLINTBEGIN(portability-simd-intrinsics)
20
21template <uint8_t RC>
22BOTAN_FN_ISA_AESNI inline __m128i aes_128_key_expansion(__m128i key, __m128i key_getting_rcon) {
23 __m128i key_with_rcon = _mm_aeskeygenassist_si128(key_getting_rcon, RC);
24 key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3, 3, 3, 3));
25 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
26 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
27 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
28 return _mm_xor_si128(key, key_with_rcon);
29}
30
/*
* Compute the next six words of the AES-192 key schedule.
*
* *K1 holds the current low 128 bits of the expansion state and *K2 the
* (zero-extended) high 64 bits; key2_with_rcon is the raw result of
* AESKEYGENASSIST applied to *K2. On return both are advanced and the
* new words have been written to out starting at word index `offset`.
*/
BOTAN_FN_ISA_AESNI
void aes_192_key_expansion(
   __m128i* K1, __m128i* K2, __m128i key2_with_rcon, secure_vector<uint32_t>& out, size_t offset) {
   __m128i key1 = *K1;
   __m128i key2 = *K2;

   // Word 1 of the assist result is RotWord(SubWord(w1)) ^ RC; broadcast it
   key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1, 1, 1, 1));
   // Propagate each word of key1 into the following words
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, key2_with_rcon);

   *K1 = key1;
   _mm_storeu_si128(reinterpret_cast<__m128i*>(&out[offset]), key1);

   // The final expansion step contributes only four words (52 words total)
   if(offset == 48) { // last key
      return;
   }

   // Update the high half: propagate then mix in the top word of the new key1
   key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
   key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3, 3, 3, 3)));

   *K2 = key2;
   // Only the low two words of key2 belong to the schedule
   out[offset + 4] = _mm_cvtsi128_si32(key2);
   out[offset + 5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
}
57
58/*
59* The second half of the AES-256 key expansion (other half same as AES-128)
60*/
61BOTAN_FN_ISA_AESNI __m128i aes_256_key_expansion(__m128i key, __m128i key2) {
62 __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
63 key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2, 2, 2, 2));
64
65 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
66 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
67 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
68 return _mm_xor_si128(key, key_with_rcon);
69}
70
71BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void keyxor(
72 SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
73 B0 ^= K;
74 B1 ^= K;
75 B2 ^= K;
76 B3 ^= K;
77}
78
79BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesenc(SIMD_4x32 K, SIMD_4x32& B) {
80 B = SIMD_4x32(_mm_aesenc_si128(B.raw(), K.raw()));
81}
82
83BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesenc(
84 SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
85 B0 = SIMD_4x32(_mm_aesenc_si128(B0.raw(), K.raw()));
86 B1 = SIMD_4x32(_mm_aesenc_si128(B1.raw(), K.raw()));
87 B2 = SIMD_4x32(_mm_aesenc_si128(B2.raw(), K.raw()));
88 B3 = SIMD_4x32(_mm_aesenc_si128(B3.raw(), K.raw()));
89}
90
91BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesenclast(SIMD_4x32 K, SIMD_4x32& B) {
92 B = SIMD_4x32(_mm_aesenclast_si128(B.raw(), K.raw()));
93}
94
95BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesenclast(
96 SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
97 B0 = SIMD_4x32(_mm_aesenclast_si128(B0.raw(), K.raw()));
98 B1 = SIMD_4x32(_mm_aesenclast_si128(B1.raw(), K.raw()));
99 B2 = SIMD_4x32(_mm_aesenclast_si128(B2.raw(), K.raw()));
100 B3 = SIMD_4x32(_mm_aesenclast_si128(B3.raw(), K.raw()));
101}
102
103BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesdec(SIMD_4x32 K, SIMD_4x32& B) {
104 B = SIMD_4x32(_mm_aesdec_si128(B.raw(), K.raw()));
105}
106
107BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesdec(
108 SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
109 B0 = SIMD_4x32(_mm_aesdec_si128(B0.raw(), K.raw()));
110 B1 = SIMD_4x32(_mm_aesdec_si128(B1.raw(), K.raw()));
111 B2 = SIMD_4x32(_mm_aesdec_si128(B2.raw(), K.raw()));
112 B3 = SIMD_4x32(_mm_aesdec_si128(B3.raw(), K.raw()));
113}
114
115BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesdeclast(SIMD_4x32 K, SIMD_4x32& B) {
116 B = SIMD_4x32(_mm_aesdeclast_si128(B.raw(), K.raw()));
117}
118
119BOTAN_FORCE_INLINE BOTAN_FN_ISA_AESNI void aesdeclast(
120 SIMD_4x32 K, SIMD_4x32& B0, SIMD_4x32& B1, SIMD_4x32& B2, SIMD_4x32& B3) {
121 B0 = SIMD_4x32(_mm_aesdeclast_si128(B0.raw(), K.raw()));
122 B1 = SIMD_4x32(_mm_aesdeclast_si128(B1.raw(), K.raw()));
123 B2 = SIMD_4x32(_mm_aesdeclast_si128(B2.raw(), K.raw()));
124 B3 = SIMD_4x32(_mm_aesdeclast_si128(B3.raw(), K.raw()));
125}
126
127// NOLINTEND(portability-simd-intrinsics)
128
129} // namespace
130
131/*
132* AES-128 Encryption
133*/
134BOTAN_FN_ISA_AESNI void AES_128::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
135 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4 * 0]);
136 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4 * 1]);
137 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4 * 2]);
138 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4 * 3]);
139 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4 * 4]);
140 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4 * 5]);
141 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4 * 6]);
142 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4 * 7]);
143 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4 * 8]);
144 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4 * 9]);
145 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4 * 10]);
146
147 while(blocks >= 4) {
148 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
149 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
150 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
151 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
152
153 keyxor(K0, B0, B1, B2, B3);
154 aesenc(K1, B0, B1, B2, B3);
155 aesenc(K2, B0, B1, B2, B3);
156 aesenc(K3, B0, B1, B2, B3);
157 aesenc(K4, B0, B1, B2, B3);
158 aesenc(K5, B0, B1, B2, B3);
159 aesenc(K6, B0, B1, B2, B3);
160 aesenc(K7, B0, B1, B2, B3);
161 aesenc(K8, B0, B1, B2, B3);
162 aesenc(K9, B0, B1, B2, B3);
163 aesenclast(K10, B0, B1, B2, B3);
164
165 B0.store_le(out + 16 * 0);
166 B1.store_le(out + 16 * 1);
167 B2.store_le(out + 16 * 2);
168 B3.store_le(out + 16 * 3);
169
170 blocks -= 4;
171 in += 4 * 16;
172 out += 4 * 16;
173 }
174
175 for(size_t i = 0; i != blocks; ++i) {
176 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
177
178 B0 ^= K0;
179 aesenc(K1, B0);
180 aesenc(K2, B0);
181 aesenc(K3, B0);
182 aesenc(K4, B0);
183 aesenc(K5, B0);
184 aesenc(K6, B0);
185 aesenc(K7, B0);
186 aesenc(K8, B0);
187 aesenc(K9, B0);
188 aesenclast(K10, B0);
189
190 B0.store_le(out + 16 * i);
191 }
192}
193
194/*
195* AES-128 Decryption
196*/
197BOTAN_FN_ISA_AESNI void AES_128::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
198 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4 * 0]);
199 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4 * 1]);
200 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4 * 2]);
201 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4 * 3]);
202 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4 * 4]);
203 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4 * 5]);
204 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4 * 6]);
205 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4 * 7]);
206 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4 * 8]);
207 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4 * 9]);
208 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4 * 10]);
209
210 while(blocks >= 4) {
211 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
212 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
213 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
214 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
215
216 keyxor(K0, B0, B1, B2, B3);
217 aesdec(K1, B0, B1, B2, B3);
218 aesdec(K2, B0, B1, B2, B3);
219 aesdec(K3, B0, B1, B2, B3);
220 aesdec(K4, B0, B1, B2, B3);
221 aesdec(K5, B0, B1, B2, B3);
222 aesdec(K6, B0, B1, B2, B3);
223 aesdec(K7, B0, B1, B2, B3);
224 aesdec(K8, B0, B1, B2, B3);
225 aesdec(K9, B0, B1, B2, B3);
226 aesdeclast(K10, B0, B1, B2, B3);
227
228 B0.store_le(out + 16 * 0);
229 B1.store_le(out + 16 * 1);
230 B2.store_le(out + 16 * 2);
231 B3.store_le(out + 16 * 3);
232
233 blocks -= 4;
234 in += 4 * 16;
235 out += 4 * 16;
236 }
237
238 for(size_t i = 0; i != blocks; ++i) {
239 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
240
241 B0 ^= K0;
242 aesdec(K1, B0);
243 aesdec(K2, B0);
244 aesdec(K3, B0);
245 aesdec(K4, B0);
246 aesdec(K5, B0);
247 aesdec(K6, B0);
248 aesdec(K7, B0);
249 aesdec(K8, B0);
250 aesdec(K9, B0);
251 aesdeclast(K10, B0);
252
253 B0.store_le(out + 16 * i);
254 }
255}
256
/*
* AES-128 Key Schedule
*/
BOTAN_FN_ISA_AESNI void AES_128::aesni_key_schedule(const uint8_t key[], size_t /*length*/) {
   // 11 round keys of 4 words each
   m_EK.resize(44);
   m_DK.resize(44);

   // NOLINTBEGIN(portability-simd-intrinsics) TODO convert to using SIMD_4x32

   // Round key 0 is the key itself; each later key is derived from the
   // previous via AESKEYGENASSIST, with the round constant doubling in
   // GF(2^8) per step (0x80 * 2 = 0x1B mod the field polynomial)
   const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
   const __m128i K1 = aes_128_key_expansion<0x01>(K0, K0);
   const __m128i K2 = aes_128_key_expansion<0x02>(K1, K1);
   const __m128i K3 = aes_128_key_expansion<0x04>(K2, K2);
   const __m128i K4 = aes_128_key_expansion<0x08>(K3, K3);
   const __m128i K5 = aes_128_key_expansion<0x10>(K4, K4);
   const __m128i K6 = aes_128_key_expansion<0x20>(K5, K5);
   const __m128i K7 = aes_128_key_expansion<0x40>(K6, K6);
   const __m128i K8 = aes_128_key_expansion<0x80>(K7, K7);
   const __m128i K9 = aes_128_key_expansion<0x1B>(K8, K8);
   const __m128i K10 = aes_128_key_expansion<0x36>(K9, K9);

   __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
   _mm_storeu_si128(EK_mm, K0);
   _mm_storeu_si128(EK_mm + 1, K1);
   _mm_storeu_si128(EK_mm + 2, K2);
   _mm_storeu_si128(EK_mm + 3, K3);
   _mm_storeu_si128(EK_mm + 4, K4);
   _mm_storeu_si128(EK_mm + 5, K5);
   _mm_storeu_si128(EK_mm + 6, K6);
   _mm_storeu_si128(EK_mm + 7, K7);
   _mm_storeu_si128(EK_mm + 8, K8);
   _mm_storeu_si128(EK_mm + 9, K9);
   _mm_storeu_si128(EK_mm + 10, K10);

   // Now generate decryption keys: the encryption keys in reverse order,
   // with InvMixColumns (AESIMC) applied to all but the first and last,
   // as required for the equivalent inverse cipher
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
   _mm_storeu_si128(DK_mm, K10);
   _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9));
   _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8));
   _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7));
   _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6));
   _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5));
   _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4));
   _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3));
   _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2));
   _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1));
   _mm_storeu_si128(DK_mm + 10, K0);

   // NOLINTEND(portability-simd-intrinsics)
}
308
309/*
310* AES-192 Encryption
311*/
312BOTAN_FN_ISA_AESNI void AES_192::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
313 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4 * 0]);
314 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4 * 1]);
315 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4 * 2]);
316 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4 * 3]);
317 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4 * 4]);
318 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4 * 5]);
319 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4 * 6]);
320 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4 * 7]);
321 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4 * 8]);
322 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4 * 9]);
323 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4 * 10]);
324 const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_EK[4 * 11]);
325 const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_EK[4 * 12]);
326
327 while(blocks >= 4) {
328 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
329 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
330 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
331 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
332
333 keyxor(K0, B0, B1, B2, B3);
334 aesenc(K1, B0, B1, B2, B3);
335 aesenc(K2, B0, B1, B2, B3);
336 aesenc(K3, B0, B1, B2, B3);
337 aesenc(K4, B0, B1, B2, B3);
338 aesenc(K5, B0, B1, B2, B3);
339 aesenc(K6, B0, B1, B2, B3);
340 aesenc(K7, B0, B1, B2, B3);
341 aesenc(K8, B0, B1, B2, B3);
342 aesenc(K9, B0, B1, B2, B3);
343 aesenc(K10, B0, B1, B2, B3);
344 aesenc(K11, B0, B1, B2, B3);
345 aesenclast(K12, B0, B1, B2, B3);
346
347 B0.store_le(out + 16 * 0);
348 B1.store_le(out + 16 * 1);
349 B2.store_le(out + 16 * 2);
350 B3.store_le(out + 16 * 3);
351
352 blocks -= 4;
353 in += 4 * 16;
354 out += 4 * 16;
355 }
356
357 for(size_t i = 0; i != blocks; ++i) {
358 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
359
360 B0 ^= K0;
361
362 aesenc(K1, B0);
363 aesenc(K2, B0);
364 aesenc(K3, B0);
365 aesenc(K4, B0);
366 aesenc(K5, B0);
367 aesenc(K6, B0);
368 aesenc(K7, B0);
369 aesenc(K8, B0);
370 aesenc(K9, B0);
371 aesenc(K10, B0);
372 aesenc(K11, B0);
373 aesenclast(K12, B0);
374
375 B0.store_le(out + 16 * i);
376 }
377}
378
379/*
380* AES-192 Decryption
381*/
382BOTAN_FN_ISA_AESNI void AES_192::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
383 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4 * 0]);
384 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4 * 1]);
385 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4 * 2]);
386 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4 * 3]);
387 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4 * 4]);
388 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4 * 5]);
389 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4 * 6]);
390 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4 * 7]);
391 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4 * 8]);
392 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4 * 9]);
393 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4 * 10]);
394 const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_DK[4 * 11]);
395 const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_DK[4 * 12]);
396
397 while(blocks >= 4) {
398 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
399 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
400 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
401 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
402
403 keyxor(K0, B0, B1, B2, B3);
404 aesdec(K1, B0, B1, B2, B3);
405 aesdec(K2, B0, B1, B2, B3);
406 aesdec(K3, B0, B1, B2, B3);
407 aesdec(K4, B0, B1, B2, B3);
408 aesdec(K5, B0, B1, B2, B3);
409 aesdec(K6, B0, B1, B2, B3);
410 aesdec(K7, B0, B1, B2, B3);
411 aesdec(K8, B0, B1, B2, B3);
412 aesdec(K9, B0, B1, B2, B3);
413 aesdec(K10, B0, B1, B2, B3);
414 aesdec(K11, B0, B1, B2, B3);
415 aesdeclast(K12, B0, B1, B2, B3);
416
417 B0.store_le(out + 16 * 0);
418 B1.store_le(out + 16 * 1);
419 B2.store_le(out + 16 * 2);
420 B3.store_le(out + 16 * 3);
421
422 blocks -= 4;
423 in += 4 * 16;
424 out += 4 * 16;
425 }
426
427 for(size_t i = 0; i != blocks; ++i) {
428 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
429
430 B0 ^= K0;
431
432 aesdec(K1, B0);
433 aesdec(K2, B0);
434 aesdec(K3, B0);
435 aesdec(K4, B0);
436 aesdec(K5, B0);
437 aesdec(K6, B0);
438 aesdec(K7, B0);
439 aesdec(K8, B0);
440 aesdec(K9, B0);
441 aesdec(K10, B0);
442 aesdec(K11, B0);
443 aesdeclast(K12, B0);
444
445 B0.store_le(out + 16 * i);
446 }
447}
448
/*
* AES-192 Key Schedule
*/
BOTAN_FN_ISA_AESNI void AES_192::aesni_key_schedule(const uint8_t key[], size_t /*length*/) {
   // 13 round keys of 4 words each (52 words total)
   m_EK.resize(52);
   m_DK.resize(52);

   // NOLINTBEGIN(portability-simd-intrinsics) TODO convert to using SIMD_4x32

   // K0 = key words 0-3; for K1, load bytes 8..23 of the 24-byte key and
   // shift right so the low 64 bits hold key words 4-5 (high half zero)
   __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
   __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8));
   K1 = _mm_srli_si128(K1, 8);

   // The first six words of the schedule are the key itself
   load_le(m_EK.data(), key, 6);

   // Each call appends six more words (only four on the final call)
   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x01), m_EK, 6);
   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x02), m_EK, 12);
   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x04), m_EK, 18);
   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x08), m_EK, 24);
   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x10), m_EK, 30);
   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x20), m_EK, 36);
   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x40), m_EK, 42);
   aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128(K1, 0x80), m_EK, 48);

   // Now generate decryption keys: the encryption keys in reverse order,
   // with InvMixColumns (AESIMC) applied to all but the first and last
   const __m128i* EK_mm = reinterpret_cast<const __m128i*>(m_EK.data());

   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
   _mm_storeu_si128(DK_mm, _mm_loadu_si128(EK_mm + 12));
   _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
   _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
   _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
   _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
   _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
   _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
   _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
   _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
   _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
   _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
   _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
   _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));

   // NOLINTEND(portability-simd-intrinsics)
}
493
494/*
495* AES-256 Encryption
496*/
497BOTAN_FN_ISA_AESNI void AES_256::hw_aes_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
498 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_EK[4 * 0]);
499 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_EK[4 * 1]);
500 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_EK[4 * 2]);
501 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_EK[4 * 3]);
502 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_EK[4 * 4]);
503 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_EK[4 * 5]);
504 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_EK[4 * 6]);
505 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_EK[4 * 7]);
506 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_EK[4 * 8]);
507 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_EK[4 * 9]);
508 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_EK[4 * 10]);
509 const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_EK[4 * 11]);
510 const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_EK[4 * 12]);
511 const SIMD_4x32 K13 = SIMD_4x32::load_le(&m_EK[4 * 13]);
512 const SIMD_4x32 K14 = SIMD_4x32::load_le(&m_EK[4 * 14]);
513
514 while(blocks >= 4) {
515 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
516 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
517 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
518 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
519
520 keyxor(K0, B0, B1, B2, B3);
521 aesenc(K1, B0, B1, B2, B3);
522 aesenc(K2, B0, B1, B2, B3);
523 aesenc(K3, B0, B1, B2, B3);
524 aesenc(K4, B0, B1, B2, B3);
525 aesenc(K5, B0, B1, B2, B3);
526 aesenc(K6, B0, B1, B2, B3);
527 aesenc(K7, B0, B1, B2, B3);
528 aesenc(K8, B0, B1, B2, B3);
529 aesenc(K9, B0, B1, B2, B3);
530 aesenc(K10, B0, B1, B2, B3);
531 aesenc(K11, B0, B1, B2, B3);
532 aesenc(K12, B0, B1, B2, B3);
533 aesenc(K13, B0, B1, B2, B3);
534 aesenclast(K14, B0, B1, B2, B3);
535
536 B0.store_le(out + 16 * 0);
537 B1.store_le(out + 16 * 1);
538 B2.store_le(out + 16 * 2);
539 B3.store_le(out + 16 * 3);
540
541 blocks -= 4;
542 in += 4 * 16;
543 out += 4 * 16;
544 }
545
546 for(size_t i = 0; i != blocks; ++i) {
547 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
548
549 B0 ^= K0;
550
551 aesenc(K1, B0);
552 aesenc(K2, B0);
553 aesenc(K3, B0);
554 aesenc(K4, B0);
555 aesenc(K5, B0);
556 aesenc(K6, B0);
557 aesenc(K7, B0);
558 aesenc(K8, B0);
559 aesenc(K9, B0);
560 aesenc(K10, B0);
561 aesenc(K11, B0);
562 aesenc(K12, B0);
563 aesenc(K13, B0);
564 aesenclast(K14, B0);
565
566 B0.store_le(out + 16 * i);
567 }
568}
569
570/*
571* AES-256 Decryption
572*/
573BOTAN_FN_ISA_AESNI void AES_256::hw_aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
574 const SIMD_4x32 K0 = SIMD_4x32::load_le(&m_DK[4 * 0]);
575 const SIMD_4x32 K1 = SIMD_4x32::load_le(&m_DK[4 * 1]);
576 const SIMD_4x32 K2 = SIMD_4x32::load_le(&m_DK[4 * 2]);
577 const SIMD_4x32 K3 = SIMD_4x32::load_le(&m_DK[4 * 3]);
578 const SIMD_4x32 K4 = SIMD_4x32::load_le(&m_DK[4 * 4]);
579 const SIMD_4x32 K5 = SIMD_4x32::load_le(&m_DK[4 * 5]);
580 const SIMD_4x32 K6 = SIMD_4x32::load_le(&m_DK[4 * 6]);
581 const SIMD_4x32 K7 = SIMD_4x32::load_le(&m_DK[4 * 7]);
582 const SIMD_4x32 K8 = SIMD_4x32::load_le(&m_DK[4 * 8]);
583 const SIMD_4x32 K9 = SIMD_4x32::load_le(&m_DK[4 * 9]);
584 const SIMD_4x32 K10 = SIMD_4x32::load_le(&m_DK[4 * 10]);
585 const SIMD_4x32 K11 = SIMD_4x32::load_le(&m_DK[4 * 11]);
586 const SIMD_4x32 K12 = SIMD_4x32::load_le(&m_DK[4 * 12]);
587 const SIMD_4x32 K13 = SIMD_4x32::load_le(&m_DK[4 * 13]);
588 const SIMD_4x32 K14 = SIMD_4x32::load_le(&m_DK[4 * 14]);
589
590 while(blocks >= 4) {
591 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * 0);
592 SIMD_4x32 B1 = SIMD_4x32::load_le(in + 16 * 1);
593 SIMD_4x32 B2 = SIMD_4x32::load_le(in + 16 * 2);
594 SIMD_4x32 B3 = SIMD_4x32::load_le(in + 16 * 3);
595
596 keyxor(K0, B0, B1, B2, B3);
597 aesdec(K1, B0, B1, B2, B3);
598 aesdec(K2, B0, B1, B2, B3);
599 aesdec(K3, B0, B1, B2, B3);
600 aesdec(K4, B0, B1, B2, B3);
601 aesdec(K5, B0, B1, B2, B3);
602 aesdec(K6, B0, B1, B2, B3);
603 aesdec(K7, B0, B1, B2, B3);
604 aesdec(K8, B0, B1, B2, B3);
605 aesdec(K9, B0, B1, B2, B3);
606 aesdec(K10, B0, B1, B2, B3);
607 aesdec(K11, B0, B1, B2, B3);
608 aesdec(K12, B0, B1, B2, B3);
609 aesdec(K13, B0, B1, B2, B3);
610 aesdeclast(K14, B0, B1, B2, B3);
611
612 B0.store_le(out + 16 * 0);
613 B1.store_le(out + 16 * 1);
614 B2.store_le(out + 16 * 2);
615 B3.store_le(out + 16 * 3);
616
617 blocks -= 4;
618 in += 4 * 16;
619 out += 4 * 16;
620 }
621
622 for(size_t i = 0; i != blocks; ++i) {
623 SIMD_4x32 B0 = SIMD_4x32::load_le(in + 16 * i);
624
625 B0 ^= K0;
626
627 aesdec(K1, B0);
628 aesdec(K2, B0);
629 aesdec(K3, B0);
630 aesdec(K4, B0);
631 aesdec(K5, B0);
632 aesdec(K6, B0);
633 aesdec(K7, B0);
634 aesdec(K8, B0);
635 aesdec(K9, B0);
636 aesdec(K10, B0);
637 aesdec(K11, B0);
638 aesdec(K12, B0);
639 aesdec(K13, B0);
640 aesdeclast(K14, B0);
641
642 B0.store_le(out + 16 * i);
643 }
644}
645
/*
* AES-256 Key Schedule
*/
BOTAN_FN_ISA_AESNI void AES_256::aesni_key_schedule(const uint8_t key[], size_t /*length*/) {
   // 15 round keys of 4 words each
   m_EK.resize(60);
   m_DK.resize(60);

   // NOLINTBEGIN(portability-simd-intrinsics) TODO convert to using SIMD_4x32

   // The first two round keys are the 256-bit key itself
   const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
   const __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16));

   // Even-numbered round keys use the AES-128 step (RotWord + SubWord +
   // round constant), odd-numbered ones the AES-256 step (SubWord only)
   const __m128i K2 = aes_128_key_expansion<0x01>(K0, K1);
   const __m128i K3 = aes_256_key_expansion(K1, K2);

   const __m128i K4 = aes_128_key_expansion<0x02>(K2, K3);
   const __m128i K5 = aes_256_key_expansion(K3, K4);

   const __m128i K6 = aes_128_key_expansion<0x04>(K4, K5);
   const __m128i K7 = aes_256_key_expansion(K5, K6);

   const __m128i K8 = aes_128_key_expansion<0x08>(K6, K7);
   const __m128i K9 = aes_256_key_expansion(K7, K8);

   const __m128i K10 = aes_128_key_expansion<0x10>(K8, K9);
   const __m128i K11 = aes_256_key_expansion(K9, K10);

   const __m128i K12 = aes_128_key_expansion<0x20>(K10, K11);
   const __m128i K13 = aes_256_key_expansion(K11, K12);

   const __m128i K14 = aes_128_key_expansion<0x40>(K12, K13);

   __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
   _mm_storeu_si128(EK_mm, K0);
   _mm_storeu_si128(EK_mm + 1, K1);
   _mm_storeu_si128(EK_mm + 2, K2);
   _mm_storeu_si128(EK_mm + 3, K3);
   _mm_storeu_si128(EK_mm + 4, K4);
   _mm_storeu_si128(EK_mm + 5, K5);
   _mm_storeu_si128(EK_mm + 6, K6);
   _mm_storeu_si128(EK_mm + 7, K7);
   _mm_storeu_si128(EK_mm + 8, K8);
   _mm_storeu_si128(EK_mm + 9, K9);
   _mm_storeu_si128(EK_mm + 10, K10);
   _mm_storeu_si128(EK_mm + 11, K11);
   _mm_storeu_si128(EK_mm + 12, K12);
   _mm_storeu_si128(EK_mm + 13, K13);
   _mm_storeu_si128(EK_mm + 14, K14);

   // Now generate decryption keys: the encryption keys in reverse order,
   // with InvMixColumns (AESIMC) applied to all but the first and last
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
   _mm_storeu_si128(DK_mm, K14);
   _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13));
   _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12));
   _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11));
   _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10));
   _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9));
   _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8));
   _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7));
   _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6));
   _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5));
   _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
   _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
   _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
   _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
   _mm_storeu_si128(DK_mm + 14, K0);

   // NOLINTEND(portability-simd-intrinsics)
}
715
716} // namespace Botan
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 load_le(const void *in) noexcept
Definition simd_4x32.h:162
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
constexpr uint32_t K1
Definition sha1_f.h:16
constexpr uint32_t K4
Definition sha1_f.h:19
constexpr uint32_t K3
Definition sha1_f.h:18
constexpr uint32_t K2
Definition sha1_f.h:17
constexpr auto load_le(ParamTs &&... params)
Definition loadstor.h:495
std::vector< T, secure_allocator< T > > secure_vector
Definition secmem.h:68