Botan  2.4.0
Crypto and TLS for C++11
aes_ssse3.cpp
Go to the documentation of this file.
1 /*
2 * AES using SSSE3
3 * (C) 2010,2016 Jack Lloyd
4 *
5 * This is more or less a direct translation of public domain x86-64
6 * assembly written by Mike Hamburg, described in "Accelerating AES
7 * with Vector Permute Instructions" (CHES 2009). His original code is
8 * available at https://crypto.stanford.edu/vpaes/
9 *
10 * Botan is released under the Simplified BSD License (see license.txt)
11 */
12 
13 #include <botan/aes.h>
14 #include <botan/internal/ct_utils.h>
15 #include <tmmintrin.h>
16 
17 namespace Botan {
18 
19 namespace {
20 
21 const __m128i low_nibs = _mm_set1_epi8(0x0F);
22 
23 const __m128i k_ipt1 = _mm_set_epi32(
24  0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);
25 const __m128i k_ipt2 = _mm_set_epi32(
26  0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);
27 
28 const __m128i k_inv1 = _mm_set_epi32(
29  0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);
30 const __m128i k_inv2 = _mm_set_epi32(
31  0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);
32 
33 const __m128i sb1u = _mm_set_epi32(
34  0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);
35 const __m128i sb1t = _mm_set_epi32(
36  0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);
37 
38 const __m128i mc_forward[4] = {
39  _mm_set_epi32(0x0C0F0E0D, 0x080B0A09, 0x04070605, 0x00030201),
40  _mm_set_epi32(0x00030201, 0x0C0F0E0D, 0x080B0A09, 0x04070605),
41  _mm_set_epi32(0x04070605, 0x00030201, 0x0C0F0E0D, 0x080B0A09),
42  _mm_set_epi32(0x080B0A09, 0x04070605, 0x00030201, 0x0C0F0E0D)
43 };
44 
45 const __m128i sr[4] = {
46  _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100),
47  _mm_set_epi32(0x0B06010C, 0x07020D08, 0x030E0904, 0x0F0A0500),
48  _mm_set_epi32(0x070E050C, 0x030A0108, 0x0F060D04, 0x0B020900),
49  _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00),
50 };
51 
52 #define mm_xor3(x, y, z) _mm_xor_si128(x, _mm_xor_si128(y, z))
53 
54 BOTAN_FUNC_ISA("ssse3")
55 __m128i aes_schedule_transform(__m128i input,
56  __m128i table_1,
57  __m128i table_2)
58  {
59  __m128i i_1 = _mm_and_si128(low_nibs, input);
60  __m128i i_2 = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input), 4);
61 
62  return _mm_xor_si128(
63  _mm_shuffle_epi8(table_1, i_1),
64  _mm_shuffle_epi8(table_2, i_2));
65  }
66 
67 BOTAN_FUNC_ISA("ssse3")
68 __m128i aes_schedule_mangle(__m128i k, uint8_t round_no)
69  {
70  __m128i t = _mm_shuffle_epi8(_mm_xor_si128(k, _mm_set1_epi8(0x5B)),
71  mc_forward[0]);
72 
73  __m128i t2 = t;
74 
75  t = _mm_shuffle_epi8(t, mc_forward[0]);
76 
77  t2 = mm_xor3(t2, t, _mm_shuffle_epi8(t, mc_forward[0]));
78 
79  return _mm_shuffle_epi8(t2, sr[round_no % 4]);
80  }
81 
82 BOTAN_FUNC_ISA("ssse3")
83 __m128i aes_schedule_192_smear(__m128i x, __m128i y)
84  {
85  return mm_xor3(y,
86  _mm_shuffle_epi32(x, 0xFE),
87  _mm_shuffle_epi32(y, 0x80));
88  }
89 
90 BOTAN_FUNC_ISA("ssse3")
91 __m128i aes_schedule_mangle_dec(__m128i k, uint8_t round_no)
92  {
93  const __m128i dsk[8] = {
94  _mm_set_epi32(0x4AED9334, 0x82255BFC, 0xB6116FC8, 0x7ED9A700),
95  _mm_set_epi32(0x8BB89FAC, 0xE9DAFDCE, 0x45765162, 0x27143300),
96  _mm_set_epi32(0x4622EE8A, 0xADC90561, 0x27438FEB, 0xCCA86400),
97  _mm_set_epi32(0x73AEE13C, 0xBD602FF2, 0x815C13CE, 0x4F92DD00),
98  _mm_set_epi32(0xF83F3EF9, 0xFA3D3CFB, 0x03C4C502, 0x01C6C700),
99  _mm_set_epi32(0xA5526A9D, 0x7384BC4B, 0xEE1921D6, 0x38CFF700),
100  _mm_set_epi32(0xA080D3F3, 0x10306343, 0xE3C390B0, 0x53732000),
101  _mm_set_epi32(0x2F45AEC4, 0x8CE60D67, 0xA0CA214B, 0x036982E8)
102  };
103 
104  __m128i t = aes_schedule_transform(k, dsk[0], dsk[1]);
105  __m128i output = _mm_shuffle_epi8(t, mc_forward[0]);
106 
107  t = aes_schedule_transform(t, dsk[2], dsk[3]);
108  output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);
109 
110  t = aes_schedule_transform(t, dsk[4], dsk[5]);
111  output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);
112 
113  t = aes_schedule_transform(t, dsk[6], dsk[7]);
114  output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);
115 
116  return _mm_shuffle_epi8(output, sr[round_no % 4]);
117  }
118 
119 BOTAN_FUNC_ISA("ssse3")
120 __m128i aes_schedule_mangle_last(__m128i k, uint8_t round_no)
121  {
122  const __m128i out_tr1 = _mm_set_epi32(
123  0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);
124  const __m128i out_tr2 = _mm_set_epi32(
125  0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);
126 
127  k = _mm_shuffle_epi8(k, sr[round_no % 4]);
128  k = _mm_xor_si128(k, _mm_set1_epi8(0x5B));
129  return aes_schedule_transform(k, out_tr1, out_tr2);
130  }
131 
132 BOTAN_FUNC_ISA("ssse3")
133 __m128i aes_schedule_mangle_last_dec(__m128i k)
134  {
135  const __m128i deskew1 = _mm_set_epi32(
136  0x1DFEB95A, 0x5DBEF91A, 0x07E4A340, 0x47A4E300);
137  const __m128i deskew2 = _mm_set_epi32(
138  0x2841C2AB, 0xF49D1E77, 0x5F36B5DC, 0x83EA6900);
139 
140  k = _mm_xor_si128(k, _mm_set1_epi8(0x5B));
141  return aes_schedule_transform(k, deskew1, deskew2);
142  }
143 
144 BOTAN_FUNC_ISA("ssse3")
145 __m128i aes_schedule_round(__m128i* rcon, __m128i input1, __m128i input2)
146  {
147  if(rcon)
148  {
149  input2 = _mm_xor_si128(_mm_alignr_epi8(_mm_setzero_si128(), *rcon, 15),
150  input2);
151 
152  *rcon = _mm_alignr_epi8(*rcon, *rcon, 15); // next rcon
153 
154  input1 = _mm_shuffle_epi32(input1, 0xFF); // rotate
155  input1 = _mm_alignr_epi8(input1, input1, 1);
156  }
157 
158  __m128i smeared = _mm_xor_si128(input2, _mm_slli_si128(input2, 4));
159  smeared = mm_xor3(smeared, _mm_slli_si128(smeared, 8), _mm_set1_epi8(0x5B));
160 
161  __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input1), 4);
162 
163  input1 = _mm_and_si128(low_nibs, input1);
164 
165  __m128i t2 = _mm_shuffle_epi8(k_inv2, input1);
166 
167  input1 = _mm_xor_si128(input1, t);
168 
169  __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
170  __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, input1));
171 
172  __m128i t5 = _mm_xor_si128(input1, _mm_shuffle_epi8(k_inv1, t3));
173  __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));
174 
175  return mm_xor3(_mm_shuffle_epi8(sb1u, t5),
176  _mm_shuffle_epi8(sb1t, t6),
177  smeared);
178  }
179 
180 BOTAN_FUNC_ISA("ssse3")
181 __m128i aes_ssse3_encrypt(__m128i B, const __m128i* keys, size_t rounds)
182  {
183  const __m128i sb2u = _mm_set_epi32(
184  0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);
185  const __m128i sb2t = _mm_set_epi32(
186  0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);
187 
188  const __m128i sbou = _mm_set_epi32(
189  0x15AABF7A, 0xC502A878, 0xD0D26D17, 0x6FBDC700);
190  const __m128i sbot = _mm_set_epi32(
191  0x8E1E90D1, 0x412B35FA, 0xCFE474A5, 0x5FBB6A00);
192 
193  const __m128i mc_backward[4] = {
194  _mm_set_epi32(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003),
195  _mm_set_epi32(0x0A09080B, 0x06050407, 0x02010003, 0x0E0D0C0F),
196  _mm_set_epi32(0x06050407, 0x02010003, 0x0E0D0C0F, 0x0A09080B),
197  _mm_set_epi32(0x02010003, 0x0E0D0C0F, 0x0A09080B, 0x06050407),
198  };
199 
200  B = mm_xor3(_mm_shuffle_epi8(k_ipt1, _mm_and_si128(low_nibs, B)),
201  _mm_shuffle_epi8(k_ipt2,
202  _mm_srli_epi32(
203  _mm_andnot_si128(low_nibs, B),
204  4)),
205  _mm_loadu_si128(keys));
206 
207  for(size_t r = 1; ; ++r)
208  {
209  const __m128i K = _mm_loadu_si128(keys + r);
210 
211  __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);
212 
213  B = _mm_and_si128(low_nibs, B);
214 
215  __m128i t2 = _mm_shuffle_epi8(k_inv2, B);
216 
217  B = _mm_xor_si128(B, t);
218 
219  __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
220  __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));
221 
222  __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
223  __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));
224 
225  if(r == rounds)
226  {
227  B = _mm_shuffle_epi8(
228  mm_xor3(_mm_shuffle_epi8(sbou, t5),
229  _mm_shuffle_epi8(sbot, t6),
230  K),
231  sr[r % 4]);
232 
233  return B;
234  }
235 
236  __m128i t7 = mm_xor3(_mm_shuffle_epi8(sb1t, t6),
237  _mm_shuffle_epi8(sb1u, t5),
238  K);
239 
240  __m128i t8 = mm_xor3(_mm_shuffle_epi8(sb2t, t6),
241  _mm_shuffle_epi8(sb2u, t5),
242  _mm_shuffle_epi8(t7, mc_forward[r % 4]));
243 
244  B = mm_xor3(_mm_shuffle_epi8(t8, mc_forward[r % 4]),
245  _mm_shuffle_epi8(t7, mc_backward[r % 4]),
246  t8);
247  }
248  }
249 
250 BOTAN_FUNC_ISA("ssse3")
251 __m128i aes_ssse3_decrypt(__m128i B, const __m128i* keys, size_t rounds)
252  {
253  const __m128i k_dipt1 = _mm_set_epi32(
254  0x154A411E, 0x114E451A, 0x0F505B04, 0x0B545F00);
255  const __m128i k_dipt2 = _mm_set_epi32(
256  0x12771772, 0xF491F194, 0x86E383E6, 0x60056500);
257 
258  const __m128i sb9u = _mm_set_epi32(
259  0xCAD51F50, 0x4F994CC9, 0x851C0353, 0x9A86D600);
260  const __m128i sb9t = _mm_set_epi32(
261  0x725E2C9E, 0xB2FBA565, 0xC03B1789, 0xECD74900);
262 
263  const __m128i sbeu = _mm_set_epi32(
264  0x22426004, 0x64B4F6B0, 0x46F29296, 0x26D4D000);
265  const __m128i sbet = _mm_set_epi32(
266  0x9467F36B, 0x98593E32, 0x0C55A6CD, 0xFFAAC100);
267 
268  const __m128i sbdu = _mm_set_epi32(
269  0xF56E9B13, 0x882A4439, 0x7D57CCDF, 0xE6B1A200);
270  const __m128i sbdt = _mm_set_epi32(
271  0x2931180D, 0x15DEEFD3, 0x3CE2FAF7, 0x24C6CB00);
272 
273  const __m128i sbbu = _mm_set_epi32(
274  0x602646F6, 0xB0F2D404, 0xD0226492, 0x96B44200);
275  const __m128i sbbt = _mm_set_epi32(
276  0xF3FF0C3E, 0x3255AA6B, 0xC19498A6, 0xCD596700);
277 
278  __m128i mc = mc_forward[3];
279 
280  __m128i t =
281  _mm_shuffle_epi8(k_dipt2,
282  _mm_srli_epi32(
283  _mm_andnot_si128(low_nibs, B),
284  4));
285 
286  B = mm_xor3(t, _mm_loadu_si128(keys),
287  _mm_shuffle_epi8(k_dipt1, _mm_and_si128(B, low_nibs)));
288 
289  for(size_t r = 1; ; ++r)
290  {
291  const __m128i K = _mm_loadu_si128(keys + r);
292 
293  t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);
294 
295  B = _mm_and_si128(low_nibs, B);
296 
297  __m128i t2 = _mm_shuffle_epi8(k_inv2, B);
298 
299  B = _mm_xor_si128(B, t);
300 
301  __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
302  __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));
303  __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
304  __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));
305 
306  if(r == rounds)
307  {
308  const __m128i sbou = _mm_set_epi32(
309  0xC7AA6DB9, 0xD4943E2D, 0x1387EA53, 0x7EF94000);
310  const __m128i sbot = _mm_set_epi32(
311  0xCA4B8159, 0xD8C58E9C, 0x12D7560F, 0x93441D00);
312 
313  __m128i x = _mm_shuffle_epi8(sbou, t5);
314  __m128i y = _mm_shuffle_epi8(sbot, t6);
315  x = _mm_xor_si128(x, K);
316  x = _mm_xor_si128(x, y);
317 
318  const uint32_t which_sr = ((((rounds - 1) << 4) ^ 48) & 48) / 16;
319  return _mm_shuffle_epi8(x, sr[which_sr]);
320  }
321 
322  __m128i t8 = _mm_xor_si128(_mm_shuffle_epi8(sb9t, t6),
323  _mm_xor_si128(_mm_shuffle_epi8(sb9u, t5), K));
324 
325  __m128i t9 = mm_xor3(_mm_shuffle_epi8(t8, mc),
326  _mm_shuffle_epi8(sbdu, t5),
327  _mm_shuffle_epi8(sbdt, t6));
328 
329  __m128i t12 = _mm_xor_si128(
330  _mm_xor_si128(
331  _mm_shuffle_epi8(t9, mc),
332  _mm_shuffle_epi8(sbbu, t5)),
333  _mm_shuffle_epi8(sbbt, t6));
334 
335  B = _mm_xor_si128(_mm_xor_si128(_mm_shuffle_epi8(t12, mc),
336  _mm_shuffle_epi8(sbeu, t5)),
337  _mm_shuffle_epi8(sbet, t6));
338 
339  mc = _mm_alignr_epi8(mc, mc, 12);
340  }
341  }
342 
343 }
344 
345 /*
346 * AES-128 Encryption
347 */
348 BOTAN_FUNC_ISA("ssse3")
349 void AES_128::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
350  {
351  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
352  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
353 
354  const __m128i* keys = reinterpret_cast<const __m128i*>(m_EK.data());
355 
356  CT::poison(in, blocks * block_size());
357 
358  BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
359  {
360  __m128i B = _mm_loadu_si128(in_mm + i);
361  _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 10));
362  }
363 
364  CT::unpoison(in, blocks * block_size());
365  CT::unpoison(out, blocks * block_size());
366  }
367 
368 /*
369 * AES-128 Decryption
370 */
371 BOTAN_FUNC_ISA("ssse3")
372 void AES_128::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
373  {
374  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
375  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
376 
377  const __m128i* keys = reinterpret_cast<const __m128i*>(m_DK.data());
378 
379  CT::poison(in, blocks * block_size());
380 
381  BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
382  {
383  __m128i B = _mm_loadu_si128(in_mm + i);
384  _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 10));
385  }
386 
387  CT::unpoison(in, blocks * block_size());
388  CT::unpoison(out, blocks * block_size());
389  }
390 
391 /*
392 * AES-128 Key Schedule
393 */
394 BOTAN_FUNC_ISA("ssse3")
395 void AES_128::ssse3_key_schedule(const uint8_t keyb[], size_t)
396  {
397  __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
398  0x1F8391B9, 0xAF9DEEB6);
399 
400  __m128i key = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));
401 
402  m_EK.resize(11*4);
403  m_DK.resize(11*4);
404 
405  __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
406  __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
407 
408  _mm_storeu_si128(DK_mm + 10, _mm_shuffle_epi8(key, sr[2]));
409 
410  key = aes_schedule_transform(key, k_ipt1, k_ipt2);
411 
412  _mm_storeu_si128(EK_mm, key);
413 
414  for(size_t i = 1; i != 10; ++i)
415  {
416  key = aes_schedule_round(&rcon, key, key);
417 
418  _mm_storeu_si128(EK_mm + i,
419  aes_schedule_mangle(key, (12-i) % 4));
420 
421  _mm_storeu_si128(DK_mm + (10-i),
422  aes_schedule_mangle_dec(key, (10-i) % 4));
423  }
424 
425  key = aes_schedule_round(&rcon, key, key);
426  _mm_storeu_si128(EK_mm + 10, aes_schedule_mangle_last(key, 2));
427  _mm_storeu_si128(DK_mm, aes_schedule_mangle_last_dec(key));
428  }
429 
430 /*
431 * AES-192 Encryption
432 */
433 BOTAN_FUNC_ISA("ssse3")
434 void AES_192::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
435  {
436  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
437  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
438 
439  const __m128i* keys = reinterpret_cast<const __m128i*>(m_EK.data());
440 
441  CT::poison(in, blocks * block_size());
442 
443  for(size_t i = 0; i != blocks; ++i)
444  {
445  __m128i B = _mm_loadu_si128(in_mm + i);
446  _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 12));
447  }
448 
449  CT::unpoison(in, blocks * block_size());
450  CT::unpoison(out, blocks * block_size());
451  }
452 
453 /*
454 * AES-192 Decryption
455 */
456 BOTAN_FUNC_ISA("ssse3")
457 void AES_192::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
458  {
459  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
460  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
461 
462  const __m128i* keys = reinterpret_cast<const __m128i*>(m_DK.data());
463 
464  CT::poison(in, blocks * block_size());
465 
466  for(size_t i = 0; i != blocks; ++i)
467  {
468  __m128i B = _mm_loadu_si128(in_mm + i);
469  _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 12));
470  }
471 
472  CT::unpoison(in, blocks * block_size());
473  CT::unpoison(out, blocks * block_size());
474  }
475 
476 /*
477 * AES-192 Key Schedule
478 */
479 BOTAN_FUNC_ISA("ssse3")
480 void AES_192::ssse3_key_schedule(const uint8_t keyb[], size_t)
481  {
482  __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
483  0x1F8391B9, 0xAF9DEEB6);
484 
485  m_EK.resize(13*4);
486  m_DK.resize(13*4);
487 
488  __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
489  __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
490 
491  __m128i key1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));
492  __m128i key2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>((keyb + 8)));
493 
494  _mm_storeu_si128(DK_mm + 12, _mm_shuffle_epi8(key1, sr[0]));
495 
496  key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
497  key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);
498 
499  _mm_storeu_si128(EK_mm + 0, key1);
500 
501  // key2 with 8 high bytes masked off
502  __m128i t = _mm_slli_si128(_mm_srli_si128(key2, 8), 8);
503 
504  for(size_t i = 0; i != 4; ++i)
505  {
506  key2 = aes_schedule_round(&rcon, key2, key1);
507 
508  _mm_storeu_si128(EK_mm + 3*i+1,
509  aes_schedule_mangle(_mm_alignr_epi8(key2, t, 8), (i+3)%4));
510  _mm_storeu_si128(DK_mm + 11-3*i,
511  aes_schedule_mangle_dec(_mm_alignr_epi8(key2, t, 8), (i+3)%4));
512 
513  t = aes_schedule_192_smear(key2, t);
514 
515  _mm_storeu_si128(EK_mm + 3*i+2,
516  aes_schedule_mangle(t, (i+2)%4));
517  _mm_storeu_si128(DK_mm + 10-3*i,
518  aes_schedule_mangle_dec(t, (i+2)%4));
519 
520  key2 = aes_schedule_round(&rcon, t, key2);
521 
522  if(i == 3)
523  {
524  _mm_storeu_si128(EK_mm + 3*i+3,
525  aes_schedule_mangle_last(key2, (i+1)%4));
526  _mm_storeu_si128(DK_mm + 9-3*i,
527  aes_schedule_mangle_last_dec(key2));
528  }
529  else
530  {
531  _mm_storeu_si128(EK_mm + 3*i+3,
532  aes_schedule_mangle(key2, (i+1)%4));
533  _mm_storeu_si128(DK_mm + 9-3*i,
534  aes_schedule_mangle_dec(key2, (i+1)%4));
535  }
536 
537  key1 = key2;
538  key2 = aes_schedule_192_smear(key2,
539  _mm_slli_si128(_mm_srli_si128(t, 8), 8));
540  t = _mm_slli_si128(_mm_srli_si128(key2, 8), 8);
541  }
542  }
543 
544 /*
545 * AES-256 Encryption
546 */
547 BOTAN_FUNC_ISA("ssse3")
548 void AES_256::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
549  {
550  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
551  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
552 
553  const __m128i* keys = reinterpret_cast<const __m128i*>(m_EK.data());
554 
555  CT::poison(in, blocks * block_size());
556 
557  for(size_t i = 0; i != blocks; ++i)
558  {
559  __m128i B = _mm_loadu_si128(in_mm + i);
560  _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 14));
561  }
562 
563  CT::unpoison(in, blocks * block_size());
564  CT::unpoison(out, blocks * block_size());
565  }
566 
567 /*
568 * AES-256 Decryption
569 */
570 BOTAN_FUNC_ISA("ssse3")
571 void AES_256::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
572  {
573  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
574  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
575 
576  const __m128i* keys = reinterpret_cast<const __m128i*>(m_DK.data());
577 
578  CT::poison(in, blocks * block_size());
579 
580  for(size_t i = 0; i != blocks; ++i)
581  {
582  __m128i B = _mm_loadu_si128(in_mm + i);
583  _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 14));
584  }
585 
586  CT::unpoison(in, blocks * block_size());
587  CT::unpoison(out, blocks * block_size());
588  }
589 
590 /*
591 * AES-256 Key Schedule
592 */
593 BOTAN_FUNC_ISA("ssse3")
594 void AES_256::ssse3_key_schedule(const uint8_t keyb[], size_t)
595  {
596  __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
597  0x1F8391B9, 0xAF9DEEB6);
598 
599  m_EK.resize(15*4);
600  m_DK.resize(15*4);
601 
602  __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
603  __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
604 
605  __m128i key1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));
606  __m128i key2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>((keyb + 16)));
607 
608  _mm_storeu_si128(DK_mm + 14, _mm_shuffle_epi8(key1, sr[2]));
609 
610  key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
611  key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);
612 
613  _mm_storeu_si128(EK_mm + 0, key1);
614  _mm_storeu_si128(EK_mm + 1, aes_schedule_mangle(key2, 3));
615 
616  _mm_storeu_si128(DK_mm + 13, aes_schedule_mangle_dec(key2, 1));
617 
618  for(size_t i = 2; i != 14; i += 2)
619  {
620  __m128i k_t = key2;
621  key1 = key2 = aes_schedule_round(&rcon, key2, key1);
622 
623  _mm_storeu_si128(EK_mm + i, aes_schedule_mangle(key2, i % 4));
624  _mm_storeu_si128(DK_mm + (14-i), aes_schedule_mangle_dec(key2, (i+2) % 4));
625 
626  key2 = aes_schedule_round(nullptr, _mm_shuffle_epi32(key2, 0xFF), k_t);
627  _mm_storeu_si128(EK_mm + i + 1, aes_schedule_mangle(key2, (i - 1) % 4));
628  _mm_storeu_si128(DK_mm + (13-i), aes_schedule_mangle_dec(key2, (i+1) % 4));
629  }
630 
631  key2 = aes_schedule_round(&rcon, key2, key1);
632 
633  _mm_storeu_si128(EK_mm + 14, aes_schedule_mangle_last(key2, 2));
634  _mm_storeu_si128(DK_mm + 0, aes_schedule_mangle_last_dec(key2));
635  }
636 
637 }
#define mm_xor3(x, y, z)
Definition: aes_ssse3.cpp:52
void poison(const T *p, size_t n)
Definition: ct_utils.h:46
#define BOTAN_PARALLEL_FOR
Definition: compiler.h:174
#define BOTAN_FUNC_ISA(isa)
Definition: compiler.h:75
Definition: alg_id.cpp:13
void unpoison(const T *p, size_t n)
Definition: ct_utils.h:57