Botan 3.6.1
Crypto and TLS for C++
aes_vperm.cpp
Go to the documentation of this file.
1/*
2* AES using vector permutes (SSSE3, NEON)
3* (C) 2010,2016,2019 Jack Lloyd
4*
5* Based on public domain x86-64 assembly written by Mike Hamburg,
6* described in "Accelerating AES with Vector Permute Instructions"
7* (CHES 2009). His original code is available at
8* https://crypto.stanford.edu/vpaes/
9*
10* Botan is released under the Simplified BSD License (see license.txt)
11*/
12
13#include <botan/internal/aes.h>
14
15#include <botan/internal/ct_utils.h>
16#include <botan/internal/simd_32.h>
17
18#if defined(BOTAN_SIMD_USE_SSE2)
19 #include <tmmintrin.h>
20#endif
21
22namespace Botan {
23
24namespace {
25
26inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) shuffle(SIMD_4x32 a, SIMD_4x32 b) {
27#if defined(BOTAN_SIMD_USE_SSE2)
28 return SIMD_4x32(_mm_shuffle_epi8(a.raw(), b.raw()));
29#elif defined(BOTAN_SIMD_USE_NEON)
30 const uint8x16_t tbl = vreinterpretq_u8_u32(a.raw());
31 const uint8x16_t idx = vreinterpretq_u8_u32(b.raw());
32
33 #if defined(BOTAN_TARGET_ARCH_IS_ARM32)
34 const uint8x8x2_t tbl2 = {vget_low_u8(tbl), vget_high_u8(tbl)};
35
36 return SIMD_4x32(
37 vreinterpretq_u32_u8(vcombine_u8(vtbl2_u8(tbl2, vget_low_u8(idx)), vtbl2_u8(tbl2, vget_high_u8(idx)))));
38
39 #else
40 return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(tbl, idx)));
41 #endif
42
43#elif defined(BOTAN_SIMD_USE_ALTIVEC)
44
45 const auto zero = vec_splat_s8(0x00);
46 const auto mask = vec_cmplt(reinterpret_cast<__vector signed char>(b.raw()), zero);
47 const auto r = vec_perm(reinterpret_cast<__vector signed char>(a.raw()),
48 reinterpret_cast<__vector signed char>(a.raw()),
49 reinterpret_cast<__vector unsigned char>(b.raw()));
50 return SIMD_4x32(reinterpret_cast<__vector unsigned int>(vec_sel(r, zero, mask)));
51
52#else
53 #error "No shuffle implementation available"
54#endif
55}
56
57inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) alignr8(SIMD_4x32 a, SIMD_4x32 b) {
58#if defined(BOTAN_SIMD_USE_SSE2)
59 return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), 8));
60#elif defined(BOTAN_SIMD_USE_NEON)
61 return SIMD_4x32(vextq_u32(b.raw(), a.raw(), 2));
62#elif defined(BOTAN_SIMD_USE_ALTIVEC)
63 const __vector unsigned char mask = {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
64 return SIMD_4x32(vec_perm(b.raw(), a.raw(), mask));
65#else
66 #error "No alignr8 implementation available"
67#endif
68}
69
70const SIMD_4x32 k_ipt1 = SIMD_4x32(0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090);
71const SIMD_4x32 k_ipt2 = SIMD_4x32(0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC);
72
73const SIMD_4x32 k_inv1 = SIMD_4x32(0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309);
74const SIMD_4x32 k_inv2 = SIMD_4x32(0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C);
75
76const SIMD_4x32 sb1u = SIMD_4x32(0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E);
77const SIMD_4x32 sb1t = SIMD_4x32(0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1);
78const SIMD_4x32 sbou = SIMD_4x32(0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A);
79const SIMD_4x32 sbot = SIMD_4x32(0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1);
80
81const SIMD_4x32 sboud = SIMD_4x32(0x7EF94000, 0x1387EA53, 0xD4943E2D, 0xC7AA6DB9);
82const SIMD_4x32 sbotd = SIMD_4x32(0x93441D00, 0x12D7560F, 0xD8C58E9C, 0xCA4B8159);
83
84const SIMD_4x32 mc_forward[4] = {SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
85 SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
86 SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
87 SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09)};
88
89const SIMD_4x32 vperm_sr[4] = {
90 SIMD_4x32(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C),
91 SIMD_4x32(0x0F0A0500, 0x030E0904, 0x07020D08, 0x0B06010C),
92 SIMD_4x32(0x0B020900, 0x0F060D04, 0x030A0108, 0x070E050C),
93 SIMD_4x32(0x070A0D00, 0x0B0E0104, 0x0F020508, 0x0306090C),
94};
95
96const SIMD_4x32 rcon[10] = {
97 SIMD_4x32(0x00000070, 0x00000000, 0x00000000, 0x00000000),
98 SIMD_4x32(0x0000002A, 0x00000000, 0x00000000, 0x00000000),
99 SIMD_4x32(0x00000098, 0x00000000, 0x00000000, 0x00000000),
100 SIMD_4x32(0x00000008, 0x00000000, 0x00000000, 0x00000000),
101 SIMD_4x32(0x0000004D, 0x00000000, 0x00000000, 0x00000000),
102 SIMD_4x32(0x0000007C, 0x00000000, 0x00000000, 0x00000000),
103 SIMD_4x32(0x0000007D, 0x00000000, 0x00000000, 0x00000000),
104 SIMD_4x32(0x00000081, 0x00000000, 0x00000000, 0x00000000),
105 SIMD_4x32(0x0000001F, 0x00000000, 0x00000000, 0x00000000),
106 SIMD_4x32(0x00000083, 0x00000000, 0x00000000, 0x00000000),
107};
108
109const SIMD_4x32 sb2u = SIMD_4x32(0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955);
110const SIMD_4x32 sb2t = SIMD_4x32(0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8);
111
112const SIMD_4x32 k_dipt1 = SIMD_4x32(0x0B545F00, 0x0F505B04, 0x114E451A, 0x154A411E);
113const SIMD_4x32 k_dipt2 = SIMD_4x32(0x60056500, 0x86E383E6, 0xF491F194, 0x12771772);
114
115const SIMD_4x32 sb9u = SIMD_4x32(0x9A86D600, 0x851C0353, 0x4F994CC9, 0xCAD51F50);
116const SIMD_4x32 sb9t = SIMD_4x32(0xECD74900, 0xC03B1789, 0xB2FBA565, 0x725E2C9E);
117
118const SIMD_4x32 sbeu = SIMD_4x32(0x26D4D000, 0x46F29296, 0x64B4F6B0, 0x22426004);
119const SIMD_4x32 sbet = SIMD_4x32(0xFFAAC100, 0x0C55A6CD, 0x98593E32, 0x9467F36B);
120
121const SIMD_4x32 sbdu = SIMD_4x32(0xE6B1A200, 0x7D57CCDF, 0x882A4439, 0xF56E9B13);
122const SIMD_4x32 sbdt = SIMD_4x32(0x24C6CB00, 0x3CE2FAF7, 0x15DEEFD3, 0x2931180D);
123
124const SIMD_4x32 sbbu = SIMD_4x32(0x96B44200, 0xD0226492, 0xB0F2D404, 0x602646F6);
125const SIMD_4x32 sbbt = SIMD_4x32(0xCD596700, 0xC19498A6, 0x3255AA6B, 0xF3FF0C3E);
126
127const SIMD_4x32 mcx[4] = {
128 SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09),
129 SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
130 SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
131 SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
132};
133
134const SIMD_4x32 mc_backward[4] = {
135 SIMD_4x32(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F),
136 SIMD_4x32(0x0E0D0C0F, 0x02010003, 0x06050407, 0x0A09080B),
137 SIMD_4x32(0x0A09080B, 0x0E0D0C0F, 0x02010003, 0x06050407),
138 SIMD_4x32(0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x02010003),
139};
140
141const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
142
143inline SIMD_4x32 low_nibs(SIMD_4x32 x) {
144 return lo_nibs_mask & x;
145}
146
147inline SIMD_4x32 high_nibs(SIMD_4x32 x) {
148 return (x.shr<4>() & lo_nibs_mask);
149}
150
151inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_enc_first_round(SIMD_4x32 B, SIMD_4x32 K) {
152 return shuffle(k_ipt1, low_nibs(B)) ^ shuffle(k_ipt2, high_nibs(B)) ^ K;
153}
154
155inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_enc_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) {
156 const SIMD_4x32 Bh = high_nibs(B);
157 SIMD_4x32 Bl = low_nibs(B);
158 const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
159 Bl ^= Bh;
160
161 const SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
162 const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
163
164 const SIMD_4x32 t7 = shuffle(sb1t, t6) ^ shuffle(sb1u, t5) ^ K;
165 const SIMD_4x32 t8 = shuffle(sb2t, t6) ^ shuffle(sb2u, t5) ^ shuffle(t7, mc_forward[r % 4]);
166
167 return shuffle(t8, mc_forward[r % 4]) ^ shuffle(t7, mc_backward[r % 4]) ^ t8;
168}
169
170inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_enc_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) {
171 const SIMD_4x32 Bh = high_nibs(B);
172 SIMD_4x32 Bl = low_nibs(B);
173 const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
174 Bl ^= Bh;
175
176 const SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
177 const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
178
179 return shuffle(shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K, vperm_sr[r % 4]);
180}
181
182inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_dec_first_round(SIMD_4x32 B, SIMD_4x32 K) {
183 return shuffle(k_dipt1, low_nibs(B)) ^ shuffle(k_dipt2, high_nibs(B)) ^ K;
184}
185
186inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_dec_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) {
187 const SIMD_4x32 Bh = high_nibs(B);
188 B = low_nibs(B);
189 const SIMD_4x32 t2 = shuffle(k_inv2, B);
190
191 B ^= Bh;
192
193 const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
194 const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));
195
196 const SIMD_4x32 mc = mcx[(r - 1) % 4];
197
198 const SIMD_4x32 t8 = shuffle(sb9t, t6) ^ shuffle(sb9u, t5) ^ K;
199 const SIMD_4x32 t9 = shuffle(t8, mc) ^ shuffle(sbdu, t5) ^ shuffle(sbdt, t6);
200 const SIMD_4x32 t12 = shuffle(t9, mc) ^ shuffle(sbbu, t5) ^ shuffle(sbbt, t6);
201 return shuffle(t12, mc) ^ shuffle(sbeu, t5) ^ shuffle(sbet, t6);
202}
203
204inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_dec_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) {
205 const uint32_t which_sr = ((((r - 1) << 4) ^ 48) & 48) / 16;
206
207 const SIMD_4x32 Bh = high_nibs(B);
208 B = low_nibs(B);
209 const SIMD_4x32 t2 = shuffle(k_inv2, B);
210
211 B ^= Bh;
212
213 const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
214 const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));
215
216 const SIMD_4x32 x = shuffle(sboud, t5) ^ shuffle(sbotd, t6) ^ K;
217 return shuffle(x, vperm_sr[which_sr]);
218}
219
220void BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
221 vperm_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks, const SIMD_4x32 K[], size_t rounds) {
222 CT::poison(in, blocks * 16);
223
224 const size_t blocks2 = blocks - (blocks % 2);
225
226 for(size_t i = 0; i != blocks2; i += 2) {
227 SIMD_4x32 B0 = SIMD_4x32::load_le(in + i * 16);
228 SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i + 1) * 16);
229
230 B0 = aes_enc_first_round(B0, K[0]);
231 B1 = aes_enc_first_round(B1, K[0]);
232
233 for(size_t r = 1; r != rounds; ++r) {
234 B0 = aes_enc_round(B0, K[r], r);
235 B1 = aes_enc_round(B1, K[r], r);
236 }
237
238 B0 = aes_enc_last_round(B0, K[rounds], rounds);
239 B1 = aes_enc_last_round(B1, K[rounds], rounds);
240
241 B0.store_le(out + i * 16);
242 B1.store_le(out + (i + 1) * 16);
243 }
244
245 for(size_t i = blocks2; i < blocks; ++i) {
246 SIMD_4x32 B = SIMD_4x32::load_le(in + i * 16); // ???
247
248 B = aes_enc_first_round(B, K[0]);
249
250 for(size_t r = 1; r != rounds; ++r) {
251 B = aes_enc_round(B, K[r], r);
252 }
253
254 B = aes_enc_last_round(B, K[rounds], rounds);
255 B.store_le(out + i * 16);
256 }
257
258 CT::unpoison(in, blocks * 16);
259 CT::unpoison(out, blocks * 16);
260}
261
262void BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
263 vperm_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks, const SIMD_4x32 K[], size_t rounds) {
264 CT::poison(in, blocks * 16);
265
266 const size_t blocks2 = blocks - (blocks % 2);
267
268 for(size_t i = 0; i != blocks2; i += 2) {
269 SIMD_4x32 B0 = SIMD_4x32::load_le(in + i * 16);
270 SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i + 1) * 16);
271
272 B0 = aes_dec_first_round(B0, K[0]);
273 B1 = aes_dec_first_round(B1, K[0]);
274
275 for(size_t r = 1; r != rounds; ++r) {
276 B0 = aes_dec_round(B0, K[r], r);
277 B1 = aes_dec_round(B1, K[r], r);
278 }
279
280 B0 = aes_dec_last_round(B0, K[rounds], rounds);
281 B1 = aes_dec_last_round(B1, K[rounds], rounds);
282
283 B0.store_le(out + i * 16);
284 B1.store_le(out + (i + 1) * 16);
285 }
286
287 for(size_t i = blocks2; i < blocks; ++i) {
288 SIMD_4x32 B = SIMD_4x32::load_le(in + i * 16); // ???
289
290 B = aes_dec_first_round(B, K[0]);
291
292 for(size_t r = 1; r != rounds; ++r) {
293 B = aes_dec_round(B, K[r], r);
294 }
295
296 B = aes_dec_last_round(B, K[rounds], rounds);
297 B.store_le(out + i * 16);
298 }
299
300 CT::unpoison(in, blocks * 16);
301 CT::unpoison(out, blocks * 16);
302}
303
304} // namespace
305
306void AES_128::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
307 const SIMD_4x32 K[11] = {
308 SIMD_4x32(&m_EK[4 * 0]),
309 SIMD_4x32(&m_EK[4 * 1]),
310 SIMD_4x32(&m_EK[4 * 2]),
311 SIMD_4x32(&m_EK[4 * 3]),
312 SIMD_4x32(&m_EK[4 * 4]),
313 SIMD_4x32(&m_EK[4 * 5]),
314 SIMD_4x32(&m_EK[4 * 6]),
315 SIMD_4x32(&m_EK[4 * 7]),
316 SIMD_4x32(&m_EK[4 * 8]),
317 SIMD_4x32(&m_EK[4 * 9]),
318 SIMD_4x32(&m_EK[4 * 10]),
319 };
320
321 return vperm_encrypt_blocks(in, out, blocks, K, 10);
322}
323
324void AES_128::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
325 const SIMD_4x32 K[11] = {
326 SIMD_4x32(&m_DK[4 * 0]),
327 SIMD_4x32(&m_DK[4 * 1]),
328 SIMD_4x32(&m_DK[4 * 2]),
329 SIMD_4x32(&m_DK[4 * 3]),
330 SIMD_4x32(&m_DK[4 * 4]),
331 SIMD_4x32(&m_DK[4 * 5]),
332 SIMD_4x32(&m_DK[4 * 6]),
333 SIMD_4x32(&m_DK[4 * 7]),
334 SIMD_4x32(&m_DK[4 * 8]),
335 SIMD_4x32(&m_DK[4 * 9]),
336 SIMD_4x32(&m_DK[4 * 10]),
337 };
338
339 return vperm_decrypt_blocks(in, out, blocks, K, 10);
340}
341
342void AES_192::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
343 const SIMD_4x32 K[13] = {
344 SIMD_4x32(&m_EK[4 * 0]),
345 SIMD_4x32(&m_EK[4 * 1]),
346 SIMD_4x32(&m_EK[4 * 2]),
347 SIMD_4x32(&m_EK[4 * 3]),
348 SIMD_4x32(&m_EK[4 * 4]),
349 SIMD_4x32(&m_EK[4 * 5]),
350 SIMD_4x32(&m_EK[4 * 6]),
351 SIMD_4x32(&m_EK[4 * 7]),
352 SIMD_4x32(&m_EK[4 * 8]),
353 SIMD_4x32(&m_EK[4 * 9]),
354 SIMD_4x32(&m_EK[4 * 10]),
355 SIMD_4x32(&m_EK[4 * 11]),
356 SIMD_4x32(&m_EK[4 * 12]),
357 };
358
359 return vperm_encrypt_blocks(in, out, blocks, K, 12);
360}
361
362void AES_192::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
363 const SIMD_4x32 K[13] = {
364 SIMD_4x32(&m_DK[4 * 0]),
365 SIMD_4x32(&m_DK[4 * 1]),
366 SIMD_4x32(&m_DK[4 * 2]),
367 SIMD_4x32(&m_DK[4 * 3]),
368 SIMD_4x32(&m_DK[4 * 4]),
369 SIMD_4x32(&m_DK[4 * 5]),
370 SIMD_4x32(&m_DK[4 * 6]),
371 SIMD_4x32(&m_DK[4 * 7]),
372 SIMD_4x32(&m_DK[4 * 8]),
373 SIMD_4x32(&m_DK[4 * 9]),
374 SIMD_4x32(&m_DK[4 * 10]),
375 SIMD_4x32(&m_DK[4 * 11]),
376 SIMD_4x32(&m_DK[4 * 12]),
377 };
378
379 return vperm_decrypt_blocks(in, out, blocks, K, 12);
380}
381
382void AES_256::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
383 const SIMD_4x32 K[15] = {
384 SIMD_4x32(&m_EK[4 * 0]),
385 SIMD_4x32(&m_EK[4 * 1]),
386 SIMD_4x32(&m_EK[4 * 2]),
387 SIMD_4x32(&m_EK[4 * 3]),
388 SIMD_4x32(&m_EK[4 * 4]),
389 SIMD_4x32(&m_EK[4 * 5]),
390 SIMD_4x32(&m_EK[4 * 6]),
391 SIMD_4x32(&m_EK[4 * 7]),
392 SIMD_4x32(&m_EK[4 * 8]),
393 SIMD_4x32(&m_EK[4 * 9]),
394 SIMD_4x32(&m_EK[4 * 10]),
395 SIMD_4x32(&m_EK[4 * 11]),
396 SIMD_4x32(&m_EK[4 * 12]),
397 SIMD_4x32(&m_EK[4 * 13]),
398 SIMD_4x32(&m_EK[4 * 14]),
399 };
400
401 return vperm_encrypt_blocks(in, out, blocks, K, 14);
402}
403
404void AES_256::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
405 const SIMD_4x32 K[15] = {
406 SIMD_4x32(&m_DK[4 * 0]),
407 SIMD_4x32(&m_DK[4 * 1]),
408 SIMD_4x32(&m_DK[4 * 2]),
409 SIMD_4x32(&m_DK[4 * 3]),
410 SIMD_4x32(&m_DK[4 * 4]),
411 SIMD_4x32(&m_DK[4 * 5]),
412 SIMD_4x32(&m_DK[4 * 6]),
413 SIMD_4x32(&m_DK[4 * 7]),
414 SIMD_4x32(&m_DK[4 * 8]),
415 SIMD_4x32(&m_DK[4 * 9]),
416 SIMD_4x32(&m_DK[4 * 10]),
417 SIMD_4x32(&m_DK[4 * 11]),
418 SIMD_4x32(&m_DK[4 * 12]),
419 SIMD_4x32(&m_DK[4 * 13]),
420 SIMD_4x32(&m_DK[4 * 14]),
421 };
422
423 return vperm_decrypt_blocks(in, out, blocks, K, 14);
424}
425
426namespace {
427
428inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
429 aes_schedule_transform(SIMD_4x32 input, SIMD_4x32 table_1, SIMD_4x32 table_2) {
430 return shuffle(table_1, low_nibs(input)) ^ shuffle(table_2, high_nibs(input));
431}
432
433SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no) {
434 const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
435
436 SIMD_4x32 t = shuffle(k ^ SIMD_4x32::splat_u8(0x5B), mc_forward0);
437 SIMD_4x32 t2 = t;
438 t = shuffle(t, mc_forward0);
439 t2 = t ^ t2 ^ shuffle(t, mc_forward0);
440 return shuffle(t2, vperm_sr[round_no % 4]);
441}
442
443SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_mangle_dec(SIMD_4x32 k, uint8_t round_no) {
444 const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
445
446 const SIMD_4x32 dsk[8] = {
447 SIMD_4x32(0x7ED9A700, 0xB6116FC8, 0x82255BFC, 0x4AED9334),
448 SIMD_4x32(0x27143300, 0x45765162, 0xE9DAFDCE, 0x8BB89FAC),
449 SIMD_4x32(0xCCA86400, 0x27438FEB, 0xADC90561, 0x4622EE8A),
450 SIMD_4x32(0x4F92DD00, 0x815C13CE, 0xBD602FF2, 0x73AEE13C),
451 SIMD_4x32(0x01C6C700, 0x03C4C502, 0xFA3D3CFB, 0xF83F3EF9),
452 SIMD_4x32(0x38CFF700, 0xEE1921D6, 0x7384BC4B, 0xA5526A9D),
453 SIMD_4x32(0x53732000, 0xE3C390B0, 0x10306343, 0xA080D3F3),
454 SIMD_4x32(0x036982E8, 0xA0CA214B, 0x8CE60D67, 0x2F45AEC4),
455 };
456
457 SIMD_4x32 t = aes_schedule_transform(k, dsk[0], dsk[1]);
458 SIMD_4x32 output = shuffle(t, mc_forward0);
459
460 t = aes_schedule_transform(t, dsk[2], dsk[3]);
461 output = shuffle(t ^ output, mc_forward0);
462
463 t = aes_schedule_transform(t, dsk[4], dsk[5]);
464 output = shuffle(t ^ output, mc_forward0);
465
466 t = aes_schedule_transform(t, dsk[6], dsk[7]);
467 output = shuffle(t ^ output, mc_forward0);
468
469 return shuffle(output, vperm_sr[round_no % 4]);
470}
471
472SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no) {
473 const SIMD_4x32 out_tr1(0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121);
474 const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1);
475
476 k = shuffle(k, vperm_sr[round_no % 4]);
477 k ^= SIMD_4x32::splat_u8(0x5B);
478 return aes_schedule_transform(k, out_tr1, out_tr2);
479}
480
481SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_mangle_last_dec(SIMD_4x32 k) {
482 const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A);
483 const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB);
484
485 k ^= SIMD_4x32::splat_u8(0x5B);
486 return aes_schedule_transform(k, deskew1, deskew2);
487}
488
489SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2) {
490 SIMD_4x32 smeared = input2 ^ input2.shift_elems_left<1>();
491 smeared ^= smeared.shift_elems_left<2>();
492 smeared ^= SIMD_4x32::splat_u8(0x5B);
493
494 const SIMD_4x32 Bh = high_nibs(input1);
495 SIMD_4x32 Bl = low_nibs(input1);
496
497 const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
498
499 Bl ^= Bh;
500
501 SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
502 SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
503
504 return smeared ^ shuffle(sb1u, t5) ^ shuffle(sb1t, t6);
505}
506
507SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_round(SIMD_4x32 rc, SIMD_4x32 input1, SIMD_4x32 input2) {
508 // This byte shuffle is equivalent to alignr<1>(shuffle32(input1, (3,3,3,3)));
509 const SIMD_4x32 shuffle3333_15 = SIMD_4x32::splat(0x0C0F0E0D);
510 return aes_schedule_round(shuffle(input1, shuffle3333_15), input2 ^ rc);
511}
512
513SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) aes_schedule_192_smear(SIMD_4x32 x, SIMD_4x32 y) {
514 const SIMD_4x32 shuffle3332 = SIMD_4x32(0x0B0A0908, 0x0F0E0D0C, 0x0F0E0D0C, 0x0F0E0D0C);
515 const SIMD_4x32 shuffle2000 = SIMD_4x32(0x03020100, 0x03020100, 0x03020100, 0x0B0A0908);
516
517 const SIMD_4x32 zero_top_half(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
518 y &= zero_top_half;
519 return y ^ shuffle(x, shuffle3332) ^ shuffle(y, shuffle2000);
520}
521
522} // namespace
523
524void AES_128::vperm_key_schedule(const uint8_t keyb[], size_t /*unused*/) {
525 m_EK.resize(11 * 4);
526 m_DK.resize(11 * 4);
527
528 SIMD_4x32 key = SIMD_4x32::load_le(keyb);
529
530 shuffle(key, vperm_sr[2]).store_le(&m_DK[4 * 10]);
531
532 key = aes_schedule_transform(key, k_ipt1, k_ipt2);
533 key.store_le(&m_EK[0]);
534
535 for(size_t i = 1; i != 10; ++i) {
536 key = aes_schedule_round(rcon[i - 1], key, key);
537
538 aes_schedule_mangle(key, (12 - i) % 4).store_le(&m_EK[4 * i]);
539
540 aes_schedule_mangle_dec(key, (10 - i) % 4).store_le(&m_DK[4 * (10 - i)]);
541 }
542
543 key = aes_schedule_round(rcon[9], key, key);
544 aes_schedule_mangle_last(key, 2).store_le(&m_EK[4 * 10]);
545 aes_schedule_mangle_last_dec(key).store_le(&m_DK[0]);
546}
547
548void AES_192::vperm_key_schedule(const uint8_t keyb[], size_t /*unused*/) {
549 m_EK.resize(13 * 4);
550 m_DK.resize(13 * 4);
551
552 SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
553 SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 8);
554
555 shuffle(key1, vperm_sr[0]).store_le(&m_DK[12 * 4]);
556
557 key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
558 key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);
559
560 key1.store_le(&m_EK[0]);
561
562 for(size_t i = 0; i != 4; ++i) {
563 // key2 with 8 high bytes masked off
564 SIMD_4x32 t = key2;
565 key2 = aes_schedule_round(rcon[2 * i], key2, key1);
566 const SIMD_4x32 key2t = alignr8(key2, t);
567 aes_schedule_mangle(key2t, (i + 3) % 4).store_le(&m_EK[4 * (3 * i + 1)]);
568 aes_schedule_mangle_dec(key2t, (i + 3) % 4).store_le(&m_DK[4 * (11 - 3 * i)]);
569
570 t = aes_schedule_192_smear(key2, t);
571
572 aes_schedule_mangle(t, (i + 2) % 4).store_le(&m_EK[4 * (3 * i + 2)]);
573 aes_schedule_mangle_dec(t, (i + 2) % 4).store_le(&m_DK[4 * (10 - 3 * i)]);
574
575 key2 = aes_schedule_round(rcon[2 * i + 1], t, key2);
576
577 if(i == 3) {
578 aes_schedule_mangle_last(key2, (i + 1) % 4).store_le(&m_EK[4 * (3 * i + 3)]);
579 aes_schedule_mangle_last_dec(key2).store_le(&m_DK[4 * (9 - 3 * i)]);
580 } else {
581 aes_schedule_mangle(key2, (i + 1) % 4).store_le(&m_EK[4 * (3 * i + 3)]);
582 aes_schedule_mangle_dec(key2, (i + 1) % 4).store_le(&m_DK[4 * (9 - 3 * i)]);
583 }
584
585 key1 = key2;
586 key2 = aes_schedule_192_smear(key2, t);
587 }
588}
589
590void AES_256::vperm_key_schedule(const uint8_t keyb[], size_t /*unused*/) {
591 m_EK.resize(15 * 4);
592 m_DK.resize(15 * 4);
593
594 SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
595 SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 16);
596
597 shuffle(key1, vperm_sr[2]).store_le(&m_DK[4 * 14]);
598
599 key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
600 key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);
601
602 key1.store_le(&m_EK[0]);
603 aes_schedule_mangle(key2, 3).store_le(&m_EK[4]);
604
605 aes_schedule_mangle_dec(key2, 1).store_le(&m_DK[4 * 13]);
606
607 const SIMD_4x32 shuffle3333 = SIMD_4x32::splat(0x0F0E0D0C);
608
609 for(size_t i = 2; i != 14; i += 2) {
610 const SIMD_4x32 k_t = key2;
611 key1 = key2 = aes_schedule_round(rcon[(i / 2) - 1], key2, key1);
612
613 aes_schedule_mangle(key2, i % 4).store_le(&m_EK[4 * i]);
614 aes_schedule_mangle_dec(key2, (i + 2) % 4).store_le(&m_DK[4 * (14 - i)]);
615
616 key2 = aes_schedule_round(shuffle(key2, shuffle3333), k_t);
617
618 aes_schedule_mangle(key2, (i - 1) % 4).store_le(&m_EK[4 * (i + 1)]);
619 aes_schedule_mangle_dec(key2, (i + 1) % 4).store_le(&m_DK[4 * (13 - i)]);
620 }
621
622 key2 = aes_schedule_round(rcon[6], key2, key1);
623
624 aes_schedule_mangle_last(key2, 2).store_le(&m_EK[4 * 14]);
625 aes_schedule_mangle_last_dec(key2).store_le(&m_DK[0]);
626}
627
628} // namespace Botan
static SIMD_4x32 load_le(const void *in) noexcept
Definition simd_32.h:159
static SIMD_4x32 splat_u8(uint8_t B) noexcept
Definition simd_32.h:145
static SIMD_4x32 splat(uint32_t B) noexcept
Definition simd_32.h:132
#define BOTAN_FUNC_ISA(isa)
Definition compiler.h:92
constexpr void unpoison(const T *p, size_t n)
Definition ct_utils.h:64
constexpr void poison(const T *p, size_t n)
Definition ct_utils.h:53
const SIMD_8x32 & b