Botan 3.11.1
Crypto and TLS for C++
aes_vperm.cpp
Go to the documentation of this file.
/*
* AES using vector permutes (SSSE3, NEON)
* (C) 2010,2016,2019 Jack Lloyd
*
* Based on public domain x86-64 assembly written by Mike Hamburg,
* described in "Accelerating AES with Vector Permute Instructions"
* (CHES 2009). His original code is available at
* https://crypto.stanford.edu/vpaes/
*
* Botan is released under the Simplified BSD License (see license.txt)
*/
12
13#include <botan/internal/aes.h>
14
15#include <botan/internal/ct_utils.h>
16#include <botan/internal/isa_extn.h>
17#include <botan/internal/simd_4x32.h>
18#include <bit>
19#include <utility>
20
21namespace Botan {
22
23namespace {
24
25inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 shuffle(SIMD_4x32 tbl, SIMD_4x32 idx) {
26 if constexpr(std::endian::native == std::endian::little) {
27 return SIMD_4x32::byte_shuffle(tbl, idx);
28 } else {
29 return SIMD_4x32::byte_shuffle(tbl.bswap(), idx.bswap()).bswap();
30 }
31}
32
33inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 masked_shuffle(SIMD_4x32 tbl, SIMD_4x32 idx) {
34 if constexpr(std::endian::native == std::endian::little) {
35 return SIMD_4x32::masked_byte_shuffle(tbl, idx);
36 } else {
37 return SIMD_4x32::masked_byte_shuffle(tbl.bswap(), idx.bswap()).bswap();
38 }
39}
40
41inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 shiftrows(SIMD_4x32 x, size_t r) {
42 const SIMD_4x32 vperm_sr[4] = {
43 SIMD_4x32(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C),
44 SIMD_4x32(0x0F0A0500, 0x030E0904, 0x07020D08, 0x0B06010C),
45 SIMD_4x32(0x0B020900, 0x0F060D04, 0x030A0108, 0x070E050C),
46 SIMD_4x32(0x070A0D00, 0x0B0E0104, 0x0F020508, 0x0306090C),
47 };
48
49 return shuffle(x, vperm_sr[r]);
50}
51
52inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 low_nibs(SIMD_4x32 x) {
53 const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
54 return lo_nibs_mask & x;
55}
56
57inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 high_nibs(SIMD_4x32 x) {
58 const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
59 return (x.shr<4>() & lo_nibs_mask);
60}
61
62inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_enc_first_round(SIMD_4x32 B, SIMD_4x32 K) {
63 const SIMD_4x32 k_ipt1 = SIMD_4x32(0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090);
64 const SIMD_4x32 k_ipt2 = SIMD_4x32(0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC);
65
66 return shuffle(k_ipt1, low_nibs(B)) ^ shuffle(k_ipt2, high_nibs(B)) ^ K;
67}
68
69BOTAN_FORCE_INLINE BOTAN_FN_ISA_SIMD_4X32 std::pair<SIMD_4x32, SIMD_4x32> aes_decompose_kinv(const SIMD_4x32 B) {
70 const SIMD_4x32 k_inv1 = SIMD_4x32(0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309);
71 const SIMD_4x32 k_inv2 = SIMD_4x32(0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C);
72
73 const SIMD_4x32 Bh = high_nibs(B);
74 SIMD_4x32 Bl = low_nibs(B);
75 const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
76 Bl ^= Bh;
77
78 const SIMD_4x32 t5 = Bl ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
79 const SIMD_4x32 t6 = Bh ^ masked_shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
80
81 return std::make_pair(t5, t6);
82}
83
84inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_enc_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) {
85 const SIMD_4x32 sb2u = SIMD_4x32(0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955);
86 const SIMD_4x32 sb2t = SIMD_4x32(0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8);
87
88 const SIMD_4x32 mc_forward[4] = {SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
89 SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
90 SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
91 SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09)};
92 const SIMD_4x32 mc_backward[4] = {
93 SIMD_4x32(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F),
94 SIMD_4x32(0x0E0D0C0F, 0x02010003, 0x06050407, 0x0A09080B),
95 SIMD_4x32(0x0A09080B, 0x0E0D0C0F, 0x02010003, 0x06050407),
96 SIMD_4x32(0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x02010003),
97 };
98 const SIMD_4x32 sb1u = SIMD_4x32(0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E);
99 const SIMD_4x32 sb1t = SIMD_4x32(0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1);
100
101 const auto [t5, t6] = aes_decompose_kinv(B);
102
103 const SIMD_4x32 t7 = masked_shuffle(sb1t, t6) ^ masked_shuffle(sb1u, t5) ^ K;
104 const SIMD_4x32 t8 = masked_shuffle(sb2t, t6) ^ masked_shuffle(sb2u, t5) ^ shuffle(t7, mc_forward[r % 4]);
105
106 return shuffle(t8, mc_forward[r % 4]) ^ shuffle(t7, mc_backward[r % 4]) ^ t8;
107}
108
109inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_enc_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) {
110 const SIMD_4x32 sbou = SIMD_4x32(0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A);
111 const SIMD_4x32 sbot = SIMD_4x32(0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1);
112
113 const auto [t5, t6] = aes_decompose_kinv(B);
114
115 return shiftrows(masked_shuffle(sbou, t5) ^ masked_shuffle(sbot, t6) ^ K, r % 4);
116}
117
118inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_dec_first_round(SIMD_4x32 B, SIMD_4x32 K) {
119 const SIMD_4x32 k_dipt1 = SIMD_4x32(0x0B545F00, 0x0F505B04, 0x114E451A, 0x154A411E);
120 const SIMD_4x32 k_dipt2 = SIMD_4x32(0x60056500, 0x86E383E6, 0xF491F194, 0x12771772);
121
122 return shuffle(k_dipt1, low_nibs(B)) ^ shuffle(k_dipt2, high_nibs(B)) ^ K;
123}
124
125inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_dec_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) {
126 const SIMD_4x32 mcx[4] = {
127 SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09),
128 SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
129 SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
130 SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
131 };
132
133 const SIMD_4x32 sbbu = SIMD_4x32(0x96B44200, 0xD0226492, 0xB0F2D404, 0x602646F6);
134 const SIMD_4x32 sbbt = SIMD_4x32(0xCD596700, 0xC19498A6, 0x3255AA6B, 0xF3FF0C3E);
135 const SIMD_4x32 sbdu = SIMD_4x32(0xE6B1A200, 0x7D57CCDF, 0x882A4439, 0xF56E9B13);
136 const SIMD_4x32 sbdt = SIMD_4x32(0x24C6CB00, 0x3CE2FAF7, 0x15DEEFD3, 0x2931180D);
137 const SIMD_4x32 sbeu = SIMD_4x32(0x26D4D000, 0x46F29296, 0x64B4F6B0, 0x22426004);
138 const SIMD_4x32 sbet = SIMD_4x32(0xFFAAC100, 0x0C55A6CD, 0x98593E32, 0x9467F36B);
139 const SIMD_4x32 sb9u = SIMD_4x32(0x9A86D600, 0x851C0353, 0x4F994CC9, 0xCAD51F50);
140 const SIMD_4x32 sb9t = SIMD_4x32(0xECD74900, 0xC03B1789, 0xB2FBA565, 0x725E2C9E);
141
142 const auto [t5, t6] = aes_decompose_kinv(B);
143
144 const SIMD_4x32 mc = mcx[(r - 1) % 4];
145
146 const SIMD_4x32 t8 = masked_shuffle(sb9t, t6) ^ masked_shuffle(sb9u, t5) ^ K;
147 const SIMD_4x32 t9 = shuffle(t8, mc) ^ masked_shuffle(sbdu, t5) ^ masked_shuffle(sbdt, t6);
148 const SIMD_4x32 t12 = shuffle(t9, mc) ^ masked_shuffle(sbbu, t5) ^ masked_shuffle(sbbt, t6);
149 return shuffle(t12, mc) ^ masked_shuffle(sbeu, t5) ^ masked_shuffle(sbet, t6);
150}
151
152inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_dec_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r) {
153 const SIMD_4x32 sboud = SIMD_4x32(0x7EF94000, 0x1387EA53, 0xD4943E2D, 0xC7AA6DB9);
154 const SIMD_4x32 sbotd = SIMD_4x32(0x93441D00, 0x12D7560F, 0xD8C58E9C, 0xCA4B8159);
155
156 const uint32_t which_sr = ((((r - 1) << 4) ^ 48) & 48) / 16;
157
158 const auto [t5, t6] = aes_decompose_kinv(B);
159
160 const SIMD_4x32 x = masked_shuffle(sboud, t5) ^ masked_shuffle(sbotd, t6) ^ K;
161 return shiftrows(x, which_sr);
162}
163
164void BOTAN_FN_ISA_SIMD_4X32
165vperm_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks, const SIMD_4x32 K[], size_t rounds) {
166 CT::poison(in, blocks * 16);
167
168 const size_t blocks2 = blocks - (blocks % 2);
169
170 for(size_t i = 0; i != blocks2; i += 2) {
171 SIMD_4x32 B0 = SIMD_4x32::load_le(in + i * 16);
172 SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i + 1) * 16);
173
174 B0 = aes_enc_first_round(B0, K[0]);
175 B1 = aes_enc_first_round(B1, K[0]);
176
177 for(size_t r = 1; r != rounds; ++r) {
178 B0 = aes_enc_round(B0, K[r], r);
179 B1 = aes_enc_round(B1, K[r], r);
180 }
181
182 B0 = aes_enc_last_round(B0, K[rounds], rounds);
183 B1 = aes_enc_last_round(B1, K[rounds], rounds);
184
185 B0.store_le(out + i * 16);
186 B1.store_le(out + (i + 1) * 16);
187 }
188
189 for(size_t i = blocks2; i < blocks; ++i) {
190 SIMD_4x32 B = SIMD_4x32::load_le(in + i * 16); // ???
191
192 B = aes_enc_first_round(B, K[0]);
193
194 for(size_t r = 1; r != rounds; ++r) {
195 B = aes_enc_round(B, K[r], r);
196 }
197
198 B = aes_enc_last_round(B, K[rounds], rounds);
199 B.store_le(out + i * 16);
200 }
201
202 CT::unpoison(in, blocks * 16);
203 CT::unpoison(out, blocks * 16);
204}
205
206void BOTAN_FN_ISA_SIMD_4X32
207vperm_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks, const SIMD_4x32 K[], size_t rounds) {
208 CT::poison(in, blocks * 16);
209
210 const size_t blocks2 = blocks - (blocks % 2);
211
212 for(size_t i = 0; i != blocks2; i += 2) {
213 SIMD_4x32 B0 = SIMD_4x32::load_le(in + i * 16);
214 SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i + 1) * 16);
215
216 B0 = aes_dec_first_round(B0, K[0]);
217 B1 = aes_dec_first_round(B1, K[0]);
218
219 for(size_t r = 1; r != rounds; ++r) {
220 B0 = aes_dec_round(B0, K[r], r);
221 B1 = aes_dec_round(B1, K[r], r);
222 }
223
224 B0 = aes_dec_last_round(B0, K[rounds], rounds);
225 B1 = aes_dec_last_round(B1, K[rounds], rounds);
226
227 B0.store_le(out + i * 16);
228 B1.store_le(out + (i + 1) * 16);
229 }
230
231 for(size_t i = blocks2; i < blocks; ++i) {
232 SIMD_4x32 B = SIMD_4x32::load_le(in + i * 16); // ???
233
234 B = aes_dec_first_round(B, K[0]);
235
236 for(size_t r = 1; r != rounds; ++r) {
237 B = aes_dec_round(B, K[r], r);
238 }
239
240 B = aes_dec_last_round(B, K[rounds], rounds);
241 B.store_le(out + i * 16);
242 }
243
244 CT::unpoison(in, blocks * 16);
245 CT::unpoison(out, blocks * 16);
246}
247
248} // namespace
249
250void BOTAN_FN_ISA_SIMD_4X32 AES_128::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
251 const SIMD_4x32 K[11] = {
252 SIMD_4x32::load_le(&m_EK[4 * 0]),
253 SIMD_4x32::load_le(&m_EK[4 * 1]),
254 SIMD_4x32::load_le(&m_EK[4 * 2]),
255 SIMD_4x32::load_le(&m_EK[4 * 3]),
256 SIMD_4x32::load_le(&m_EK[4 * 4]),
257 SIMD_4x32::load_le(&m_EK[4 * 5]),
258 SIMD_4x32::load_le(&m_EK[4 * 6]),
259 SIMD_4x32::load_le(&m_EK[4 * 7]),
260 SIMD_4x32::load_le(&m_EK[4 * 8]),
261 SIMD_4x32::load_le(&m_EK[4 * 9]),
262 SIMD_4x32::load_le(&m_EK[4 * 10]),
263 };
264
265 return vperm_encrypt_blocks(in, out, blocks, K, 10);
266}
267
268void BOTAN_FN_ISA_SIMD_4X32 AES_128::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
269 const SIMD_4x32 K[11] = {
270 SIMD_4x32::load_le(&m_DK[4 * 0]),
271 SIMD_4x32::load_le(&m_DK[4 * 1]),
272 SIMD_4x32::load_le(&m_DK[4 * 2]),
273 SIMD_4x32::load_le(&m_DK[4 * 3]),
274 SIMD_4x32::load_le(&m_DK[4 * 4]),
275 SIMD_4x32::load_le(&m_DK[4 * 5]),
276 SIMD_4x32::load_le(&m_DK[4 * 6]),
277 SIMD_4x32::load_le(&m_DK[4 * 7]),
278 SIMD_4x32::load_le(&m_DK[4 * 8]),
279 SIMD_4x32::load_le(&m_DK[4 * 9]),
280 SIMD_4x32::load_le(&m_DK[4 * 10]),
281 };
282
283 return vperm_decrypt_blocks(in, out, blocks, K, 10);
284}
285
286void BOTAN_FN_ISA_SIMD_4X32 AES_192::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
287 const SIMD_4x32 K[13] = {
288 SIMD_4x32::load_le(&m_EK[4 * 0]),
289 SIMD_4x32::load_le(&m_EK[4 * 1]),
290 SIMD_4x32::load_le(&m_EK[4 * 2]),
291 SIMD_4x32::load_le(&m_EK[4 * 3]),
292 SIMD_4x32::load_le(&m_EK[4 * 4]),
293 SIMD_4x32::load_le(&m_EK[4 * 5]),
294 SIMD_4x32::load_le(&m_EK[4 * 6]),
295 SIMD_4x32::load_le(&m_EK[4 * 7]),
296 SIMD_4x32::load_le(&m_EK[4 * 8]),
297 SIMD_4x32::load_le(&m_EK[4 * 9]),
298 SIMD_4x32::load_le(&m_EK[4 * 10]),
299 SIMD_4x32::load_le(&m_EK[4 * 11]),
300 SIMD_4x32::load_le(&m_EK[4 * 12]),
301 };
302
303 return vperm_encrypt_blocks(in, out, blocks, K, 12);
304}
305
306void BOTAN_FN_ISA_SIMD_4X32 AES_192::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
307 const SIMD_4x32 K[13] = {
308 SIMD_4x32::load_le(&m_DK[4 * 0]),
309 SIMD_4x32::load_le(&m_DK[4 * 1]),
310 SIMD_4x32::load_le(&m_DK[4 * 2]),
311 SIMD_4x32::load_le(&m_DK[4 * 3]),
312 SIMD_4x32::load_le(&m_DK[4 * 4]),
313 SIMD_4x32::load_le(&m_DK[4 * 5]),
314 SIMD_4x32::load_le(&m_DK[4 * 6]),
315 SIMD_4x32::load_le(&m_DK[4 * 7]),
316 SIMD_4x32::load_le(&m_DK[4 * 8]),
317 SIMD_4x32::load_le(&m_DK[4 * 9]),
318 SIMD_4x32::load_le(&m_DK[4 * 10]),
319 SIMD_4x32::load_le(&m_DK[4 * 11]),
320 SIMD_4x32::load_le(&m_DK[4 * 12]),
321 };
322
323 return vperm_decrypt_blocks(in, out, blocks, K, 12);
324}
325
326void BOTAN_FN_ISA_SIMD_4X32 AES_256::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
327 const SIMD_4x32 K[15] = {
328 SIMD_4x32::load_le(&m_EK[4 * 0]),
329 SIMD_4x32::load_le(&m_EK[4 * 1]),
330 SIMD_4x32::load_le(&m_EK[4 * 2]),
331 SIMD_4x32::load_le(&m_EK[4 * 3]),
332 SIMD_4x32::load_le(&m_EK[4 * 4]),
333 SIMD_4x32::load_le(&m_EK[4 * 5]),
334 SIMD_4x32::load_le(&m_EK[4 * 6]),
335 SIMD_4x32::load_le(&m_EK[4 * 7]),
336 SIMD_4x32::load_le(&m_EK[4 * 8]),
337 SIMD_4x32::load_le(&m_EK[4 * 9]),
338 SIMD_4x32::load_le(&m_EK[4 * 10]),
339 SIMD_4x32::load_le(&m_EK[4 * 11]),
340 SIMD_4x32::load_le(&m_EK[4 * 12]),
341 SIMD_4x32::load_le(&m_EK[4 * 13]),
342 SIMD_4x32::load_le(&m_EK[4 * 14]),
343 };
344
345 return vperm_encrypt_blocks(in, out, blocks, K, 14);
346}
347
348void BOTAN_FN_ISA_SIMD_4X32 AES_256::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const {
349 const SIMD_4x32 K[15] = {
350 SIMD_4x32::load_le(&m_DK[4 * 0]),
351 SIMD_4x32::load_le(&m_DK[4 * 1]),
352 SIMD_4x32::load_le(&m_DK[4 * 2]),
353 SIMD_4x32::load_le(&m_DK[4 * 3]),
354 SIMD_4x32::load_le(&m_DK[4 * 4]),
355 SIMD_4x32::load_le(&m_DK[4 * 5]),
356 SIMD_4x32::load_le(&m_DK[4 * 6]),
357 SIMD_4x32::load_le(&m_DK[4 * 7]),
358 SIMD_4x32::load_le(&m_DK[4 * 8]),
359 SIMD_4x32::load_le(&m_DK[4 * 9]),
360 SIMD_4x32::load_le(&m_DK[4 * 10]),
361 SIMD_4x32::load_le(&m_DK[4 * 11]),
362 SIMD_4x32::load_le(&m_DK[4 * 12]),
363 SIMD_4x32::load_le(&m_DK[4 * 13]),
364 SIMD_4x32::load_le(&m_DK[4 * 14]),
365 };
366
367 return vperm_decrypt_blocks(in, out, blocks, K, 14);
368}
369
370namespace {
371
372inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_transform(SIMD_4x32 input, SIMD_4x32 table_1, SIMD_4x32 table_2) {
373 return shuffle(table_1, low_nibs(input)) ^ shuffle(table_2, high_nibs(input));
374}
375
376inline SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_transform_init(SIMD_4x32 input) {
377 return aes_enc_first_round(input, SIMD_4x32());
378}
379
380SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no) {
381 const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
382
383 SIMD_4x32 t = shuffle(k ^ SIMD_4x32::splat_u8(0x5B), mc_forward0);
384 SIMD_4x32 t2 = t;
385 t = shuffle(t, mc_forward0);
386 t2 = t ^ t2 ^ shuffle(t, mc_forward0);
387 return shiftrows(t2, round_no % 4);
388}
389
390SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_mangle_dec(SIMD_4x32 k, uint8_t round_no) {
391 const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
392
393 const SIMD_4x32 dsk[8] = {
394 SIMD_4x32(0x7ED9A700, 0xB6116FC8, 0x82255BFC, 0x4AED9334),
395 SIMD_4x32(0x27143300, 0x45765162, 0xE9DAFDCE, 0x8BB89FAC),
396 SIMD_4x32(0xCCA86400, 0x27438FEB, 0xADC90561, 0x4622EE8A),
397 SIMD_4x32(0x4F92DD00, 0x815C13CE, 0xBD602FF2, 0x73AEE13C),
398 SIMD_4x32(0x01C6C700, 0x03C4C502, 0xFA3D3CFB, 0xF83F3EF9),
399 SIMD_4x32(0x38CFF700, 0xEE1921D6, 0x7384BC4B, 0xA5526A9D),
400 SIMD_4x32(0x53732000, 0xE3C390B0, 0x10306343, 0xA080D3F3),
401 SIMD_4x32(0x036982E8, 0xA0CA214B, 0x8CE60D67, 0x2F45AEC4),
402 };
403
404 SIMD_4x32 t = aes_schedule_transform(k, dsk[0], dsk[1]);
405 SIMD_4x32 output = shuffle(t, mc_forward0);
406
407 t = aes_schedule_transform(t, dsk[2], dsk[3]);
408 output = shuffle(t ^ output, mc_forward0);
409
410 t = aes_schedule_transform(t, dsk[4], dsk[5]);
411 output = shuffle(t ^ output, mc_forward0);
412
413 t = aes_schedule_transform(t, dsk[6], dsk[7]);
414 output = shuffle(t ^ output, mc_forward0);
415
416 return shiftrows(output, round_no % 4);
417}
418
419SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no) {
420 const SIMD_4x32 out_tr1(0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121);
421 const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1);
422
423 k = shiftrows(k, round_no % 4) ^ SIMD_4x32::splat_u8(0x5B);
424 return aes_schedule_transform(k, out_tr1, out_tr2);
425}
426
427SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_mangle_last_dec(SIMD_4x32 k) {
428 const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A);
429 const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB);
430
431 k ^= SIMD_4x32::splat_u8(0x5B);
432 return aes_schedule_transform(k, deskew1, deskew2);
433}
434
435SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2) {
436 const SIMD_4x32 sb1u = SIMD_4x32(0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E);
437 const SIMD_4x32 sb1t = SIMD_4x32(0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1);
438
439 SIMD_4x32 smeared = input2 ^ input2.shift_elems_left<1>();
440 smeared ^= smeared.shift_elems_left<2>();
441 smeared ^= SIMD_4x32::splat_u8(0x5B);
442
443 const auto [t5, t6] = aes_decompose_kinv(input1);
444
445 return smeared ^ masked_shuffle(sb1u, t5) ^ masked_shuffle(sb1t, t6);
446}
447
448SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_round_rcon(size_t rc, SIMD_4x32 input1, SIMD_4x32 input2) {
449 const SIMD_4x32 rcon[10] = {
450 SIMD_4x32(0x00000070, 0x00000000, 0x00000000, 0x00000000),
451 SIMD_4x32(0x0000002A, 0x00000000, 0x00000000, 0x00000000),
452 SIMD_4x32(0x00000098, 0x00000000, 0x00000000, 0x00000000),
453 SIMD_4x32(0x00000008, 0x00000000, 0x00000000, 0x00000000),
454 SIMD_4x32(0x0000004D, 0x00000000, 0x00000000, 0x00000000),
455 SIMD_4x32(0x0000007C, 0x00000000, 0x00000000, 0x00000000),
456 SIMD_4x32(0x0000007D, 0x00000000, 0x00000000, 0x00000000),
457 SIMD_4x32(0x00000081, 0x00000000, 0x00000000, 0x00000000),
458 SIMD_4x32(0x0000001F, 0x00000000, 0x00000000, 0x00000000),
459 SIMD_4x32(0x00000083, 0x00000000, 0x00000000, 0x00000000),
460 };
461
462 // This byte shuffle is equivalent to alignr<1>(shuffle32(input1, (3,3,3,3)));
463 const SIMD_4x32 shuffle3333_15 = SIMD_4x32::splat(0x0C0F0E0D);
464 return aes_schedule_round(shuffle(input1, shuffle3333_15), input2 ^ rcon[rc]);
465}
466
467SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 aes_schedule_192_smear(SIMD_4x32 x, SIMD_4x32 y) {
468 const SIMD_4x32 shuffle3332 = SIMD_4x32(0x0B0A0908, 0x0F0E0D0C, 0x0F0E0D0C, 0x0F0E0D0C);
469 const SIMD_4x32 shuffle2000 = SIMD_4x32(0x03020100, 0x03020100, 0x03020100, 0x0B0A0908);
470
471 const SIMD_4x32 zero_top_half(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
472 y &= zero_top_half;
473 return y ^ shuffle(x, shuffle3332) ^ shuffle(y, shuffle2000);
474}
475
476} // namespace
477
478// NOLINTBEGIN(readability-container-data-pointer)
479
480void BOTAN_FN_ISA_SIMD_4X32 AES_128::vperm_key_schedule(const uint8_t keyb[], size_t /*unused*/) {
481 m_EK.resize(11 * 4);
482 m_DK.resize(11 * 4);
483
484 SIMD_4x32 key = SIMD_4x32::load_le(keyb);
485
486 shiftrows(key, 2).store_le(&m_DK[4 * 10]);
487
488 key = aes_schedule_transform_init(key);
489 key.store_le(&m_EK[0]);
490
491 for(size_t i = 1; i != 10; ++i) {
492 key = aes_schedule_round_rcon(i - 1, key, key);
493
494 aes_schedule_mangle(key, (12 - i) % 4).store_le(&m_EK[4 * i]);
495
496 aes_schedule_mangle_dec(key, (10 - i) % 4).store_le(&m_DK[4 * (10 - i)]);
497 }
498
499 key = aes_schedule_round_rcon(9, key, key);
500 aes_schedule_mangle_last(key, 2).store_le(&m_EK[4 * 10]);
501 aes_schedule_mangle_last_dec(key).store_le(&m_DK[0]);
502}
503
504void BOTAN_FN_ISA_SIMD_4X32 AES_192::vperm_key_schedule(const uint8_t keyb[], size_t /*unused*/) {
505 m_EK.resize(13 * 4);
506 m_DK.resize(13 * 4);
507
508 SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
509 SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 8);
510
511 shiftrows(key1, 0).store_le(&m_DK[12 * 4]);
512
513 key1 = aes_schedule_transform_init(key1);
514 key2 = aes_schedule_transform_init(key2);
515
516 key1.store_le(&m_EK[0]);
517
518 for(size_t i = 0; i != 4; ++i) {
519 // key2 with 8 high bytes masked off
520 SIMD_4x32 t = key2;
521 key2 = aes_schedule_round_rcon(2 * i, key2, key1);
522 const auto key2t = SIMD_4x32::alignr8(key2, t);
523
524 aes_schedule_mangle(key2t, (i + 3) % 4).store_le(&m_EK[4 * (3 * i + 1)]);
525 aes_schedule_mangle_dec(key2t, (i + 3) % 4).store_le(&m_DK[4 * (11 - 3 * i)]);
526
527 t = aes_schedule_192_smear(key2, t);
528
529 aes_schedule_mangle(t, (i + 2) % 4).store_le(&m_EK[4 * (3 * i + 2)]);
530 aes_schedule_mangle_dec(t, (i + 2) % 4).store_le(&m_DK[4 * (10 - 3 * i)]);
531
532 key2 = aes_schedule_round_rcon(2 * i + 1, t, key2);
533
534 if(i == 3) {
535 aes_schedule_mangle_last(key2, (i + 1) % 4).store_le(&m_EK[4 * (3 * i + 3)]);
536 aes_schedule_mangle_last_dec(key2).store_le(&m_DK[4 * (9 - 3 * i)]);
537 } else {
538 aes_schedule_mangle(key2, (i + 1) % 4).store_le(&m_EK[4 * (3 * i + 3)]);
539 aes_schedule_mangle_dec(key2, (i + 1) % 4).store_le(&m_DK[4 * (9 - 3 * i)]);
540 }
541
542 key1 = key2;
543 key2 = aes_schedule_192_smear(key2, t);
544 }
545}
546
547void BOTAN_FN_ISA_SIMD_4X32 AES_256::vperm_key_schedule(const uint8_t keyb[], size_t /*unused*/) {
548 m_EK.resize(15 * 4);
549 m_DK.resize(15 * 4);
550
551 SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
552 SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 16);
553
554 shiftrows(key1, 2).store_le(&m_DK[4 * 14]);
555
556 key1 = aes_schedule_transform_init(key1);
557 key2 = aes_schedule_transform_init(key2);
558
559 key1.store_le(&m_EK[0]);
560 aes_schedule_mangle(key2, 3).store_le(&m_EK[4]);
561
562 aes_schedule_mangle_dec(key2, 1).store_le(&m_DK[4 * 13]);
563
564 const SIMD_4x32 shuffle3333 = SIMD_4x32::splat(0x0F0E0D0C);
565
566 for(size_t i = 2; i != 14; i += 2) {
567 const SIMD_4x32 k_t = key2;
568 key1 = key2 = aes_schedule_round_rcon((i / 2) - 1, key2, key1);
569
570 aes_schedule_mangle(key2, i % 4).store_le(&m_EK[4 * i]);
571 aes_schedule_mangle_dec(key2, (i + 2) % 4).store_le(&m_DK[4 * (14 - i)]);
572
573 key2 = aes_schedule_round(shuffle(key2, shuffle3333), k_t);
574
575 aes_schedule_mangle(key2, (i - 1) % 4).store_le(&m_EK[4 * (i + 1)]);
576 aes_schedule_mangle_dec(key2, (i + 1) % 4).store_le(&m_DK[4 * (13 - i)]);
577 }
578
579 key2 = aes_schedule_round_rcon(6, key2, key1);
580
581 aes_schedule_mangle_last(key2, 2).store_le(&m_EK[4 * 14]);
582 aes_schedule_mangle_last_dec(key2).store_le(&m_DK[0]);
583}
584
585// NOLINTEND(readability-container-data-pointer)
586
587} // namespace Botan
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 load_le(const void *in) noexcept
Definition simd_4x32.h:162
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 byte_shuffle(const SIMD_4x32 &tbl, const SIMD_4x32 &idx)
Definition simd_4x32.h:803
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 alignr8(const SIMD_4x32 &a, const SIMD_4x32 &b)
Definition simd_4x32.h:886
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 splat(uint32_t B) noexcept
Definition simd_4x32.h:127
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 masked_byte_shuffle(const SIMD_4x32 &tbl, const SIMD_4x32 &idx)
Definition simd_4x32.h:840
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 splat_u8(uint8_t B) noexcept
Definition simd_4x32.h:144
BOTAN_FN_ISA_SIMD_4X32 SIMD_4x32 bswap() const noexcept
Definition simd_4x32.h:576
SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 shift_elems_left() const noexcept
Definition simd_4x32.h:602
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
constexpr void unpoison(const T *p, size_t n)
Definition ct_utils.h:67
constexpr void poison(const T *p, size_t n)
Definition ct_utils.h:56