Botan 3.11.0
Crypto and TLS for C++
aria_avx512_gfni.cpp
Go to the documentation of this file.
1/*
2* (C) 2026 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/aria.h>
8
9#include <botan/mem_ops.h>
10#include <botan/internal/isa_extn.h>
11#include <botan/internal/simd_avx2_gfni.h>
12#include <botan/internal/simd_avx512.h>
13
14namespace Botan {
15
16namespace ARIA_AVX512 {
17
18namespace {
19
20/*
21* ARIA has two S-boxes pairs S1/X1 (the Rijndael sbox and its inverse)
22* and S2/X2 (another sbox and its inverse), all of which can be described
23* as an affine transformation applied to an inversion in GF(2^8)
24*
25* A very helpful reference for this implementation was
26*
27* "AVX-Based Acceleration of ARIA Block Cipher Algorithm"
28* by Yoo, Kivilinna, Cho.
29* IEEE Access, Vol. 11, 2023 (DOI: 10.1109/ACCESS.2023.3298026)
30* <https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10190597>
31*
32* The paper describes the sbox decompositions (Section IV. A. 1.)
33*
34* S1(x) = A_S1(inv(x)) -> affineinv(AFF_S1, x, 0x63)
35* S2(x) = A_S2(inv(x)) -> affineinv(AFF_S2, x, 0xE2)
36* X1(x) = inv(A_{S1^-1}(x)) -> affine(AFF_X1, x, 0x05) then affineinv(I, y, 0)
37* X2(x) = inv(A_{S2^-1}(x)) -> affine(AFF_X2, x, 0x2C) then affineinv(I, y, 0)
38*
39* where inv(x) = x^-1 in GF(2^8), implemented by the GFNI affineinv instruction
40* and the AFF_* matrixes are the constants following.
41*
42* The approach used here diverges from the implementation described in the
43* paper; they used AVX-512 to compute 64 blocks in parallel. This implementation
44* instead takes advantage of the fact that AVX-512/GFNI can use 4 different GFNI
45* affine constants in a single call, and so needs only 16 block chunks. This
46* leads to less register pressure and (imo) a simpler implementation, albeit likely
47* giving up some performance with larger input sizes.
48*/
49
// Affine layer of S1 (the AES/Rijndael sbox), applied after GF(2^8)
// inversion: S1(x) = AFF_S1 * inv(x) + 0x63
constexpr uint64_t AFF_S1 = gfni_matrix(R"(
   1 0 0 0 1 1 1 1
   1 1 0 0 0 1 1 1
   1 1 1 0 0 0 1 1
   1 1 1 1 0 0 0 1
   1 1 1 1 1 0 0 0
   0 1 1 1 1 1 0 0
   0 0 1 1 1 1 1 0
   0 0 0 1 1 1 1 1)");

// Affine layer of S2 (ARIA's second sbox), applied after inversion:
// S2(x) = AFF_S2 * inv(x) + 0xE2
constexpr uint64_t AFF_S2 = gfni_matrix(R"(
   0 1 0 1 0 1 1 1
   0 0 1 1 1 1 1 1
   1 1 1 0 1 1 0 1
   1 1 0 0 0 0 1 1
   0 1 0 0 0 0 1 1
   1 1 0 0 1 1 1 0
   0 1 1 0 0 0 1 1
   1 1 1 1 0 1 1 0)");

// Affine layer of X1 = S1^-1, applied *before* inversion:
// X1(x) = inv(AFF_X1 * x + 0x05)
constexpr uint64_t AFF_X1 = gfni_matrix(R"(
   0 0 1 0 0 1 0 1
   1 0 0 1 0 0 1 0
   0 1 0 0 1 0 0 1
   1 0 1 0 0 1 0 0
   0 1 0 1 0 0 1 0
   0 0 1 0 1 0 0 1
   1 0 0 1 0 1 0 0
   0 1 0 0 1 0 1 0)");

// Affine layer of X2 = S2^-1, applied *before* inversion:
// X2(x) = inv(AFF_X2 * x + 0x2C)
constexpr uint64_t AFF_X2 = gfni_matrix(R"(
   0 0 0 1 1 0 0 0
   0 0 1 0 0 1 1 0
   0 0 0 0 1 0 1 0
   1 1 1 0 0 0 1 1
   1 1 1 0 1 1 0 0
   0 1 1 0 1 0 1 1
   1 0 1 1 1 1 0 1
   1 0 0 1 0 0 1 1)");

// GFNI identity matrix; used in lanes where one of the two affine
// transformations (pre- or post-inversion) should be a no-op
constexpr uint64_t IDENTITY = gfni_matrix(R"(
   1 0 0 0 0 0 0 0
   0 1 0 0 0 0 0 0
   0 0 1 0 0 0 0 0
   0 0 0 1 0 0 0 0
   0 0 0 0 1 0 0 0
   0 0 0 0 0 1 0 0
   0 0 0 0 0 0 1 0
   0 0 0 0 0 0 0 1)");
100
// Apply one of ARIA's 4-sbox substitution layers to 16 words at once.
// The pre_* affine transform is applied before the GF(2^8) inversion and
// the post_* transform after it; each __m512i constant carries a distinct
// matrix/constant per 64-bit lane so all four sboxes run in one sequence.
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI SIMD_16x32
apply_aria_sbox(SIMD_16x32 x, __m512i pre_mat, __m512i pre_const, __m512i post_mat, __m512i post_const) {
   /*
   * After transposing the blocks, we have 4 16-word registers where register 0 contains
   * all of the first words of the block, etc.
   *
   * However ARIA wants to send adjacent bytes of each word through the 4 different
   * sboxes (either S1||S2||X1||X2 for "FE rounds" or X1||X2||S1||S2 for "FO rounds").
   * This is handled here by using a permutation to send the 16 first bytes into the
   * first zmm lane, the 16 second bytes in the second zmm lane, etc. GFNI lets you
   * specify different affine matrices for each lane so we can then compute all 4 sboxes
   * with a single sequence. We cannot make use of GFNI's builtin XOR/add instruction,
   * since we need to use different constants for each lane, but this just requires an
   * extra XOR instruction after the GFNI instructions.
   */

   // Byte indices gathering byte k of every word into 128-bit lane k
   const __m512i fwd_perm = _mm512_set_epi64(0x3F3B37332F2B2723,
                                             0x1F1B17130F0B0703,
                                             0x3E3A36322E2A2622,
                                             0x1E1A16120E0A0602,
                                             0x3D3935312D292521,
                                             0x1D1915110D090501,
                                             0x3C3834302C282420,
                                             0x1C1814100C080400);

   // The inverse of fwd_perm, restoring the original byte order
   const __m512i inv_perm = _mm512_set_epi64(0x3F2F1F0F3E2E1E0E,
                                             0x3D2D1D0D3C2C1C0C,
                                             0x3B2B1B0B3A2A1A0A,
                                             0x3929190938281808,
                                             0x3727170736261606,
                                             0x3525150534241404,
                                             0x3323130332221202,
                                             0x3121110130201000);

   // Permute to align bytes into the 128-bit sbox lanes
   __m512i v = _mm512_permutexvar_epi8(fwd_perm, x.raw());

   // The sbox magic: affine transform, then combined affine+inversion,
   // each followed by an explicit per-lane constant XOR
   v = _mm512_xor_si512(_mm512_gf2p8affine_epi64_epi8(v, pre_mat, 0), pre_const);
   v = _mm512_xor_si512(_mm512_gf2p8affineinv_epi64_epi8(v, post_mat, 0), post_const);

   // Permute back to standard ordering
   v = _mm512_permutexvar_epi8(inv_perm, v);
   return SIMD_16x32(v);
}
146
147BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI SIMD_16x32 apply_fo_sbox(SIMD_16x32 x) {
148 /*
149 * FO is S1 || S2 || X1 || X2
150 *
151 * S1/S2 requires the affine transformation after the inversion, likewise X1/X2 requires
152 * the affine transformation before the inversion. So half of the matrices in use for
153 * each instruction are the identity.
154 */
155 const __m512i fo_pre_mat = _mm512_set_epi64(IDENTITY, IDENTITY, IDENTITY, IDENTITY, AFF_X1, AFF_X1, AFF_X2, AFF_X2);
156
157 const __m512i fo_post_mat = _mm512_set_epi64(AFF_S1, AFF_S1, AFF_S2, AFF_S2, IDENTITY, IDENTITY, IDENTITY, IDENTITY);
158
159 const __m512i fo_pre_const = _mm512_set_epi64(0x0000000000000000,
160 0x0000000000000000,
161 0x0000000000000000,
162 0x0000000000000000,
163 0x0505050505050505,
164 0x0505050505050505,
165 0x2C2C2C2C2C2C2C2C,
166 0x2C2C2C2C2C2C2C2C);
167
168 const __m512i fo_post_const = _mm512_set_epi64(0x6363636363636363,
169 0x6363636363636363,
170 0xE2E2E2E2E2E2E2E2,
171 0xE2E2E2E2E2E2E2E2,
172 0x0000000000000000,
173 0x0000000000000000,
174 0x0000000000000000,
175 0x0000000000000000);
176
177 return apply_aria_sbox(x, fo_pre_mat, fo_pre_const, fo_post_mat, fo_post_const);
178}
179
180BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI SIMD_16x32 apply_fe_sbox(SIMD_16x32 x) {
181 const __m512i fe_pre_mat = _mm512_set_epi64(AFF_X1, AFF_X1, AFF_X2, AFF_X2, IDENTITY, IDENTITY, IDENTITY, IDENTITY);
182
183 const __m512i fe_post_mat = _mm512_set_epi64(IDENTITY, IDENTITY, IDENTITY, IDENTITY, AFF_S1, AFF_S1, AFF_S2, AFF_S2);
184
185 const __m512i fe_pre_const = _mm512_set_epi64(0x0505050505050505,
186 0x0505050505050505,
187 0x2C2C2C2C2C2C2C2C,
188 0x2C2C2C2C2C2C2C2C,
189 0x0000000000000000,
190 0x0000000000000000,
191 0x0000000000000000,
192 0x0000000000000000);
193
194 const __m512i fe_post_const = _mm512_set_epi64(0x0000000000000000,
195 0x0000000000000000,
196 0x0000000000000000,
197 0x0000000000000000,
198 0x6363636363636363,
199 0x6363636363636363,
200 0xE2E2E2E2E2E2E2E2,
201 0xE2E2E2E2E2E2E2E2);
202
203 return apply_aria_sbox(x, fe_pre_mat, fe_pre_const, fe_post_mat, fe_post_const);
204}
205
206BOTAN_FN_ISA_AVX512 BOTAN_FORCE_INLINE SIMD_16x32 swap_abcd_badc(SIMD_16x32 x) {
207 // Why you no 16-bit rotate Intel?
208
209 const __m512i rol16 = _mm512_set_epi64(0x0E0F0C0D0A0B0809,
210 0x0607040502030001,
211 0x0E0F0C0D0A0B0809,
212 0x0607040502030001,
213 0x0E0F0C0D0A0B0809,
214 0x0607040502030001,
215 0x0E0F0C0D0A0B0809,
216 0x0607040502030001);
217
218 return SIMD_16x32(_mm512_shuffle_epi8(x.raw(), rol16));
219}
220
221/*
222* This applies mixing in much the same way as the M1/M2/M3/M4 constants in the
223* scalar/table version in aria.cpp (ARIA_F1/ARIA_F2)
224*
225* Notice that the constants are rotational and each has the property that it
226* maps the byte into all 3 of the other bytes, ie byte 0 goes into bytes 1,2,3,
227* then byte 1 goes into bytes 0,2,3, ....
228*
229* This is neatly handled by XORing together rotations of the words
230*/
231BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 SIMD_16x32 aria_fo_m(SIMD_16x32 x) {
232 return x.rotl<8>() ^ x.rotl<16>() ^ x.rotl<24>();
233}
234
235BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 SIMD_16x32 aria_fe_m(SIMD_16x32 x) {
236 return x ^ x.rotl<8>() ^ x.rotl<24>();
237}
238
// In-place XOR mixing of the four word registers, used twice per round
// function. This is a sequential network — each statement reads results of
// the previous ones — so the statement order is load-bearing.
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 void aria_mix(SIMD_16x32& B0, SIMD_16x32& B1, SIMD_16x32& B2, SIMD_16x32& B3) {
   B1 ^= B2;
   B2 ^= B3;
   B0 ^= B1;
   B3 ^= B1;
   B2 ^= B0;
   B1 ^= B2;
}
247
// The FO ("odd") round function: FO sbox layer and per-word diffusion,
// then the byte/word mixing, applied to all 16 blocks in parallel
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void aria_fo(SIMD_16x32& B0,
                                                         SIMD_16x32& B1,
                                                         SIMD_16x32& B2,
                                                         SIMD_16x32& B3) {
   // Substitution (X1||X2||S1||S2) followed by intra-word diffusion
   B0 = aria_fo_m(apply_fo_sbox(B0));
   B1 = aria_fo_m(apply_fo_sbox(B1));
   B2 = aria_fo_m(apply_fo_sbox(B2));
   B3 = aria_fo_m(apply_fo_sbox(B3));

   aria_mix(B0, B1, B2, B3);

   // Byte rotations of words 1..3 between the two mixing steps
   B1 = swap_abcd_badc(B1);
   B2 = B2.rotl<16>();
   B3 = B3.bswap();

   aria_mix(B0, B1, B2, B3);
}
265
// The FE ("even") round function; the mirror image of aria_fo, with the
// byte rotations applied to words 3, 0 and 1 instead of 1, 2 and 3
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void aria_fe(SIMD_16x32& B0,
                                                         SIMD_16x32& B1,
                                                         SIMD_16x32& B2,
                                                         SIMD_16x32& B3) {
   // Substitution (S1||S2||X1||X2) followed by intra-word diffusion
   B0 = aria_fe_m(apply_fe_sbox(B0));
   B1 = aria_fe_m(apply_fe_sbox(B1));
   B2 = aria_fe_m(apply_fe_sbox(B2));
   B3 = aria_fe_m(apply_fe_sbox(B3));

   aria_mix(B0, B1, B2, B3);

   // Byte rotations of words 3, 0 and 1 between the two mixing steps
   B3 = swap_abcd_badc(B3);
   B0 = B0.rotl<16>();
   B1 = B1.bswap();

   aria_mix(B0, B1, B2, B3);
}
283
284/*
285* 16-wide ARIA block processing
286*/
287BOTAN_FN_ISA_AVX512_GFNI
288void transform_16(const uint8_t in[], uint8_t out[], std::span<const uint32_t> KS) {
289 const size_t ROUNDS = (KS.size() / 4) - 1;
290
291 BOTAN_ASSERT_NOMSG(ROUNDS == 12 || ROUNDS == 14 || ROUNDS == 16);
292
294 SIMD_16x32 B1 = SIMD_16x32::load_be(in + 64);
295 SIMD_16x32 B2 = SIMD_16x32::load_be(in + 128);
296 SIMD_16x32 B3 = SIMD_16x32::load_be(in + 192);
297
298 SIMD_16x32::transpose(B0, B1, B2, B3);
299
300 for(size_t r = 0; r != ROUNDS; r += 2) {
301 B0 ^= SIMD_16x32::splat(KS[4 * r]);
302 B1 ^= SIMD_16x32::splat(KS[4 * r + 1]);
303 B2 ^= SIMD_16x32::splat(KS[4 * r + 2]);
304 B3 ^= SIMD_16x32::splat(KS[4 * r + 3]);
305 aria_fo(B0, B1, B2, B3);
306
307 B0 ^= SIMD_16x32::splat(KS[4 * r + 4]);
308 B1 ^= SIMD_16x32::splat(KS[4 * r + 5]);
309 B2 ^= SIMD_16x32::splat(KS[4 * r + 6]);
310 B3 ^= SIMD_16x32::splat(KS[4 * r + 7]);
311
312 if(r != ROUNDS - 2) {
313 aria_fe(B0, B1, B2, B3);
314 }
315 }
316
317 B0 = apply_fe_sbox(B0) ^ SIMD_16x32::splat(KS[4 * ROUNDS]);
318 B1 = apply_fe_sbox(B1) ^ SIMD_16x32::splat(KS[4 * ROUNDS + 1]);
319 B2 = apply_fe_sbox(B2) ^ SIMD_16x32::splat(KS[4 * ROUNDS + 2]);
320 B3 = apply_fe_sbox(B3) ^ SIMD_16x32::splat(KS[4 * ROUNDS + 3]);
321
322 SIMD_16x32::transpose(B0, B1, B2, B3);
323
324 B0.store_be(out);
325 B1.store_be(out + 64);
326 B2.store_be(out + 128);
327 B3.store_be(out + 192);
328}
329
330void BOTAN_FN_ISA_AVX512_GFNI aria_transform(const uint8_t in[],
331 uint8_t out[],
332 size_t blocks,
333 std::span<const uint32_t> KS) {
334 while(blocks >= 16) {
335 ARIA_AVX512::transform_16(in, out, KS);
336 in += 16 * 16;
337 out += 16 * 16;
338 blocks -= 16;
339 }
340
341 if(blocks > 0) {
342 uint8_t ibuf[16 * 16] = {0};
343 uint8_t obuf[16 * 16] = {0};
344 copy_mem(ibuf, in, blocks * 16);
345 ARIA_AVX512::transform_16(ibuf, obuf, KS);
346 copy_mem(out, obuf, blocks * 16);
347 }
348}
349
350} // namespace
351
352} // namespace ARIA_AVX512
353
// ARIA-128 encryption via AVX-512/GFNI; forwards to the shared transform
// with the encryption round keys
void BOTAN_FN_ISA_AVX512_GFNI ARIA_128::aria_avx512_gfni_encrypt(const uint8_t in[],
                                                                 uint8_t out[],
                                                                 size_t blocks) const {
   ARIA_AVX512::aria_transform(in, out, blocks, m_ERK);
}
359
// ARIA-128 decryption via AVX-512/GFNI; forwards to the shared transform
// with the decryption round keys
void BOTAN_FN_ISA_AVX512_GFNI ARIA_128::aria_avx512_gfni_decrypt(const uint8_t in[],
                                                                 uint8_t out[],
                                                                 size_t blocks) const {
   ARIA_AVX512::aria_transform(in, out, blocks, m_DRK);
}
365
// ARIA-192 encryption via AVX-512/GFNI; forwards to the shared transform
// with the encryption round keys
void BOTAN_FN_ISA_AVX512_GFNI ARIA_192::aria_avx512_gfni_encrypt(const uint8_t in[],
                                                                 uint8_t out[],
                                                                 size_t blocks) const {
   ARIA_AVX512::aria_transform(in, out, blocks, m_ERK);
}
371
// ARIA-192 decryption via AVX-512/GFNI; forwards to the shared transform
// with the decryption round keys
void BOTAN_FN_ISA_AVX512_GFNI ARIA_192::aria_avx512_gfni_decrypt(const uint8_t in[],
                                                                 uint8_t out[],
                                                                 size_t blocks) const {
   ARIA_AVX512::aria_transform(in, out, blocks, m_DRK);
}
377
// ARIA-256 encryption via AVX-512/GFNI; forwards to the shared transform
// with the encryption round keys
void BOTAN_FN_ISA_AVX512_GFNI ARIA_256::aria_avx512_gfni_encrypt(const uint8_t in[],
                                                                 uint8_t out[],
                                                                 size_t blocks) const {
   ARIA_AVX512::aria_transform(in, out, blocks, m_ERK);
}
383
// ARIA-256 decryption via AVX-512/GFNI; forwards to the shared transform
// with the decryption round keys
void BOTAN_FN_ISA_AVX512_GFNI ARIA_256::aria_avx512_gfni_decrypt(const uint8_t in[],
                                                                 uint8_t out[],
                                                                 size_t blocks) const {
   ARIA_AVX512::aria_transform(in, out, blocks, m_DRK);
}
389
390} // namespace Botan
#define BOTAN_ASSERT_NOMSG(expr)
Definition assert.h:75
static BOTAN_FN_ISA_AVX512 void transpose(SIMD_16x32 &B0, SIMD_16x32 &B1, SIMD_16x32 &B2, SIMD_16x32 &B3)
BOTAN_FN_ISA_AVX512 SIMD_16x32 rotl() const
Definition simd_avx512.h:77
BOTAN_FN_ISA_AVX512 SIMD_16x32 bswap() const
BOTAN_FN_ISA_AVX512 void store_be(uint8_t out[]) const
Definition simd_avx512.h:74
__m512i BOTAN_FN_ISA_AVX512 raw() const
static BOTAN_FN_ISA_AVX512 SIMD_16x32 splat(uint32_t B)
Definition simd_avx512.h:60
static BOTAN_FN_ISA_AVX512 SIMD_16x32 load_be(const uint8_t *in)
Definition simd_avx512.h:68
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
constexpr void copy_mem(T *out, const T *in, size_t n)
Definition mem_ops.h:144
consteval uint64_t gfni_matrix(std::string_view s)