Botan 3.11.1
Crypto and TLS for C++
seed_avx512_gfni.cpp
Go to the documentation of this file.
1/*
2* (C) 2026 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/seed.h>
8
9#include <botan/mem_ops.h>
10#include <botan/internal/isa_extn.h>
11#include <botan/internal/simd_avx512_gfni.h>
12
13namespace Botan {
14
namespace SEED_AVX512_GFNI {
17namespace {
18
19BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI SIMD_16x32 seed_g(const SIMD_16x32& X) {
20 /*
21 * SEED's two sboxes are both based on inversions in GF(2^8) modulo the polynomial
22 * x^8+x^6+x^5+x+1 (0x163), followed by different affine transforms.
23 *
24 * GFNI uses AES's field (modulo 0x11B) so the pre-inversion matrix is a field isomorphism
25 * that maps the inputs into the AES field. The post-inversion matrices then apply map
26 * back to SEED's field and apply the appropriate linear transform.
27 */
28
29 // Field isomorphism from SEED's field to AES field
30 constexpr uint64_t seed_pre_a = gfni_matrix(R"(
31 1 1 0 1 0 0 0 0
32 0 0 1 1 0 0 1 1
33 0 0 0 0 1 1 0 1
34 0 1 1 1 0 1 0 0
35 0 1 1 0 1 0 0 0
36 0 0 0 1 1 0 0 0
37 0 0 1 1 1 1 0 0
38 0 0 0 0 1 1 1 0
39 )");
40
41 // Field isomorphism from AES->SEED multiplied by S0's affine matrix
42 constexpr uint64_t seed_s0_post_a = gfni_matrix(R"(
43 0 1 0 1 1 0 0 1
44 0 0 1 1 1 0 1 0
45 1 0 0 0 1 1 1 0
46 1 1 0 0 1 0 0 1
47 0 1 0 1 1 0 1 1
48 1 1 1 1 1 0 1 1
49 0 0 1 1 0 1 0 1
50 0 0 0 1 0 1 1 1
51 )");
52
53 // Field isomorphism from AES->SEED multiplied by S1's affine matrix
54 constexpr uint64_t seed_s1_post_a = gfni_matrix(R"(
55 0 0 1 1 0 1 1 0
56 0 1 1 0 0 0 1 0
57 0 1 0 1 1 0 1 1
58 0 0 0 0 0 0 1 1
59 1 1 0 1 0 0 0 0
60 0 1 0 0 1 0 1 1
61 1 1 1 0 1 0 1 1
62 1 1 1 1 0 0 0 1
63 )");
64
65 constexpr uint8_t seed_s0_post_c = 0xA9;
66 constexpr uint8_t seed_s1_post_c = 0x38;
67
68 // Compute S0(x) and S1(x) for all bytes
69 const auto pre = gf2p8affine<seed_pre_a, 0x00>(X);
72
73 // Blend S0/S1 outputs by alternating bytes
74 constexpr uint64_t blend_mask = 0xAAAAAAAAAAAAAAAA; // 0b1010....
75 const auto sbox = SIMD_16x32(_mm512_mask_blend_epi8(blend_mask, s0.raw(), s1.raw()));
76
77 // Linear mixing layer
78 const auto M0 = SIMD_16x32::splat(0x3FCFF3FC);
79 const auto M1 = SIMD_16x32::splat(0xFC3FCFF3);
80 const auto M2 = SIMD_16x32::splat(0xF3FC3FCF);
81 const auto M3 = SIMD_16x32::splat(0xCFF3FC3F);
82
83 // Masks for broadcasting each byte across the 32 bit word that contains it
84
85 // clang-format off
86 alignas(64) constexpr uint8_t SHUF_BYTE0[64] = {
87 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12,
88 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12,
89 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12,
90 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12,
91 };
92 alignas(64) constexpr uint8_t SHUF_BYTE1[64] = {
93 1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13,
94 1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13,
95 1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13,
96 1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13,
97 };
98 alignas(64) constexpr uint8_t SHUF_BYTE2[64] = {
99 2, 2, 2, 2, 6, 6, 6, 6, 10, 10, 10, 10, 14, 14, 14, 14,
100 2, 2, 2, 2, 6, 6, 6, 6, 10, 10, 10, 10, 14, 14, 14, 14,
101 2, 2, 2, 2, 6, 6, 6, 6, 10, 10, 10, 10, 14, 14, 14, 14,
102 2, 2, 2, 2, 6, 6, 6, 6, 10, 10, 10, 10, 14, 14, 14, 14,
103 };
104 alignas(64) constexpr uint8_t SHUF_BYTE3[64] = {
105 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15,
106 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15,
107 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15,
108 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15,
109 };
110 // clang-format on
111
112 const auto b0 = SIMD_16x32(_mm512_shuffle_epi8(sbox.raw(), _mm512_load_si512(SHUF_BYTE0)));
113 const auto b1 = SIMD_16x32(_mm512_shuffle_epi8(sbox.raw(), _mm512_load_si512(SHUF_BYTE1)));
114 const auto b2 = SIMD_16x32(_mm512_shuffle_epi8(sbox.raw(), _mm512_load_si512(SHUF_BYTE2)));
115 const auto b3 = SIMD_16x32(_mm512_shuffle_epi8(sbox.raw(), _mm512_load_si512(SHUF_BYTE3)));
116
117 // Return (b0 & M0) ^ (b1 & M1) ^ (b2 & M2) ^ (b3 & M3)
118 // ternlogd 0x78 is a ^ (b & c)
119 auto result = SIMD_16x32(b0) & M0;
120 result = SIMD_16x32::ternary_fn<0x78>(result, b1, M1);
121 result = SIMD_16x32::ternary_fn<0x78>(result, b2, M2);
122 result = SIMD_16x32::ternary_fn<0x78>(result, b3, M3);
123
124 return SIMD_16x32(result);
125}
126
127BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void seed_round(
128 SIMD_16x32& B0, SIMD_16x32& B1, SIMD_16x32& B2, SIMD_16x32& B3, uint32_t K0, uint32_t K1, uint32_t K2, uint32_t K3) {
129 auto T0 = B2 ^ SIMD_16x32::splat(K0);
130 auto T1 = seed_g(B2 ^ B3 ^ SIMD_16x32::splat(K1));
131 T0 = seed_g(T1 + T0);
132 T1 = seed_g(T1 + T0);
133 B1 ^= T1;
134 B0 ^= T0 + T1;
135
136 T0 = B0 ^ SIMD_16x32::splat(K2);
137 T1 = seed_g(B0 ^ B1 ^ SIMD_16x32::splat(K3));
138 T0 = seed_g(T1 + T0);
139 T1 = seed_g(T1 + T0);
140 B3 ^= T1;
141 B2 ^= T0 + T1;
142}
143
144BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void encrypt(const uint8_t ptext[16 * 4 * 4],
145 uint8_t ctext[16 * 4 * 4],
146 std::span<const uint32_t> RK) {
147 SIMD_16x32 B0 = SIMD_16x32::load_be(ptext + 16 * 4 * 0);
148 SIMD_16x32 B1 = SIMD_16x32::load_be(ptext + 16 * 4 * 1);
149 SIMD_16x32 B2 = SIMD_16x32::load_be(ptext + 16 * 4 * 2);
150 SIMD_16x32 B3 = SIMD_16x32::load_be(ptext + 16 * 4 * 3);
151
152 SIMD_16x32::transpose(B0, B1, B2, B3);
153
154 for(size_t j = 0; j != 8; ++j) {
155 const uint32_t K0 = RK[4 * j];
156 const uint32_t K1 = RK[4 * j + 1];
157 const uint32_t K2 = RK[4 * j + 2];
158 const uint32_t K3 = RK[4 * j + 3];
159
160 seed_round(B0, B1, B2, B3, K0, K1, K2, K3);
161 }
162
163 // Output order is B2, B3, B0, B1
164 SIMD_16x32::transpose(B2, B3, B0, B1);
165 B2.store_be(ctext + 16 * 4 * 0);
166 B3.store_be(ctext + 16 * 4 * 1);
167 B0.store_be(ctext + 16 * 4 * 2);
168 B1.store_be(ctext + 16 * 4 * 3);
169}
170
171BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void decrypt(const uint8_t ctext[16 * 4 * 4],
172 uint8_t ptext[16 * 4 * 4],
173 std::span<const uint32_t> RK) {
174 SIMD_16x32 B0 = SIMD_16x32::load_be(ctext + 16 * 4 * 0);
175 SIMD_16x32 B1 = SIMD_16x32::load_be(ctext + 16 * 4 * 1);
176 SIMD_16x32 B2 = SIMD_16x32::load_be(ctext + 16 * 4 * 2);
177 SIMD_16x32 B3 = SIMD_16x32::load_be(ctext + 16 * 4 * 3);
178
179 SIMD_16x32::transpose(B0, B1, B2, B3);
180
181 for(size_t j = 0; j != 8; ++j) {
182 const uint32_t K0 = RK[30 - 4 * j];
183 const uint32_t K1 = RK[31 - 4 * j];
184 const uint32_t K2 = RK[28 - 4 * j];
185 const uint32_t K3 = RK[29 - 4 * j];
186
187 seed_round(B0, B1, B2, B3, K0, K1, K2, K3);
188 }
189
190 SIMD_16x32::transpose(B2, B3, B0, B1);
191 B2.store_be(ptext + 16 * 4 * 0);
192 B3.store_be(ptext + 16 * 4 * 1);
193 B0.store_be(ptext + 16 * 4 * 2);
194 B1.store_be(ptext + 16 * 4 * 3);
195}
196
197BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void encrypt_x2(const uint8_t ptext[32 * 4 * 4],
198 uint8_t ctext[32 * 4 * 4],
199 std::span<const uint32_t> RK) {
200 SIMD_16x32 B0 = SIMD_16x32::load_be(ptext + 16 * 4 * 0);
201 SIMD_16x32 B1 = SIMD_16x32::load_be(ptext + 16 * 4 * 1);
202 SIMD_16x32 B2 = SIMD_16x32::load_be(ptext + 16 * 4 * 2);
203 SIMD_16x32 B3 = SIMD_16x32::load_be(ptext + 16 * 4 * 3);
204
205 SIMD_16x32 B4 = SIMD_16x32::load_be(ptext + 16 * 4 * 4);
206 SIMD_16x32 B5 = SIMD_16x32::load_be(ptext + 16 * 4 * 5);
207 SIMD_16x32 B6 = SIMD_16x32::load_be(ptext + 16 * 4 * 6);
208 SIMD_16x32 B7 = SIMD_16x32::load_be(ptext + 16 * 4 * 7);
209
210 SIMD_16x32::transpose(B0, B1, B2, B3);
211 SIMD_16x32::transpose(B4, B5, B6, B7);
212
213 for(size_t j = 0; j != 8; ++j) {
214 const uint32_t K0 = RK[4 * j];
215 const uint32_t K1 = RK[4 * j + 1];
216 const uint32_t K2 = RK[4 * j + 2];
217 const uint32_t K3 = RK[4 * j + 3];
218
219 seed_round(B0, B1, B2, B3, K0, K1, K2, K3);
220 seed_round(B4, B5, B6, B7, K0, K1, K2, K3);
221 }
222
223 SIMD_16x32::transpose(B2, B3, B0, B1);
224 SIMD_16x32::transpose(B6, B7, B4, B5);
225
226 B2.store_be(ctext + 16 * 4 * 0);
227 B3.store_be(ctext + 16 * 4 * 1);
228 B0.store_be(ctext + 16 * 4 * 2);
229 B1.store_be(ctext + 16 * 4 * 3);
230
231 B6.store_be(ctext + 16 * 4 * 4);
232 B7.store_be(ctext + 16 * 4 * 5);
233 B4.store_be(ctext + 16 * 4 * 6);
234 B5.store_be(ctext + 16 * 4 * 7);
235}
236
237BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void decrypt_x2(const uint8_t ctext[32 * 4 * 4],
238 uint8_t ptext[32 * 4 * 4],
239 std::span<const uint32_t> RK) {
240 SIMD_16x32 B0 = SIMD_16x32::load_be(ctext + 16 * 4 * 0);
241 SIMD_16x32 B1 = SIMD_16x32::load_be(ctext + 16 * 4 * 1);
242 SIMD_16x32 B2 = SIMD_16x32::load_be(ctext + 16 * 4 * 2);
243 SIMD_16x32 B3 = SIMD_16x32::load_be(ctext + 16 * 4 * 3);
244
245 SIMD_16x32 B4 = SIMD_16x32::load_be(ctext + 16 * 4 * 4);
246 SIMD_16x32 B5 = SIMD_16x32::load_be(ctext + 16 * 4 * 5);
247 SIMD_16x32 B6 = SIMD_16x32::load_be(ctext + 16 * 4 * 6);
248 SIMD_16x32 B7 = SIMD_16x32::load_be(ctext + 16 * 4 * 7);
249
250 SIMD_16x32::transpose(B0, B1, B2, B3);
251 SIMD_16x32::transpose(B4, B5, B6, B7);
252
253 for(size_t j = 0; j != 8; ++j) {
254 const uint32_t K0 = RK[30 - 4 * j];
255 const uint32_t K1 = RK[31 - 4 * j];
256 const uint32_t K2 = RK[28 - 4 * j];
257 const uint32_t K3 = RK[29 - 4 * j];
258
259 seed_round(B0, B1, B2, B3, K0, K1, K2, K3);
260 seed_round(B4, B5, B6, B7, K0, K1, K2, K3);
261 }
262
263 SIMD_16x32::transpose(B2, B3, B0, B1);
264 SIMD_16x32::transpose(B6, B7, B4, B5);
265
266 B2.store_be(ptext + 16 * 4 * 0);
267 B3.store_be(ptext + 16 * 4 * 1);
268 B0.store_be(ptext + 16 * 4 * 2);
269 B1.store_be(ptext + 16 * 4 * 3);
270
271 B6.store_be(ptext + 16 * 4 * 4);
272 B7.store_be(ptext + 16 * 4 * 5);
273 B4.store_be(ptext + 16 * 4 * 6);
274 B5.store_be(ptext + 16 * 4 * 7);
275}
276
277} // namespace
278
279} // namespace SEED_AVX512_GFNI
280
281void BOTAN_FN_ISA_AVX512_GFNI SEED::avx512_gfni_encrypt(const uint8_t ptext[], uint8_t ctext[], size_t blocks) const {
282 while(blocks >= 32) {
283 SEED_AVX512_GFNI::encrypt_x2(ptext, ctext, m_K);
284 ptext += 16 * 32;
285 ctext += 16 * 32;
286 blocks -= 32;
287 }
288
289 while(blocks >= 16) {
290 SEED_AVX512_GFNI::encrypt(ptext, ctext, m_K);
291 ptext += 16 * 16;
292 ctext += 16 * 16;
293 blocks -= 16;
294 }
295
296 if(blocks > 0) {
297 BOTAN_ASSERT_NOMSG(blocks < 16);
298 uint8_t pbuf[16 * 16] = {0};
299 uint8_t cbuf[16 * 16] = {0};
300 copy_mem(pbuf, ptext, blocks * 16);
301 SEED_AVX512_GFNI::encrypt(pbuf, cbuf, m_K);
302 copy_mem(ctext, cbuf, blocks * 16);
303 }
304}
305
306void BOTAN_FN_ISA_AVX512_GFNI SEED::avx512_gfni_decrypt(const uint8_t ctext[], uint8_t ptext[], size_t blocks) const {
307 while(blocks >= 32) {
308 SEED_AVX512_GFNI::decrypt_x2(ctext, ptext, m_K);
309 ptext += 16 * 32;
310 ctext += 16 * 32;
311 blocks -= 32;
312 }
313
314 while(blocks >= 16) {
315 SEED_AVX512_GFNI::decrypt(ctext, ptext, m_K);
316 ptext += 16 * 16;
317 ctext += 16 * 16;
318 blocks -= 16;
319 }
320
321 if(blocks > 0) {
322 BOTAN_ASSERT_NOMSG(blocks < 16);
323 uint8_t pbuf[16 * 16] = {0};
324 uint8_t cbuf[16 * 16] = {0};
325 copy_mem(cbuf, ctext, blocks * 16);
326 SEED_AVX512_GFNI::decrypt(cbuf, pbuf, m_K);
327 copy_mem(ptext, pbuf, blocks * 16);
328 }
329}
330
331} // namespace Botan
#define BOTAN_ASSERT_NOMSG(expr)
Definition assert.h:75
static BOTAN_FN_ISA_AVX512 void transpose(SIMD_16x32 &B0, SIMD_16x32 &B1, SIMD_16x32 &B2, SIMD_16x32 &B3)
static BOTAN_FN_ISA_AVX512 SIMD_16x32 ternary_fn(const SIMD_16x32 &a, const SIMD_16x32 &b, const SIMD_16x32 &c)
BOTAN_FN_ISA_AVX512 void store_be(uint8_t out[]) const
Definition simd_avx512.h:74
static BOTAN_FN_ISA_AVX512 SIMD_16x32 splat(uint32_t B)
Definition simd_avx512.h:60
static BOTAN_FN_ISA_AVX512 SIMD_16x32 load_be(const uint8_t *in)
Definition simd_avx512.h:68
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
constexpr void copy_mem(T *out, const T *in, size_t n)
Definition mem_ops.h:144
consteval uint64_t gfni_matrix(std::string_view s)
Definition gfni_utils.h:17
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX2_GFNI SIMD_8x32 gf2p8affineinv(const SIMD_8x32 &x)
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX2_GFNI SIMD_8x32 gf2p8affine(const SIMD_8x32 &x)