Botan 3.11.1
Crypto and TLS for C++
sm4_avx512.cpp
Go to the documentation of this file.
1/*
2* (C) 2025 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/sm4.h>
8
9#include <botan/mem_ops.h>
10#include <botan/internal/isa_extn.h>
11#include <botan/internal/simd_avx2_gfni.h>
12#include <botan/internal/simd_avx512_gfni.h>
13
14namespace Botan {
15
16namespace SM4_AVX512_GFNI {
17
18namespace {
19
20template <typename SIMD_T>
21BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI SIMD_T sm4_sbox(const SIMD_T& x) {
22 /*
23 * See https://eprint.iacr.org/2022/1154 section 3.3 for details on
24 * how this works
25 */
26 constexpr uint64_t pre_a = gfni_matrix(R"(
27 0 0 1 1 0 0 1 0
28 0 0 0 1 0 1 0 0
29 1 0 1 1 1 1 1 0
30 1 0 0 1 1 1 0 1
31 0 1 0 1 1 0 0 0
32 0 1 0 0 0 1 0 0
33 0 0 0 0 1 0 1 0
34 1 0 1 1 1 0 1 0)");
35
36 constexpr uint8_t pre_c = 0b00111110;
37
38 constexpr uint64_t post_a = gfni_matrix(R"(
39 1 1 0 0 1 1 1 1
40 1 1 0 1 0 1 0 1
41 0 0 1 0 1 1 0 0
42 1 0 0 1 0 1 0 1
43 0 0 1 0 1 1 1 0
44 0 1 1 0 0 1 0 1
45 1 0 1 0 1 1 0 1
46 1 0 0 1 0 0 0 1)");
47
48 constexpr uint8_t post_c = 0b11010011;
49
50 auto y = gf2p8affine<pre_a, pre_c>(x);
52}
53
54template <typename SIMD_T>
55BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI SIMD_T sm4_f(const SIMD_T& x) {
56 const auto sx = sm4_sbox(x);
57 return sx ^ sx.template rotl<2>() ^ sx.template rotl<10>() ^ sx.template rotl<18>() ^ sx.template rotl<24>();
58}
59
60template <typename SIMD_T, size_t M>
61BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void encrypt(const uint8_t ptext[16 * 4 * M],
62 uint8_t ctext[16 * 4 * M],
63 std::span<const uint32_t> RK) {
64 SIMD_T B0 = SIMD_T::load_be(ptext);
65 SIMD_T B1 = SIMD_T::load_be(ptext + 16 * M);
66 SIMD_T B2 = SIMD_T::load_be(ptext + 16 * 2 * M);
67 SIMD_T B3 = SIMD_T::load_be(ptext + 16 * 3 * M);
68
69 SIMD_T::transpose(B0, B1, B2, B3);
70
71 B0 = B0.rev_words();
72 B1 = B1.rev_words();
73 B2 = B2.rev_words();
74 B3 = B3.rev_words();
75
76 for(size_t j = 0; j != 8; ++j) {
77 B0 ^= sm4_f(B1 ^ B2 ^ B3 ^ SIMD_T::splat(RK[4 * j]));
78 B1 ^= sm4_f(B2 ^ B3 ^ B0 ^ SIMD_T::splat(RK[4 * j + 1]));
79 B2 ^= sm4_f(B3 ^ B0 ^ B1 ^ SIMD_T::splat(RK[4 * j + 2]));
80 B3 ^= sm4_f(B0 ^ B1 ^ B2 ^ SIMD_T::splat(RK[4 * j + 3]));
81 }
82
83 SIMD_T::transpose(B0, B1, B2, B3);
84
85 B3.rev_words().store_be(ctext);
86 B2.rev_words().store_be(ctext + 16 * M);
87 B1.rev_words().store_be(ctext + 16 * 2 * M);
88 B0.rev_words().store_be(ctext + 16 * 3 * M);
89}
90
91template <typename SIMD_T, size_t M>
92BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void encrypt_x2(const uint8_t ptext[32 * 4 * M],
93 uint8_t ctext[32 * 4 * M],
94 std::span<const uint32_t> RK) {
95 SIMD_T B0 = SIMD_T::load_be(ptext);
96 SIMD_T B1 = SIMD_T::load_be(ptext + 16 * M);
97 SIMD_T B2 = SIMD_T::load_be(ptext + 16 * 2 * M);
98 SIMD_T B3 = SIMD_T::load_be(ptext + 16 * 3 * M);
99
100 SIMD_T B4 = SIMD_T::load_be(ptext + 16 * 4 * M);
101 SIMD_T B5 = SIMD_T::load_be(ptext + 16 * 5 * M);
102 SIMD_T B6 = SIMD_T::load_be(ptext + 16 * 6 * M);
103 SIMD_T B7 = SIMD_T::load_be(ptext + 16 * 7 * M);
104
105 SIMD_T::transpose(B0, B1, B2, B3);
106 SIMD_T::transpose(B4, B5, B6, B7);
107
108 B0 = B0.rev_words();
109 B1 = B1.rev_words();
110 B2 = B2.rev_words();
111 B3 = B3.rev_words();
112
113 B4 = B4.rev_words();
114 B5 = B5.rev_words();
115 B6 = B6.rev_words();
116 B7 = B7.rev_words();
117
118 for(size_t j = 0; j != 8; ++j) {
119 B0 ^= sm4_f(B1 ^ B2 ^ B3 ^ SIMD_T::splat(RK[4 * j]));
120 B4 ^= sm4_f(B5 ^ B6 ^ B7 ^ SIMD_T::splat(RK[4 * j]));
121
122 B1 ^= sm4_f(B2 ^ B3 ^ B0 ^ SIMD_T::splat(RK[4 * j + 1]));
123 B5 ^= sm4_f(B6 ^ B7 ^ B4 ^ SIMD_T::splat(RK[4 * j + 1]));
124
125 B2 ^= sm4_f(B3 ^ B0 ^ B1 ^ SIMD_T::splat(RK[4 * j + 2]));
126 B6 ^= sm4_f(B7 ^ B4 ^ B5 ^ SIMD_T::splat(RK[4 * j + 2]));
127
128 B3 ^= sm4_f(B0 ^ B1 ^ B2 ^ SIMD_T::splat(RK[4 * j + 3]));
129 B7 ^= sm4_f(B4 ^ B5 ^ B6 ^ SIMD_T::splat(RK[4 * j + 3]));
130 }
131
132 SIMD_T::transpose(B0, B1, B2, B3);
133 SIMD_T::transpose(B4, B5, B6, B7);
134
135 B3.rev_words().store_be(ctext);
136 B2.rev_words().store_be(ctext + 16 * M);
137 B1.rev_words().store_be(ctext + 16 * 2 * M);
138 B0.rev_words().store_be(ctext + 16 * 3 * M);
139
140 B7.rev_words().store_be(ctext + 16 * 4 * M);
141 B6.rev_words().store_be(ctext + 16 * 5 * M);
142 B5.rev_words().store_be(ctext + 16 * 6 * M);
143 B4.rev_words().store_be(ctext + 16 * 7 * M);
144}
145
146template <typename SIMD_T, size_t M>
147BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void decrypt(const uint8_t ctext[16 * 4 * M],
148 uint8_t ptext[16 * 4 * M],
149 std::span<const uint32_t> RK) {
150 SIMD_T B0 = SIMD_T::load_be(ctext);
151 SIMD_T B1 = SIMD_T::load_be(ctext + 16 * M);
152 SIMD_T B2 = SIMD_T::load_be(ctext + 16 * 2 * M);
153 SIMD_T B3 = SIMD_T::load_be(ctext + 16 * 3 * M);
154
155 SIMD_T::transpose(B0, B1, B2, B3);
156
157 B0 = B0.rev_words();
158 B1 = B1.rev_words();
159 B2 = B2.rev_words();
160 B3 = B3.rev_words();
161
162 for(size_t j = 0; j != 8; ++j) {
163 B0 ^= sm4_f(B1 ^ B2 ^ B3 ^ SIMD_T::splat(RK[32 - (4 * j + 1)]));
164 B1 ^= sm4_f(B2 ^ B3 ^ B0 ^ SIMD_T::splat(RK[32 - (4 * j + 2)]));
165 B2 ^= sm4_f(B3 ^ B0 ^ B1 ^ SIMD_T::splat(RK[32 - (4 * j + 3)]));
166 B3 ^= sm4_f(B0 ^ B1 ^ B2 ^ SIMD_T::splat(RK[32 - (4 * j + 4)]));
167 }
168
169 SIMD_T::transpose(B0, B1, B2, B3);
170
171 B3.rev_words().store_be(ptext);
172 B2.rev_words().store_be(ptext + 16 * M);
173 B1.rev_words().store_be(ptext + 16 * 2 * M);
174 B0.rev_words().store_be(ptext + 16 * 3 * M);
175}
176
177template <typename SIMD_T, size_t M>
178BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void decrypt_x2(const uint8_t ctext[32 * 4 * M],
179 uint8_t ptext[32 * 4 * M],
180 std::span<const uint32_t> RK) {
181 SIMD_T B0 = SIMD_T::load_be(ctext);
182 SIMD_T B1 = SIMD_T::load_be(ctext + 16 * M);
183 SIMD_T B2 = SIMD_T::load_be(ctext + 16 * 2 * M);
184 SIMD_T B3 = SIMD_T::load_be(ctext + 16 * 3 * M);
185
186 SIMD_T B4 = SIMD_T::load_be(ctext + 16 * 4 * M);
187 SIMD_T B5 = SIMD_T::load_be(ctext + 16 * 5 * M);
188 SIMD_T B6 = SIMD_T::load_be(ctext + 16 * 6 * M);
189 SIMD_T B7 = SIMD_T::load_be(ctext + 16 * 7 * M);
190
191 SIMD_T::transpose(B0, B1, B2, B3);
192 SIMD_T::transpose(B4, B5, B6, B7);
193
194 B0 = B0.rev_words();
195 B1 = B1.rev_words();
196 B2 = B2.rev_words();
197 B3 = B3.rev_words();
198
199 B4 = B4.rev_words();
200 B5 = B5.rev_words();
201 B6 = B6.rev_words();
202 B7 = B7.rev_words();
203
204 for(size_t j = 0; j != 8; ++j) {
205 B0 ^= sm4_f(B1 ^ B2 ^ B3 ^ SIMD_T::splat(RK[32 - (4 * j + 1)]));
206 B4 ^= sm4_f(B5 ^ B6 ^ B7 ^ SIMD_T::splat(RK[32 - (4 * j + 1)]));
207
208 B1 ^= sm4_f(B2 ^ B3 ^ B0 ^ SIMD_T::splat(RK[32 - (4 * j + 2)]));
209 B5 ^= sm4_f(B6 ^ B7 ^ B4 ^ SIMD_T::splat(RK[32 - (4 * j + 2)]));
210
211 B2 ^= sm4_f(B3 ^ B0 ^ B1 ^ SIMD_T::splat(RK[32 - (4 * j + 3)]));
212 B6 ^= sm4_f(B7 ^ B4 ^ B5 ^ SIMD_T::splat(RK[32 - (4 * j + 3)]));
213
214 B3 ^= sm4_f(B0 ^ B1 ^ B2 ^ SIMD_T::splat(RK[32 - (4 * j + 4)]));
215 B7 ^= sm4_f(B4 ^ B5 ^ B6 ^ SIMD_T::splat(RK[32 - (4 * j + 4)]));
216 }
217
218 SIMD_T::transpose(B0, B1, B2, B3);
219 SIMD_T::transpose(B4, B5, B6, B7);
220
221 B3.rev_words().store_be(ptext);
222 B2.rev_words().store_be(ptext + 16 * M);
223 B1.rev_words().store_be(ptext + 16 * 2 * M);
224 B0.rev_words().store_be(ptext + 16 * 3 * M);
225
226 B7.rev_words().store_be(ptext + 16 * 4 * M);
227 B6.rev_words().store_be(ptext + 16 * 5 * M);
228 B5.rev_words().store_be(ptext + 16 * 6 * M);
229 B4.rev_words().store_be(ptext + 16 * 7 * M);
230}
231
232} // namespace
233
234} // namespace SM4_AVX512_GFNI
235
236void BOTAN_FN_ISA_AVX512_GFNI SM4::sm4_avx512_gfni_encrypt(const uint8_t ptext[],
237 uint8_t ctext[],
238 size_t blocks) const {
239 while(blocks >= 32) {
240 SM4_AVX512_GFNI::encrypt_x2<SIMD_16x32, 4>(ptext, ctext, m_RK);
241 ptext += 16 * 32;
242 ctext += 16 * 32;
243 blocks -= 32;
244 }
245
246 while(blocks >= 16) {
247 SM4_AVX512_GFNI::encrypt<SIMD_16x32, 4>(ptext, ctext, m_RK);
248 ptext += 16 * 16;
249 ctext += 16 * 16;
250 blocks -= 16;
251 }
252
253 while(blocks >= 8) {
254 SM4_AVX512_GFNI::encrypt<SIMD_8x32, 2>(ptext, ctext, m_RK);
255 ptext += 16 * 8;
256 ctext += 16 * 8;
257 blocks -= 8;
258 }
259
260 if(blocks > 0) {
261 uint8_t pbuf[16 * 8] = {0};
262 uint8_t cbuf[16 * 8] = {0};
263 copy_mem(pbuf, ptext, blocks * 16);
264 SM4_AVX512_GFNI::encrypt<SIMD_8x32, 2>(pbuf, cbuf, m_RK);
265 copy_mem(ctext, cbuf, blocks * 16);
266 }
267}
268
269void BOTAN_FN_ISA_AVX512_GFNI SM4::sm4_avx512_gfni_decrypt(const uint8_t ctext[],
270 uint8_t ptext[],
271 size_t blocks) const {
272 while(blocks >= 32) {
273 SM4_AVX512_GFNI::decrypt_x2<SIMD_16x32, 4>(ctext, ptext, m_RK);
274 ptext += 16 * 32;
275 ctext += 16 * 32;
276 blocks -= 32;
277 }
278
279 while(blocks >= 16) {
280 SM4_AVX512_GFNI::decrypt<SIMD_16x32, 4>(ctext, ptext, m_RK);
281 ptext += 16 * 16;
282 ctext += 16 * 16;
283 blocks -= 16;
284 }
285
286 while(blocks >= 8) {
287 SM4_AVX512_GFNI::decrypt<SIMD_8x32, 2>(ctext, ptext, m_RK);
288 ptext += 16 * 8;
289 ctext += 16 * 8;
290 blocks -= 8;
291 }
292
293 if(blocks > 0) {
294 uint8_t cbuf[16 * 8] = {0};
295 uint8_t pbuf[16 * 8] = {0};
296 copy_mem(cbuf, ctext, blocks * 16);
297 SM4_AVX512_GFNI::decrypt<SIMD_8x32, 2>(cbuf, pbuf, m_RK);
298 copy_mem(ptext, pbuf, blocks * 16);
299 }
300}
301
302} // namespace Botan
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
constexpr void copy_mem(T *out, const T *in, size_t n)
Definition mem_ops.h:144
consteval uint64_t gfni_matrix(std::string_view s)
Definition gfni_utils.h:17
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX2_GFNI SIMD_8x32 gf2p8affineinv(const SIMD_8x32 &x)
BOTAN_FORCE_INLINE constexpr T rotl(T input)
Definition rotate.h:23
BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX2_GFNI SIMD_8x32 gf2p8affine(const SIMD_8x32 &x)