// Thin wrapper over the _mm512_gf2p8affine_epi64_epi8 intrinsic.
// The template argument A is broadcast with _mm512_set1_epi64 and passed as
// the second operand, B is passed as the immediate third operand, and the
// intrinsic's result is wrapped in a SIMD_16x32 before being returned.
//
// NOTE(review): the line carrying this function's name, return type and
// parameter list is not visible in this listing (the fused numbering jumps
// from 20 to 22). `x` is presumably a SIMD_16x32 - confirm.
20template <u
int64_t A, u
int8_t B>
22 return SIMD_16x32(_mm512_gf2p8affine_epi64_epi8(x.
raw(), _mm512_set1_epi64(A), B));
// Thin wrapper over the _mm512_gf2p8affineinv_epi64_epi8 intrinsic.
// The template argument A is broadcast with _mm512_set1_epi64 and passed as
// the second operand, B is passed as the immediate third operand, and the
// intrinsic's result is wrapped in a SIMD_16x32 before being returned.
//
// NOTE(review): the line carrying this function's name, return type and
// parameter list is not visible in this listing (the fused numbering jumps
// from 25 to 27). `x` is presumably a SIMD_16x32 - confirm.
25template <u
int64_t A, u
int8_t B>
27 return SIMD_16x32(_mm512_gf2p8affineinv_epi64_epi8(x.
raw(), _mm512_set1_epi64(A), B));
// NOTE(review): only fragments of this template are visible. The fused
// numbering runs 30, 46, 58, so the signature and most of the body are
// missing from this listing. It is presumably the sm4_sbox helper that is
// called further down - confirm against the original file.
30template <
typename SIMD_T>
// Compile-time byte constant 0b00111110 (0x3E). How it is consumed is not
// visible here; presumably the constant term of a "pre" affine step - confirm.
46 constexpr uint8_t pre_c = 0b00111110;
// Compile-time byte constant 0b11010011 (0xD3). How it is consumed is not
// visible here; presumably the constant term of a "post" affine step - confirm.
58 constexpr uint8_t post_c = 0b11010011;
// NOTE(review): only a fragment of this template is visible. The fused
// numbering runs 64, 66 and the next definition starts at 70, so the
// signature and whatever follows the line below are missing. It is presumably
// the sm4_f helper used by encrypt/decrypt - confirm.
64template <
typename SIMD_T>
// Run x through sm4_sbox and keep the result as sx; what is then done with sx
// is not visible in this listing.
66 const auto sx = sm4_sbox(x);
/*
* Processes four SIMD_T registers of input using the round-key words in RK
* and writes four registers of output.
*
* ptext and ctext are declared as 16 * 4 * M byte arrays; registers are
* loaded from / stored to byte offsets 0, 16*M, 32*M and 48*M.
* RK is indexed from 0 up to 31 in ascending order, so it must hold at least
* 32 words (no bounds check is performed in the visible code).
*
* NOTE(review): the number leading each line looks like an original line
* number fused into the text, and that numbering has gaps (79 -> 86,
* 90 -> 93, 98 -> 101). Whatever sat on the skipped lines - presumably
* including the closing braces of the loop and of the function, and possibly
* further statements - is not visible here. Confirm against the original
* file before relying on this listing.
*/
70template <
typename SIMD_T,
size_t M>
71BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI
void encrypt(
const uint8_t ptext[16 * 4 * M],
72 uint8_t ctext[16 * 4 * M],
73 std::span<const uint32_t> RK) {
// Load four registers from consecutive 16*M-byte chunks of the input.
74 SIMD_T B0 = SIMD_T::load_be(ptext);
75 SIMD_T B1 = SIMD_T::load_be(ptext + 16 * M);
76 SIMD_T B2 = SIMD_T::load_be(ptext + 16 * 2 * M);
77 SIMD_T B3 = SIMD_T::load_be(ptext + 16 * 3 * M);
// Transpose the four registers as a group before the round loop.
79 SIMD_T::transpose(B0, B1, B2, B3);
// 8 iterations x 4 updates = 32 updates, one round-key word per update.
// Each update XORs into one register the result of sm4_f applied to the XOR
// of the other three registers and the broadcast (splat) round-key word.
86 for(
size_t j = 0; j != 8; ++j) {
87 B0 ^= sm4_f(B1 ^ B2 ^ B3 ^ SIMD_T::splat(RK[4 * j]));
88 B1 ^= sm4_f(B2 ^ B3 ^ B0 ^ SIMD_T::splat(RK[4 * j + 1]));
89 B2 ^= sm4_f(B3 ^ B0 ^ B1 ^ SIMD_T::splat(RK[4 * j + 2]));
90 B3 ^= sm4_f(B0 ^ B1 ^ B2 ^ SIMD_T::splat(RK[4 * j + 3]));
// Transpose again after the visible round updates.
93 SIMD_T::transpose(B0, B1, B2, B3);
// Store in reversed register order (B3 first, B0 last), applying rev_words()
// to each register before store_be.
95 B3.rev_words().store_be(ctext);
96 B2.rev_words().store_be(ctext + 16 * M);
97 B1.rev_words().store_be(ctext + 16 * 2 * M);
98 B0.rev_words().store_be(ctext + 16 * 3 * M);
/*
* Two-group variant of encrypt: processes eight SIMD_T registers, handled as
* two independent groups of four (B0..B3 and B4..B7), using the round-key
* words in RK.
*
* ptext and ctext are declared as 32 * 4 * M byte arrays; registers are
* loaded from / stored to byte offsets k * 16 * M for k = 0..7.
* RK is indexed from 0 up to 31 in ascending order, so it must hold at least
* 32 words (no bounds check is performed in the visible code).
*
* NOTE(review): the number leading each line looks like an original line
* number fused into the text, and that numbering has gaps (116 -> 128,
* 139 -> 142, 153 -> 156). Whatever sat on the skipped lines - presumably
* including the closing braces of the loop and of the function, and possibly
* further statements - is not visible here. Confirm against the original
* file before relying on this listing.
*/
101template <
typename SIMD_T,
size_t M>
102BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI
void encrypt_x2(
const uint8_t ptext[32 * 4 * M],
103 uint8_t ctext[32 * 4 * M],
104 std::span<const uint32_t> RK) {
// First group of four registers: input chunks 0..3.
105 SIMD_T B0 = SIMD_T::load_be(ptext);
106 SIMD_T B1 = SIMD_T::load_be(ptext + 16 * M);
107 SIMD_T B2 = SIMD_T::load_be(ptext + 16 * 2 * M);
108 SIMD_T B3 = SIMD_T::load_be(ptext + 16 * 3 * M);
// Second group of four registers: input chunks 4..7.
110 SIMD_T B4 = SIMD_T::load_be(ptext + 16 * 4 * M);
111 SIMD_T B5 = SIMD_T::load_be(ptext + 16 * 5 * M);
112 SIMD_T B6 = SIMD_T::load_be(ptext + 16 * 6 * M);
113 SIMD_T B7 = SIMD_T::load_be(ptext + 16 * 7 * M);
// Each group is transposed on its own.
115 SIMD_T::transpose(B0, B1, B2, B3);
116 SIMD_T::transpose(B4, B5, B6, B7);
// 8 iterations x 4 round-key words. The two groups never read each other's
// registers; their updates are interleaved pairwise, each pair using the
// same round-key word.
128 for(
size_t j = 0; j != 8; ++j) {
129 B0 ^= sm4_f(B1 ^ B2 ^ B3 ^ SIMD_T::splat(RK[4 * j]));
130 B4 ^= sm4_f(B5 ^ B6 ^ B7 ^ SIMD_T::splat(RK[4 * j]));
132 B1 ^= sm4_f(B2 ^ B3 ^ B0 ^ SIMD_T::splat(RK[4 * j + 1]));
133 B5 ^= sm4_f(B6 ^ B7 ^ B4 ^ SIMD_T::splat(RK[4 * j + 1]));
135 B2 ^= sm4_f(B3 ^ B0 ^ B1 ^ SIMD_T::splat(RK[4 * j + 2]));
136 B6 ^= sm4_f(B7 ^ B4 ^ B5 ^ SIMD_T::splat(RK[4 * j + 2]));
138 B3 ^= sm4_f(B0 ^ B1 ^ B2 ^ SIMD_T::splat(RK[4 * j + 3]));
139 B7 ^= sm4_f(B4 ^ B5 ^ B6 ^ SIMD_T::splat(RK[4 * j + 3]));
// Transpose each group again after the visible round updates.
142 SIMD_T::transpose(B0, B1, B2, B3);
143 SIMD_T::transpose(B4, B5, B6, B7);
// First group: stored in reversed register order (B3 first) to chunks 0..3.
145 B3.rev_words().store_be(ctext);
146 B2.rev_words().store_be(ctext + 16 * M);
147 B1.rev_words().store_be(ctext + 16 * 2 * M);
148 B0.rev_words().store_be(ctext + 16 * 3 * M);
// Second group: stored in reversed register order (B7 first) to chunks 4..7.
150 B7.rev_words().store_be(ctext + 16 * 4 * M);
151 B6.rev_words().store_be(ctext + 16 * 5 * M);
152 B5.rev_words().store_be(ctext + 16 * 6 * M);
153 B4.rev_words().store_be(ctext + 16 * 7 * M);
/*
* Processes four SIMD_T registers of input using the round-key words in RK
* and writes four registers of output. The visible statements match encrypt
* except that the round-key words are consumed in descending order.
*
* ctext and ptext are declared as 16 * 4 * M byte arrays; registers are
* loaded from / stored to byte offsets 0, 16*M, 32*M and 48*M.
* RK is indexed from 31 down to 0, so it must hold at least 32 words (no
* bounds check is performed in the visible code).
*
* NOTE(review): the number leading each line looks like an original line
* number fused into the text, and that numbering has gaps (165 -> 172,
* 176 -> 179, 184 -> 187). Whatever sat on the skipped lines - presumably
* including the closing braces of the loop and of the function, and possibly
* further statements - is not visible here. Confirm against the original
* file before relying on this listing.
*/
156template <
typename SIMD_T,
size_t M>
157BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI
void decrypt(
const uint8_t ctext[16 * 4 * M],
158 uint8_t ptext[16 * 4 * M],
159 std::span<const uint32_t> RK) {
// Load four registers from consecutive 16*M-byte chunks of the input.
160 SIMD_T B0 = SIMD_T::load_be(ctext);
161 SIMD_T B1 = SIMD_T::load_be(ctext + 16 * M);
162 SIMD_T B2 = SIMD_T::load_be(ctext + 16 * 2 * M);
163 SIMD_T B3 = SIMD_T::load_be(ctext + 16 * 3 * M);
// Transpose the four registers as a group before the round loop.
165 SIMD_T::transpose(B0, B1, B2, B3);
// 8 iterations x 4 updates. The index 32 - (4*j + k) for k = 1..4 walks RK
// from 31 (j = 0, k = 1) down to 0 (j = 7, k = 4).
172 for(
size_t j = 0; j != 8; ++j) {
173 B0 ^= sm4_f(B1 ^ B2 ^ B3 ^ SIMD_T::splat(RK[32 - (4 * j + 1)]));
174 B1 ^= sm4_f(B2 ^ B3 ^ B0 ^ SIMD_T::splat(RK[32 - (4 * j + 2)]));
175 B2 ^= sm4_f(B3 ^ B0 ^ B1 ^ SIMD_T::splat(RK[32 - (4 * j + 3)]));
176 B3 ^= sm4_f(B0 ^ B1 ^ B2 ^ SIMD_T::splat(RK[32 - (4 * j + 4)]));
// Transpose again after the visible round updates.
179 SIMD_T::transpose(B0, B1, B2, B3);
// Store in reversed register order (B3 first, B0 last), applying rev_words()
// to each register before store_be.
181 B3.rev_words().store_be(ptext);
182 B2.rev_words().store_be(ptext + 16 * M);
183 B1.rev_words().store_be(ptext + 16 * 2 * M);
184 B0.rev_words().store_be(ptext + 16 * 3 * M);
/*
* Two-group variant of decrypt: processes eight SIMD_T registers, handled as
* two independent groups of four (B0..B3 and B4..B7), using the round-key
* words in RK in descending order.
*
* ctext and ptext are declared as 32 * 4 * M byte arrays; registers are
* loaded from / stored to byte offsets k * 16 * M for k = 0..7.
* RK is indexed from 31 down to 0, so it must hold at least 32 words (no
* bounds check is performed in the visible code).
*
* NOTE(review): the number leading each line looks like an original line
* number fused into the text, and that numbering has gaps (202 -> 214,
* 225 -> 228). Whatever sat on the skipped lines - presumably including the
* closing braces of the loop and of the function, and possibly further
* statements - is not visible here. Confirm against the original file before
* relying on this listing.
*/
187template <
typename SIMD_T,
size_t M>
188BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI
void decrypt_x2(
const uint8_t ctext[32 * 4 * M],
189 uint8_t ptext[32 * 4 * M],
190 std::span<const uint32_t> RK) {
// First group of four registers: input chunks 0..3.
191 SIMD_T B0 = SIMD_T::load_be(ctext);
192 SIMD_T B1 = SIMD_T::load_be(ctext + 16 * M);
193 SIMD_T B2 = SIMD_T::load_be(ctext + 16 * 2 * M);
194 SIMD_T B3 = SIMD_T::load_be(ctext + 16 * 3 * M);
// Second group of four registers: input chunks 4..7.
196 SIMD_T B4 = SIMD_T::load_be(ctext + 16 * 4 * M);
197 SIMD_T B5 = SIMD_T::load_be(ctext + 16 * 5 * M);
198 SIMD_T B6 = SIMD_T::load_be(ctext + 16 * 6 * M);
199 SIMD_T B7 = SIMD_T::load_be(ctext + 16 * 7 * M);
// Each group is transposed on its own.
201 SIMD_T::transpose(B0, B1, B2, B3);
202 SIMD_T::transpose(B4, B5, B6, B7);
// 8 iterations x 4 round-key words, walking RK from index 31 down to 0.
// The two groups never read each other's registers; their updates are
// interleaved pairwise, each pair using the same round-key word.
214 for(
size_t j = 0; j != 8; ++j) {
215 B0 ^= sm4_f(B1 ^ B2 ^ B3 ^ SIMD_T::splat(RK[32 - (4 * j + 1)]));
216 B4 ^= sm4_f(B5 ^ B6 ^ B7 ^ SIMD_T::splat(RK[32 - (4 * j + 1)]));
218 B1 ^= sm4_f(B2 ^ B3 ^ B0 ^ SIMD_T::splat(RK[32 - (4 * j + 2)]));
219 B5 ^= sm4_f(B6 ^ B7 ^ B4 ^ SIMD_T::splat(RK[32 - (4 * j + 2)]));
221 B2 ^= sm4_f(B3 ^ B0 ^ B1 ^ SIMD_T::splat(RK[32 - (4 * j + 3)]));
222 B6 ^= sm4_f(B7 ^ B4 ^ B5 ^ SIMD_T::splat(RK[32 - (4 * j + 3)]));
224 B3 ^= sm4_f(B0 ^ B1 ^ B2 ^ SIMD_T::splat(RK[32 - (4 * j + 4)]));
225 B7 ^= sm4_f(B4 ^ B5 ^ B6 ^ SIMD_T::splat(RK[32 - (4 * j + 4)]));
// Transpose each group again after the visible round updates.
228 SIMD_T::transpose(B0, B1, B2, B3);
229 SIMD_T::transpose(B4, B5, B6, B7);
// First group: stored in reversed register order (B3 first) to chunks 0..3.
231 B3.rev_words().store_be(ptext);
232 B2.rev_words().store_be(ptext + 16 * M);
233 B1.rev_words().store_be(ptext + 16 * 2 * M);
234 B0.rev_words().store_be(ptext + 16 * 3 * M);
// Second group: stored in reversed register order (B7 first) to chunks 4..7.
236 B7.rev_words().store_be(ptext + 16 * 4 * M);
237 B6.rev_words().store_be(ptext + 16 * 5 * M);
238 B5.rev_words().store_be(ptext + 16 * 6 * M);
239 B4.rev_words().store_be(ptext + 16 * 7 * M);