20template <
typename SIMD_T>
36 constexpr uint8_t pre_c = 0b00111110;
48 constexpr uint8_t post_c = 0b11010011;
54template <
typename SIMD_T>
56 const auto sx = sm4_sbox(x);
// Transforms one group of four SIMD_T vectors (16 * 4 * M bytes) read from
// ptext and writes the result to ctext, consuming round-key words RK[0]
// through RK[31] in ascending order.
//
// Template parameters:
//   SIMD_T - vector type providing load_be, transpose, splat, rev_words and
//            store_be as used below.
//   M      - scales the byte stride (16 * M) between consecutive vectors.
//            NOTE(review): presumably the number of 16-byte blocks held by one
//            SIMD_T value -- confirm against the callers.
//
// RK is indexed directly up to RK[31]; the caller must supply at least 32 words.
60template <
typename SIMD_T,
size_t M>
61BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI
void encrypt(
const uint8_t ptext[16 * 4 * M],
62 uint8_t ctext[16 * 4 * M],
63 std::span<const uint32_t> RK) {
// Load four vectors located 16 * M bytes apart in the input.
64 SIMD_T B0 = SIMD_T::load_be(ptext);
65 SIMD_T B1 = SIMD_T::load_be(ptext + 16 * M);
66 SIMD_T B2 = SIMD_T::load_be(ptext + 16 * 2 * M);
67 SIMD_T B3 = SIMD_T::load_be(ptext + 16 * 3 * M);
// NOTE(review): presumably regroups the data so that each vector carries the
// same word position of every block -- confirm against SIMD_T::transpose.
69 SIMD_T::transpose(B0, B1, B2, B3);
// Eight iterations of four round updates (32 in total). Each update XORs the
// three other vectors with the splat of one round-key word, passes that through
// sm4_f, and XORs the result into the target vector. The order B0, B1, B2, B3
// matters: every update reads vectors modified by the previous ones.
76 for(
size_t j = 0; j != 8; ++j) {
77 B0 ^= sm4_f(B1 ^ B2 ^ B3 ^ SIMD_T::splat(RK[4 * j]));
78 B1 ^= sm4_f(B2 ^ B3 ^ B0 ^ SIMD_T::splat(RK[4 * j + 1]));
79 B2 ^= sm4_f(B3 ^ B0 ^ B1 ^ SIMD_T::splat(RK[4 * j + 2]));
80 B3 ^= sm4_f(B0 ^ B1 ^ B2 ^ SIMD_T::splat(RK[4 * j + 3]));
// Second transpose, mirroring the one applied after loading.
83 SIMD_T::transpose(B0, B1, B2, B3);
// The vectors are written in reverse order (B3 first, B0 last), each after
// rev_words(), at the same 16 * M byte stride used for loading.
85 B3.rev_words().store_be(ctext);
86 B2.rev_words().store_be(ctext + 16 * M);
87 B1.rev_words().store_be(ctext + 16 * 2 * M);
88 B0.rev_words().store_be(ctext + 16 * 3 * M);
// Double-width variant of the single-group encrypt routine: transforms two
// groups of four SIMD_T vectors (32 * 4 * M bytes in total) from ptext into
// ctext, consuming round-key words RK[0] through RK[31] in ascending order.
// Both groups use the same round-key word in every round.
//
// Template parameters:
//   SIMD_T - vector type providing load_be, transpose, splat, rev_words and
//            store_be as used below.
//   M      - scales the byte stride (16 * M) between consecutive vectors.
//            NOTE(review): presumably the number of 16-byte blocks held by one
//            SIMD_T value -- confirm against the callers.
//
// RK is indexed directly up to RK[31]; the caller must supply at least 32 words.
91template <
typename SIMD_T,
size_t M>
92BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI
void encrypt_x2(
const uint8_t ptext[32 * 4 * M],
93 uint8_t ctext[32 * 4 * M],
94 std::span<const uint32_t> RK) {
// First group: vectors 0..3 of the input, 16 * M bytes apart.
95 SIMD_T B0 = SIMD_T::load_be(ptext);
96 SIMD_T B1 = SIMD_T::load_be(ptext + 16 * M);
97 SIMD_T B2 = SIMD_T::load_be(ptext + 16 * 2 * M);
98 SIMD_T B3 = SIMD_T::load_be(ptext + 16 * 3 * M);
// Second group: vectors 4..7 of the input, continuing at the same stride.
100 SIMD_T B4 = SIMD_T::load_be(ptext + 16 * 4 * M);
101 SIMD_T B5 = SIMD_T::load_be(ptext + 16 * 5 * M);
102 SIMD_T B6 = SIMD_T::load_be(ptext + 16 * 6 * M);
103 SIMD_T B7 = SIMD_T::load_be(ptext + 16 * 7 * M);
// Each group is transposed on its own; the groups never mix.
105 SIMD_T::transpose(B0, B1, B2, B3);
106 SIMD_T::transpose(B4, B5, B6, B7);
// Eight iterations of four round updates per group. The matching updates of the
// two groups are interleaved and are independent of one another.
// NOTE(review): the interleaving is presumably there to give the CPU two
// independent dependency chains -- confirm with benchmarks.
118 for(
size_t j = 0; j != 8; ++j) {
119 B0 ^= sm4_f(B1 ^ B2 ^ B3 ^ SIMD_T::splat(RK[4 * j]));
120 B4 ^= sm4_f(B5 ^ B6 ^ B7 ^ SIMD_T::splat(RK[4 * j]));
122 B1 ^= sm4_f(B2 ^ B3 ^ B0 ^ SIMD_T::splat(RK[4 * j + 1]));
123 B5 ^= sm4_f(B6 ^ B7 ^ B4 ^ SIMD_T::splat(RK[4 * j + 1]));
125 B2 ^= sm4_f(B3 ^ B0 ^ B1 ^ SIMD_T::splat(RK[4 * j + 2]));
126 B6 ^= sm4_f(B7 ^ B4 ^ B5 ^ SIMD_T::splat(RK[4 * j + 2]));
128 B3 ^= sm4_f(B0 ^ B1 ^ B2 ^ SIMD_T::splat(RK[4 * j + 3]));
129 B7 ^= sm4_f(B4 ^ B5 ^ B6 ^ SIMD_T::splat(RK[4 * j + 3]));
// Second transpose of each group, mirroring the ones applied after loading.
132 SIMD_T::transpose(B0, B1, B2, B3);
133 SIMD_T::transpose(B4, B5, B6, B7);
// Within each group the vectors are written in reverse order (B3..B0, then
// B7..B4), each after rev_words(); the first group fills output slots 0..3.
135 B3.rev_words().store_be(ctext);
136 B2.rev_words().store_be(ctext + 16 * M);
137 B1.rev_words().store_be(ctext + 16 * 2 * M);
138 B0.rev_words().store_be(ctext + 16 * 3 * M);
// The second group fills output slots 4..7.
140 B7.rev_words().store_be(ctext + 16 * 4 * M);
141 B6.rev_words().store_be(ctext + 16 * 5 * M);
142 B5.rev_words().store_be(ctext + 16 * 6 * M);
143 B4.rev_words().store_be(ctext + 16 * 7 * M);
// Transforms one group of four SIMD_T vectors (16 * 4 * M bytes) read from
// ctext and writes the result to ptext. The data flow is the same as in the
// encrypt routine, except that the round-key words are consumed in descending
// order, from RK[31] down to RK[0].
//
// Template parameters:
//   SIMD_T - vector type providing load_be, transpose, splat, rev_words and
//            store_be as used below.
//   M      - scales the byte stride (16 * M) between consecutive vectors.
//            NOTE(review): presumably the number of 16-byte blocks held by one
//            SIMD_T value -- confirm against the callers.
//
// RK is indexed directly up to RK[31]; the caller must supply at least 32 words.
146template <
typename SIMD_T,
size_t M>
147BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI
void decrypt(
const uint8_t ctext[16 * 4 * M],
148 uint8_t ptext[16 * 4 * M],
149 std::span<const uint32_t> RK) {
// Load four vectors located 16 * M bytes apart in the input.
150 SIMD_T B0 = SIMD_T::load_be(ctext);
151 SIMD_T B1 = SIMD_T::load_be(ctext + 16 * M);
152 SIMD_T B2 = SIMD_T::load_be(ctext + 16 * 2 * M);
153 SIMD_T B3 = SIMD_T::load_be(ctext + 16 * 3 * M);
// NOTE(review): presumably regroups the data so that each vector carries the
// same word position of every block -- confirm against SIMD_T::transpose.
155 SIMD_T::transpose(B0, B1, B2, B3);
// Eight iterations of four round updates. The index 32 - (4 * j + k) for
// k = 1..4 starts at RK[31] when j == 0 and ends at RK[0] when j == 7.
162 for(
size_t j = 0; j != 8; ++j) {
163 B0 ^= sm4_f(B1 ^ B2 ^ B3 ^ SIMD_T::splat(RK[32 - (4 * j + 1)]));
164 B1 ^= sm4_f(B2 ^ B3 ^ B0 ^ SIMD_T::splat(RK[32 - (4 * j + 2)]));
165 B2 ^= sm4_f(B3 ^ B0 ^ B1 ^ SIMD_T::splat(RK[32 - (4 * j + 3)]));
166 B3 ^= sm4_f(B0 ^ B1 ^ B2 ^ SIMD_T::splat(RK[32 - (4 * j + 4)]));
// Second transpose, mirroring the one applied after loading.
169 SIMD_T::transpose(B0, B1, B2, B3);
// The vectors are written in reverse order (B3 first, B0 last), each after
// rev_words(), at the same 16 * M byte stride used for loading.
171 B3.rev_words().store_be(ptext);
172 B2.rev_words().store_be(ptext + 16 * M);
173 B1.rev_words().store_be(ptext + 16 * 2 * M);
174 B0.rev_words().store_be(ptext + 16 * 3 * M);
// Double-width variant of the single-group decrypt routine: transforms two
// groups of four SIMD_T vectors (32 * 4 * M bytes in total) from ctext into
// ptext, consuming the round-key words in descending order, from RK[31] down to
// RK[0]. Both groups use the same round-key word in every round.
//
// Template parameters:
//   SIMD_T - vector type providing load_be, transpose, splat, rev_words and
//            store_be as used below.
//   M      - scales the byte stride (16 * M) between consecutive vectors.
//            NOTE(review): presumably the number of 16-byte blocks held by one
//            SIMD_T value -- confirm against the callers.
//
// RK is indexed directly up to RK[31]; the caller must supply at least 32 words.
177template <
typename SIMD_T,
size_t M>
178BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI
void decrypt_x2(
const uint8_t ctext[32 * 4 * M],
179 uint8_t ptext[32 * 4 * M],
180 std::span<const uint32_t> RK) {
// First group: vectors 0..3 of the input, 16 * M bytes apart.
181 SIMD_T B0 = SIMD_T::load_be(ctext);
182 SIMD_T B1 = SIMD_T::load_be(ctext + 16 * M);
183 SIMD_T B2 = SIMD_T::load_be(ctext + 16 * 2 * M);
184 SIMD_T B3 = SIMD_T::load_be(ctext + 16 * 3 * M);
// Second group: vectors 4..7 of the input, continuing at the same stride.
186 SIMD_T B4 = SIMD_T::load_be(ctext + 16 * 4 * M);
187 SIMD_T B5 = SIMD_T::load_be(ctext + 16 * 5 * M);
188 SIMD_T B6 = SIMD_T::load_be(ctext + 16 * 6 * M);
189 SIMD_T B7 = SIMD_T::load_be(ctext + 16 * 7 * M);
// Each group is transposed on its own; the groups never mix.
191 SIMD_T::transpose(B0, B1, B2, B3);
192 SIMD_T::transpose(B4, B5, B6, B7);
// Eight iterations of four round updates per group, interleaved between the two
// independent groups. The index 32 - (4 * j + k) for k = 1..4 starts at RK[31]
// when j == 0 and ends at RK[0] when j == 7.
204 for(
size_t j = 0; j != 8; ++j) {
205 B0 ^= sm4_f(B1 ^ B2 ^ B3 ^ SIMD_T::splat(RK[32 - (4 * j + 1)]));
206 B4 ^= sm4_f(B5 ^ B6 ^ B7 ^ SIMD_T::splat(RK[32 - (4 * j + 1)]));
208 B1 ^= sm4_f(B2 ^ B3 ^ B0 ^ SIMD_T::splat(RK[32 - (4 * j + 2)]));
209 B5 ^= sm4_f(B6 ^ B7 ^ B4 ^ SIMD_T::splat(RK[32 - (4 * j + 2)]));
211 B2 ^= sm4_f(B3 ^ B0 ^ B1 ^ SIMD_T::splat(RK[32 - (4 * j + 3)]));
212 B6 ^= sm4_f(B7 ^ B4 ^ B5 ^ SIMD_T::splat(RK[32 - (4 * j + 3)]));
214 B3 ^= sm4_f(B0 ^ B1 ^ B2 ^ SIMD_T::splat(RK[32 - (4 * j + 4)]));
215 B7 ^= sm4_f(B4 ^ B5 ^ B6 ^ SIMD_T::splat(RK[32 - (4 * j + 4)]));
// Second transpose of each group, mirroring the ones applied after loading.
218 SIMD_T::transpose(B0, B1, B2, B3);
219 SIMD_T::transpose(B4, B5, B6, B7);
// Within each group the vectors are written in reverse order (B3..B0, then
// B7..B4), each after rev_words(); the first group fills output slots 0..3.
221 B3.rev_words().store_be(ptext);
222 B2.rev_words().store_be(ptext + 16 * M);
223 B1.rev_words().store_be(ptext + 16 * 2 * M);
224 B0.rev_words().store_be(ptext + 16 * 3 * M);
// The second group fills output slots 4..7.
226 B7.rev_words().store_be(ptext + 16 * 4 * M);
227 B6.rev_words().store_be(ptext + 16 * 5 * M);
228 B5.rev_words().store_be(ptext + 16 * 6 * M);
229 B4.rev_words().store_be(ptext + 16 * 7 * M);