Botan 3.11.0
Crypto and TLS for C&&
sha2_32_avx2.cpp
Go to the documentation of this file.
1/*
2* (C) 2025 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/sha2_32.h>
8
9#include <botan/internal/isa_extn.h>
10#include <botan/internal/sha2_32_f.h>
11#include <botan/internal/simd_4x32.h>
12#include <botan/internal/simd_avx2.h>
13#include <botan/internal/stack_scrubbing.h>
14
15#include <immintrin.h>
16
17namespace Botan {
18
19namespace {
20
21BOTAN_FN_ISA_AVX2_BMI2 inline SIMD_4x32 alignr4(const SIMD_4x32& a, const SIMD_4x32& b) {
22 return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), 4));
23}
24
25template <size_t S>
26BOTAN_FN_ISA_AVX2_BMI2 inline SIMD_4x32 shr64(const SIMD_4x32& a) {
27 return SIMD_4x32(_mm_srli_epi64(a.raw(), S));
28}
29
30template <uint8_t S>
31BOTAN_FN_ISA_AVX2_BMI2 inline SIMD_4x32 shuffle_32(const SIMD_4x32& a) {
32 return SIMD_4x32(_mm_shuffle_epi32(a.raw(), S));
33}
34
35BOTAN_FN_ISA_AVX2_BMI2 inline SIMD_8x32 alignr4(const SIMD_8x32& a, const SIMD_8x32& b) {
36 return SIMD_8x32(_mm256_alignr_epi8(a.raw(), b.raw(), 4));
37}
38
39template <size_t S>
40BOTAN_FN_ISA_AVX2_BMI2 inline SIMD_8x32 shr64(const SIMD_8x32& a) {
41 return SIMD_8x32(_mm256_srli_epi64(a.raw(), S));
42}
43
44template <uint8_t S>
45BOTAN_FN_ISA_AVX2_BMI2 inline SIMD_8x32 shuffle_32(const SIMD_8x32& a) {
46 return SIMD_8x32(_mm256_shuffle_epi32(a.raw(), S));
47}
48
49template <typename SIMD_T>
50BOTAN_FN_ISA_AVX2_BMI2 BOTAN_FORCE_INLINE SIMD_T next_w(SIMD_T x[4]) {
51 constexpr size_t sigma0_0 = 7;
52 constexpr size_t sigma0_1 = 18;
53 constexpr size_t sigma0_2 = 3;
54 constexpr size_t sigma1_0 = 17;
55 constexpr size_t sigma1_1 = 19;
56 constexpr size_t sigma1_2 = 10;
57
58 const SIMD_T lo_mask = SIMD_T(0x03020100, 0x0b0a0908, 0x80808080, 0x80808080);
59 const SIMD_T hi_mask = SIMD_T(0x80808080, 0x80808080, 0x03020100, 0x0b0a0908);
60
61 auto t0 = alignr4(x[1], x[0]);
62 x[0] += alignr4(x[3], x[2]);
63
64 auto t1 = t0.template shl<32 - sigma0_1>();
65 auto t2 = t0.template shr<sigma0_0>();
66 auto t3 = t0.template shr<sigma0_2>();
67 t0 = t3 ^ t2;
68
69 t3 = shuffle_32<0b11111010>(x[3]);
70 t2 = t2.template shr<sigma0_1 - sigma0_0>();
71 t0 ^= t1 ^ t2;
72 t1 = t1.template shl<sigma0_1 - sigma0_0>();
73 t2 = t3.template shr<sigma1_2>();
74 t3 = shr64<sigma1_0>(t3);
75 x[0] += t0 ^ t1;
76
77 t2 ^= t3;
78 t3 = shr64<sigma1_1 - sigma1_0>(t3);
79 x[0] += SIMD_T::byte_shuffle(t2 ^ t3, lo_mask);
80
81 t3 = shuffle_32<0b01010000>(x[0]);
82 t2 = t3.template shr<sigma1_2>();
83 t3 = shr64<sigma1_0>(t3);
84 t2 ^= t3;
85 t3 = shr64<sigma1_1 - sigma1_0>(t3);
86 x[0] += SIMD_T::byte_shuffle(t2 ^ t3, hi_mask);
87
88 const auto tmp = x[0];
89 x[0] = x[1];
90 x[1] = x[2];
91 x[2] = x[3];
92 x[3] = tmp;
93
94 return x[3];
95}
96
97} // namespace
98
// SHA-256 compression function using AVX2 for the message schedule.
//
// Reads `blocks` consecutive 64 byte blocks starting at input.data() and folds
// each of them into `digest` (the eight 32-bit chaining words), in place.
//
// While at least two blocks remain they are handled as a pair: SIMD_8x32 is
// used to expand the message schedules of both blocks at once, the rounds of
// the first block are run interleaved with that expansion, and the rounds of
// the second block are run afterwards from the saved schedule. Any remaining
// block is handled alone using SIMD_4x32.
//
// NOTE(review): input.size() is not compared against 64 * blocks in this
// function; presumably the caller guarantees it - confirm at the call site.
99BOTAN_FN_ISA_AVX2_BMI2 BOTAN_SCRUB_STACK_AFTER_RETURN void SHA_256::compress_digest_x86_avx2(
100 digest_type& digest, std::span<const uint8_t> input, size_t blocks) {
101 // clang-format off
102
    // The 64 SHA-256 round constants. They are added to the message words
    // with SIMD as the schedule is produced, so the scalar rounds below are
    // handed a single combined value per round.
103 alignas(64) const uint32_t K[64] = {
104 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
105 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
106 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
107 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
108 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
109 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
110 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
111 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2};
112
113 // clang-format on
114
    // W:  rolling 16 entry window of (message word + K) for the block whose
    //     rounds are currently interleaved with schedule expansion.
    // W2: all 64 (message word + K) values for the second block of a pair,
    //     kept so its rounds can run after the first block is finished.
115 alignas(64) uint32_t W[16];
116 alignas(64) uint32_t W2[64];
117
    // Working variables, seeded from the current chaining value
118 uint32_t A = digest[0];
119 uint32_t B = digest[1];
120 uint32_t C = digest[2];
121 uint32_t D = digest[3];
122 uint32_t E = digest[4];
123 uint32_t F = digest[5];
124 uint32_t G = digest[6];
125 uint32_t H = digest[7];
126
127 const uint8_t* data = input.data();
128
    // Two blocks per iteration
129 while(blocks >= 2) {
130 SIMD_8x32 WS[4];
131
    // Load the 16 message words of both blocks and store word+K for rounds
    // 0..15: first block to W, second block to W2.
    // NOTE(review): presumably load_be128/store_le128 map their first
    // pointer to one 128-bit half and the second pointer to the other -
    // confirm against simd_avx2.h.
132 for(size_t i = 0; i < 4; i++) {
133 WS[i] = SIMD_8x32::load_be128(&data[16 * i], &data[64 + 16 * i]);
134 auto WK = WS[i] + SIMD_8x32::load_le128(&K[4 * i]);
135 WK.store_le128(&W[4 * i], &W2[4 * i]);
136 }
137
138 data += 2 * 64;
139 blocks -= 2;
140
    // Rounds 0..47 of the first block. Each group of four rounds consumes
    // four entries of W; only after they have been consumed are those
    // entries overwritten with the word+K values needed 16 rounds later,
    // while the second block's values are appended to W2.
141 for(size_t r = 0; r != 48; r += 16) {
142 auto w = next_w(WS) + SIMD_8x32::load_le128(&K[r + 16]);
143
144 SHA2_32_F(A, B, C, D, E, F, G, H, W[0]);
145 SHA2_32_F(H, A, B, C, D, E, F, G, W[1]);
146 SHA2_32_F(G, H, A, B, C, D, E, F, W[2]);
147 SHA2_32_F(F, G, H, A, B, C, D, E, W[3]);
148
149 w.store_le128(&W[0], &W2[r + 16]);
150
151 w = next_w(WS) + SIMD_8x32::load_le128(&K[r + 20]);
152
153 SHA2_32_F(E, F, G, H, A, B, C, D, W[4]);
154 SHA2_32_F(D, E, F, G, H, A, B, C, W[5]);
155 SHA2_32_F(C, D, E, F, G, H, A, B, W[6]);
156 SHA2_32_F(B, C, D, E, F, G, H, A, W[7]);
157
158 w.store_le128(&W[4], &W2[r + 20]);
159
160 w = next_w(WS) + SIMD_8x32::load_le128(&K[r + 24]);
161
162 SHA2_32_F(A, B, C, D, E, F, G, H, W[8]);
163 SHA2_32_F(H, A, B, C, D, E, F, G, W[9]);
164 SHA2_32_F(G, H, A, B, C, D, E, F, W[10]);
165 SHA2_32_F(F, G, H, A, B, C, D, E, W[11]);
166
167 w.store_le128(&W[8], &W2[r + 24]);
168
169 w = next_w(WS) + SIMD_8x32::load_le128(&K[r + 28]);
170
171 SHA2_32_F(E, F, G, H, A, B, C, D, W[12]);
172 SHA2_32_F(D, E, F, G, H, A, B, C, W[13]);
173 SHA2_32_F(C, D, E, F, G, H, A, B, W[14]);
174 SHA2_32_F(B, C, D, E, F, G, H, A, W[15]);
175
176 w.store_le128(&W[12], &W2[r + 28]);
177 }
178
    // Rounds 48..63 of the first block; W now holds the final 16 word+K
    // values and no further schedule expansion is required.
179 SHA2_32_F(A, B, C, D, E, F, G, H, W[0]);
180 SHA2_32_F(H, A, B, C, D, E, F, G, W[1]);
181 SHA2_32_F(G, H, A, B, C, D, E, F, W[2]);
182 SHA2_32_F(F, G, H, A, B, C, D, E, W[3]);
183 SHA2_32_F(E, F, G, H, A, B, C, D, W[4]);
184 SHA2_32_F(D, E, F, G, H, A, B, C, W[5]);
185 SHA2_32_F(C, D, E, F, G, H, A, B, W[6]);
186 SHA2_32_F(B, C, D, E, F, G, H, A, W[7]);
187 SHA2_32_F(A, B, C, D, E, F, G, H, W[8]);
188 SHA2_32_F(H, A, B, C, D, E, F, G, W[9]);
189 SHA2_32_F(G, H, A, B, C, D, E, F, W[10]);
190 SHA2_32_F(F, G, H, A, B, C, D, E, W[11]);
191 SHA2_32_F(E, F, G, H, A, B, C, D, W[12]);
192 SHA2_32_F(D, E, F, G, H, A, B, C, W[13]);
193 SHA2_32_F(C, D, E, F, G, H, A, B, W[14]);
194 SHA2_32_F(B, C, D, E, F, G, H, A, W[15]);
195
    // Fold the first block into the chaining value, and restart the working
    // variables from the updated chaining value for the second block.
196 A = (digest[0] += A);
197 B = (digest[1] += B);
198 C = (digest[2] += C);
199 D = (digest[3] += D);
200 E = (digest[4] += E);
201 F = (digest[5] += F);
202 G = (digest[6] += G);
203 H = (digest[7] += H);
204
205 // Now the second block, with already expanded message
206 SHA2_32_F(A, B, C, D, E, F, G, H, W2[0]);
207 SHA2_32_F(H, A, B, C, D, E, F, G, W2[1]);
208 SHA2_32_F(G, H, A, B, C, D, E, F, W2[2]);
209 SHA2_32_F(F, G, H, A, B, C, D, E, W2[3]);
210 SHA2_32_F(E, F, G, H, A, B, C, D, W2[4]);
211 SHA2_32_F(D, E, F, G, H, A, B, C, W2[5]);
212 SHA2_32_F(C, D, E, F, G, H, A, B, W2[6]);
213 SHA2_32_F(B, C, D, E, F, G, H, A, W2[7]);
214 SHA2_32_F(A, B, C, D, E, F, G, H, W2[8]);
215 SHA2_32_F(H, A, B, C, D, E, F, G, W2[9]);
216 SHA2_32_F(G, H, A, B, C, D, E, F, W2[10]);
217 SHA2_32_F(F, G, H, A, B, C, D, E, W2[11]);
218 SHA2_32_F(E, F, G, H, A, B, C, D, W2[12]);
219 SHA2_32_F(D, E, F, G, H, A, B, C, W2[13]);
220 SHA2_32_F(C, D, E, F, G, H, A, B, W2[14]);
221 SHA2_32_F(B, C, D, E, F, G, H, A, W2[15]);
222
223 SHA2_32_F(A, B, C, D, E, F, G, H, W2[16]);
224 SHA2_32_F(H, A, B, C, D, E, F, G, W2[17]);
225 SHA2_32_F(G, H, A, B, C, D, E, F, W2[18]);
226 SHA2_32_F(F, G, H, A, B, C, D, E, W2[19]);
227 SHA2_32_F(E, F, G, H, A, B, C, D, W2[20]);
228 SHA2_32_F(D, E, F, G, H, A, B, C, W2[21]);
229 SHA2_32_F(C, D, E, F, G, H, A, B, W2[22]);
230 SHA2_32_F(B, C, D, E, F, G, H, A, W2[23]);
231 SHA2_32_F(A, B, C, D, E, F, G, H, W2[24]);
232 SHA2_32_F(H, A, B, C, D, E, F, G, W2[25]);
233 SHA2_32_F(G, H, A, B, C, D, E, F, W2[26]);
234 SHA2_32_F(F, G, H, A, B, C, D, E, W2[27]);
235 SHA2_32_F(E, F, G, H, A, B, C, D, W2[28]);
236 SHA2_32_F(D, E, F, G, H, A, B, C, W2[29]);
237 SHA2_32_F(C, D, E, F, G, H, A, B, W2[30]);
238 SHA2_32_F(B, C, D, E, F, G, H, A, W2[31]);
239
240 SHA2_32_F(A, B, C, D, E, F, G, H, W2[32]);
241 SHA2_32_F(H, A, B, C, D, E, F, G, W2[33]);
242 SHA2_32_F(G, H, A, B, C, D, E, F, W2[34]);
243 SHA2_32_F(F, G, H, A, B, C, D, E, W2[35]);
244 SHA2_32_F(E, F, G, H, A, B, C, D, W2[36]);
245 SHA2_32_F(D, E, F, G, H, A, B, C, W2[37]);
246 SHA2_32_F(C, D, E, F, G, H, A, B, W2[38]);
247 SHA2_32_F(B, C, D, E, F, G, H, A, W2[39]);
248 SHA2_32_F(A, B, C, D, E, F, G, H, W2[40]);
249 SHA2_32_F(H, A, B, C, D, E, F, G, W2[41]);
250 SHA2_32_F(G, H, A, B, C, D, E, F, W2[42]);
251 SHA2_32_F(F, G, H, A, B, C, D, E, W2[43]);
252 SHA2_32_F(E, F, G, H, A, B, C, D, W2[44]);
253 SHA2_32_F(D, E, F, G, H, A, B, C, W2[45]);
254 SHA2_32_F(C, D, E, F, G, H, A, B, W2[46]);
255 SHA2_32_F(B, C, D, E, F, G, H, A, W2[47]);
256
257 SHA2_32_F(A, B, C, D, E, F, G, H, W2[48]);
258 SHA2_32_F(H, A, B, C, D, E, F, G, W2[49]);
259 SHA2_32_F(G, H, A, B, C, D, E, F, W2[50]);
260 SHA2_32_F(F, G, H, A, B, C, D, E, W2[51]);
261 SHA2_32_F(E, F, G, H, A, B, C, D, W2[52]);
262 SHA2_32_F(D, E, F, G, H, A, B, C, W2[53]);
263 SHA2_32_F(C, D, E, F, G, H, A, B, W2[54]);
264 SHA2_32_F(B, C, D, E, F, G, H, A, W2[55]);
265 SHA2_32_F(A, B, C, D, E, F, G, H, W2[56]);
266 SHA2_32_F(H, A, B, C, D, E, F, G, W2[57]);
267 SHA2_32_F(G, H, A, B, C, D, E, F, W2[58]);
268 SHA2_32_F(F, G, H, A, B, C, D, E, W2[59]);
269 SHA2_32_F(E, F, G, H, A, B, C, D, W2[60]);
270 SHA2_32_F(D, E, F, G, H, A, B, C, W2[61]);
271 SHA2_32_F(C, D, E, F, G, H, A, B, W2[62]);
272 SHA2_32_F(B, C, D, E, F, G, H, A, W2[63]);
273
    // Fold the second block into the chaining value
274 A = (digest[0] += A);
275 B = (digest[1] += B);
276 C = (digest[2] += C);
277 D = (digest[3] += D);
278 E = (digest[4] += E);
279 F = (digest[5] += F);
280 G = (digest[6] += G);
281 H = (digest[7] += H);
282 }
283
    // Leftover block (at most one can remain after the paired loop above):
    // same structure as the first block of a pair, but using 128-bit
    // vectors and no W2.
284 while(blocks > 0) {
285 SIMD_4x32 WS[4];
286
    // Load the 16 message words and store word+K for rounds 0..15
287 for(size_t i = 0; i < 4; i++) {
288 WS[i] = SIMD_4x32::load_be(&data[16 * i]);
289 auto WK = WS[i] + SIMD_4x32::load_le(&K[4 * i]);
290 WK.store_le(&W[4 * i]);
291 }
292
293 data += 64;
294 blocks -= 1;
295
    // Rounds 0..47, interleaved with schedule expansion; each quarter of W
    // is overwritten only after the four rounds reading it have run.
296 for(size_t r = 0; r != 48; r += 16) {
297 auto w = next_w(WS) + SIMD_4x32::load_le(&K[r + 16]);
298
299 SHA2_32_F(A, B, C, D, E, F, G, H, W[0]);
300 SHA2_32_F(H, A, B, C, D, E, F, G, W[1]);
301 SHA2_32_F(G, H, A, B, C, D, E, F, W[2]);
302 SHA2_32_F(F, G, H, A, B, C, D, E, W[3]);
303
304 w.store_le(&W[0]);
305
306 w = next_w(WS) + SIMD_4x32::load_le(&K[r + 20]);
307
308 SHA2_32_F(E, F, G, H, A, B, C, D, W[4]);
309 SHA2_32_F(D, E, F, G, H, A, B, C, W[5]);
310 SHA2_32_F(C, D, E, F, G, H, A, B, W[6]);
311 SHA2_32_F(B, C, D, E, F, G, H, A, W[7]);
312
313 w.store_le(&W[4]);
314
315 w = next_w(WS) + SIMD_4x32::load_le(&K[r + 24]);
316
317 SHA2_32_F(A, B, C, D, E, F, G, H, W[8]);
318 SHA2_32_F(H, A, B, C, D, E, F, G, W[9]);
319 SHA2_32_F(G, H, A, B, C, D, E, F, W[10]);
320 SHA2_32_F(F, G, H, A, B, C, D, E, W[11]);
321
322 w.store_le(&W[8]);
323
324 w = next_w(WS) + SIMD_4x32::load_le(&K[r + 28]);
325
326 SHA2_32_F(E, F, G, H, A, B, C, D, W[12]);
327 SHA2_32_F(D, E, F, G, H, A, B, C, W[13]);
328 SHA2_32_F(C, D, E, F, G, H, A, B, W[14]);
329 SHA2_32_F(B, C, D, E, F, G, H, A, W[15]);
330
331 w.store_le(&W[12]);
332 }
333
    // Rounds 48..63
334 SHA2_32_F(A, B, C, D, E, F, G, H, W[0]);
335 SHA2_32_F(H, A, B, C, D, E, F, G, W[1]);
336 SHA2_32_F(G, H, A, B, C, D, E, F, W[2]);
337 SHA2_32_F(F, G, H, A, B, C, D, E, W[3]);
338 SHA2_32_F(E, F, G, H, A, B, C, D, W[4]);
339 SHA2_32_F(D, E, F, G, H, A, B, C, W[5]);
340 SHA2_32_F(C, D, E, F, G, H, A, B, W[6]);
341 SHA2_32_F(B, C, D, E, F, G, H, A, W[7]);
342 SHA2_32_F(A, B, C, D, E, F, G, H, W[8]);
343 SHA2_32_F(H, A, B, C, D, E, F, G, W[9]);
344 SHA2_32_F(G, H, A, B, C, D, E, F, W[10]);
345 SHA2_32_F(F, G, H, A, B, C, D, E, W[11]);
346 SHA2_32_F(E, F, G, H, A, B, C, D, W[12]);
347 SHA2_32_F(D, E, F, G, H, A, B, C, W[13]);
348 SHA2_32_F(C, D, E, F, G, H, A, B, W[14]);
349 SHA2_32_F(B, C, D, E, F, G, H, A, W[15]);
350
    // Fold this block into the chaining value
351 A = (digest[0] += A);
352 B = (digest[1] += B);
353 C = (digest[2] += C);
354 D = (digest[3] += D);
355 E = (digest[4] += E);
356 F = (digest[5] += F);
357 G = (digest[6] += G);
358 H = (digest[7] += H);
359 }
360}
361
362} // namespace Botan
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 load_be(const void *in) noexcept
Definition simd_4x32.h:189
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 load_le(const void *in) noexcept
Definition simd_4x32.h:162
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_le128(const uint8_t *in) noexcept
Definition simd_avx2.h:71
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_be128(const uint8_t in1[], const uint8_t in2[]) noexcept
Definition simd_avx2.h:101
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
BOTAN_FORCE_INLINE void SHA2_32_F(uint32_t A, uint32_t B, uint32_t C, uint32_t &D, uint32_t E, uint32_t F, uint32_t G, uint32_t &H, uint32_t &M1, uint32_t M2, uint32_t M3, uint32_t M4, uint32_t magic)
Definition sha2_32_f.h:19
SIMD_4x32 shl(SIMD_4x32 input)
Definition simd_4x32.h:938
#define BOTAN_SCRUB_STACK_AFTER_RETURN