Botan 3.9.0
Crypto and TLS for C&&
sha2_32_avx2.cpp
Go to the documentation of this file.
1/*
2* (C) 2025 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/sha2_32.h>
8
9#include <botan/internal/bit_ops.h>
10#include <botan/internal/isa_extn.h>
11#include <botan/internal/rotate.h>
12#include <botan/internal/sha2_32_f.h>
13#include <botan/internal/simd_4x32.h>
14#include <botan/internal/simd_avx2.h>
15#include <botan/internal/stack_scrubbing.h>
16
17#include <immintrin.h>
18
19namespace Botan {
20
21namespace {
22
// 128-bit variant: forms the 32-byte value a:b (a in the high half), shifts it
// right by 4 bytes and keeps the low 16 bytes (the documented behaviour of
// _mm_alignr_epi8). In 32-bit words the result is { b[1], b[2], b[3], a[0] }.
23BOTAN_FN_ISA_AVX2_BMI2 inline SIMD_4x32 alignr4(const SIMD_4x32& a, const SIMD_4x32& b) {
24 return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), 4));
25}
26
// 128-bit variant: logical right shift of each 64-bit element by S bits
// (_mm_srli_epi64). Note the element width is 64 bits here, unlike the
// shr<>() member used elsewhere in this file.
27template <size_t S>
28BOTAN_FN_ISA_AVX2_BMI2 inline SIMD_4x32 shr64(const SIMD_4x32& a) {
29 return SIMD_4x32(_mm_srli_epi64(a.raw(), S));
30}
31
// 128-bit variant: permutes the four 32-bit words of a according to the
// immediate S (_mm_shuffle_epi32): each 2-bit field of S, starting from the
// least significant, names the source word for the next output word.
32template <uint8_t S>
33BOTAN_FN_ISA_AVX2_BMI2 inline SIMD_4x32 shuffle_32(const SIMD_4x32& a) {
34 return SIMD_4x32(_mm_shuffle_epi32(a.raw(), S));
35}
36
// 256-bit variant: _mm256_alignr_epi8 performs the same 4-byte concatenate and
// shift as the 128-bit version, independently within each 128-bit half, so no
// bytes cross between the two halves.
37BOTAN_FN_ISA_AVX2_BMI2 inline SIMD_8x32 alignr4(const SIMD_8x32& a, const SIMD_8x32& b) {
38 return SIMD_8x32(_mm256_alignr_epi8(a.raw(), b.raw(), 4));
39}
40
// 256-bit variant: logical right shift of each of the four 64-bit elements by
// S bits (_mm256_srli_epi64).
41template <size_t S>
42BOTAN_FN_ISA_AVX2_BMI2 inline SIMD_8x32 shr64(const SIMD_8x32& a) {
43 return SIMD_8x32(_mm256_srli_epi64(a.raw(), S));
44}
45
// 256-bit variant: applies the 32-bit word permutation selected by the
// immediate S separately inside each 128-bit half (_mm256_shuffle_epi32).
46template <uint8_t S>
47BOTAN_FN_ISA_AVX2_BMI2 inline SIMD_8x32 shuffle_32(const SIMD_8x32& a) {
48 return SIMD_8x32(_mm256_shuffle_epi32(a.raw(), S));
49}
50
// Produces the next four SHA-256 message schedule words.
//
// x holds the most recent 16 schedule words as four vectors of four words
// (oldest in x[0], newest in x[3]). The new words are accumulated into x[0],
// then the array is rotated by one vector so that they end up in x[3], which
// is also the return value. x is modified in place.
//
// Works for any SIMD_T for which alignr4/shr64/shuffle_32 are overloaded above;
// with SIMD_8x32 the same computation runs independently in each 128-bit half.
// NOTE(review): shl<>/shr<> are taken to be per-32-bit-word shifts and
// byte_shuffle to follow pshufb semantics (selector byte 0x80 yields zero);
// both are defined in the SIMD headers, not here - confirm.
51template <typename SIMD_T>
52BOTAN_FN_ISA_AVX2_BMI2 BOTAN_FORCE_INLINE SIMD_T next_w(SIMD_T x[4]) {
   // Rotation/shift amounts of the small sigma functions:
   // sigma0(w) = rotr(w,7) ^ rotr(w,18) ^ (w >> 3)
   // sigma1(w) = rotr(w,17) ^ rotr(w,19) ^ (w >> 10)
53 constexpr size_t sigma0_0 = 7;
54 constexpr size_t sigma0_1 = 18;
55 constexpr size_t sigma0_2 = 3;
56 constexpr size_t sigma1_0 = 17;
57 constexpr size_t sigma1_1 = 19;
58 constexpr size_t sigma1_2 = 10;
59
   // Byte selectors that gather the low 32 bits of each 64-bit element into
   // the lower (lo_mask) or upper (hi_mask) two words and clear the rest.
60 const SIMD_T lo_mask = SIMD_T(0x03020100, 0x0b0a0908, 0x80808080, 0x80808080);
61 const SIMD_T hi_mask = SIMD_T(0x80808080, 0x80808080, 0x03020100, 0x0b0a0908);
62
   // t0 = words 1..4 of the window (the sigma0 inputs); at the same time add
   // words 9..12 to words 0..3.
63 auto t0 = alignr4(x[1], x[0]);
64 x[0] += alignr4(x[3], x[2]);
65
   // sigma0 is assembled from plain shifts: the two rotations are split into
   // a right-shift part (t2) and a left-shift part (t1), each shifted a
   // second time to obtain the other rotation amount.
66 auto t1 = t0.template shl<32 - sigma0_1>();
67 auto t2 = t0.template shr<sigma0_0>();
68 auto t3 = t0.template shr<sigma0_2>();
69 t0 = t3 ^ t2;
70
   // Duplicate the two newest words of x[3] as { w2, w2, w3, w3 } so that a
   // 64-bit right shift leaves a 32-bit rotation in the low half of each
   // 64-bit element. This is interleaved with the remaining sigma0 work.
71 t3 = shuffle_32<0b11111010>(x[3]);
72 t2 = t2.template shr<sigma0_1 - sigma0_0>();
73 t0 ^= t1 ^ t2;
74 t1 = t1.template shl<sigma0_1 - sigma0_0>();
75 t2 = t3.template shr<sigma1_2>();
76 t3 = shr64<sigma1_0>(t3);
77 x[0] += t0 ^ t1;
78
   // sigma1 for the first two new words; lo_mask moves the results into
   // words 0 and 1.
79 t2 ^= t3;
80 t3 = shr64<sigma1_1 - sigma1_0>(t3);
81 x[0] += SIMD_T::byte_shuffle(t2 ^ t3, lo_mask);
82
   // The last two new words depend on the two just computed: duplicate
   // words 0 and 1 of x[0] as { w0, w0, w1, w1 }, repeat the sigma1
   // computation and add the result into words 2 and 3 via hi_mask.
83 t3 = shuffle_32<0b01010000>(x[0]);
84 t2 = t3.template shr<sigma1_2>();
85 t3 = shr64<sigma1_0>(t3);
86 t2 ^= t3;
87 t3 = shr64<sigma1_1 - sigma1_0>(t3);
88 x[0] += SIMD_T::byte_shuffle(t2 ^ t3, hi_mask);
89
   // Slide the window: drop the oldest vector and append the new one.
90 const auto tmp = x[0];
91 x[0] = x[1];
92 x[1] = x[2];
93 x[2] = x[3];
94 x[3] = tmp;
95
96 return x[3];
97}
98
99} // namespace
100
// SHA-256 compression function using AVX2 for the message schedule.
//
// Reads `blocks` consecutive 64-byte blocks starting at input.data() and folds
// each of them into `digest` (indices 0..7 are read and updated).
//
// Blocks are handled two at a time while at least two remain: the message
// schedules of both blocks are expanded together in 256-bit vectors, the
// rounds of the first block are interleaved with that expansion, and the
// second block then runs all 64 rounds from the values saved in W2. A final
// odd block goes through the same structure with 128-bit vectors.
//
// NOTE(review): nothing here checks that input.size() >= 64 * blocks;
// presumably the caller guarantees it - confirm.
// NOTE(review): SHA2_32_F is not defined in this file (presumably
// sha2_32_f.h); it is assumed to perform one round using the given
// message-word-plus-constant value - confirm.
101BOTAN_FN_ISA_AVX2_BMI2 BOTAN_SCRUB_STACK_AFTER_RETURN void SHA_256::compress_digest_x86_avx2(
102 digest_type& digest, std::span<const uint8_t> input, size_t blocks) {
103 // clang-format off
104
    // The 64 SHA-256 round constants, used by the single-block path.
105 alignas(64) const uint32_t K[64] = {
106 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
107 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
108 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
109 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
110 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
111 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
112 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
113 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2};
114
    // The same constants with every group of four repeated twice, so that a
    // single 8-word load supplies identical constants to both 128-bit halves
    // in the two-block path.
115 alignas(64) const uint32_t K2[2 * 64] = {
116 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
117 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
118 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
119 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
120 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
121 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
122 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
123 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
124 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
125 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
126 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
127 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
128 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
129 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
130 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
131 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2};
132
133 // clang-format on
134
    // W:  (schedule word + round constant) for the 16 rounds currently being
    //     processed; reused as a rolling buffer.
    // W2: all 64 (schedule word + round constant) values of the second block
    //     of a pair, consumed only after the first block is finished.
135 alignas(64) uint32_t W[16];
136 alignas(64) uint32_t W2[64];
137
    // Working variables, initialised from the current chaining value.
138 uint32_t A = digest[0];
139 uint32_t B = digest[1];
140 uint32_t C = digest[2];
141 uint32_t D = digest[3];
142 uint32_t E = digest[4];
143 uint32_t F = digest[5];
144 uint32_t G = digest[6];
145 uint32_t H = digest[7];
146
147 const uint8_t* data = input.data();
148
    // Two-block path.
149 while(blocks >= 2) {
150 SIMD_8x32 WS[4];
151
    // Load the first 16 schedule words of both blocks (16 bytes from each
    // block per vector), add the round constants and store the sums: the
    // first pointer receives W, the second W2.
    // NOTE(review): presumably the first argument of load_be128/store_le128
    // maps to the low 128-bit half and the second to the high half - confirm
    // in simd_avx2.h.
152 for(size_t i = 0; i < 4; i++) {
153 WS[i] = SIMD_8x32::load_be128(&data[16 * i], &data[64 + 16 * i]);
154 auto WK = WS[i] + SIMD_8x32::load_le(&K2[8 * i]);
155 WK.store_le128(&W[4 * i], &W2[4 * i]);
156 }
157
158 data += 2 * 64;
159 blocks -= 2;
160
    // Three passes of 16 rounds for the first block. Each group of four
    // rounds consumes one quarter of W; the next four schedule words (for
    // both blocks) are computed before those rounds and written back only
    // after them, so the quarter is not overwritten while still needed.
    // K2 holds 8 entries per 4 rounds, hence the factor 2 in its index.
161 for(size_t r = 0; r != 48; r += 16) {
162 auto w = next_w(WS) + SIMD_8x32::load_le(&K2[2 * (r + 16)]);
163
164 SHA2_32_F(A, B, C, D, E, F, G, H, W[0]);
165 SHA2_32_F(H, A, B, C, D, E, F, G, W[1]);
166 SHA2_32_F(G, H, A, B, C, D, E, F, W[2]);
167 SHA2_32_F(F, G, H, A, B, C, D, E, W[3]);
168
169 w.store_le128(&W[0], &W2[r + 16]);
170
171 w = next_w(WS) + SIMD_8x32::load_le(&K2[2 * (r + 20)]);
172
173 SHA2_32_F(E, F, G, H, A, B, C, D, W[4]);
174 SHA2_32_F(D, E, F, G, H, A, B, C, W[5]);
175 SHA2_32_F(C, D, E, F, G, H, A, B, W[6]);
176 SHA2_32_F(B, C, D, E, F, G, H, A, W[7]);
177
178 w.store_le128(&W[4], &W2[r + 20]);
179
180 w = next_w(WS) + SIMD_8x32::load_le(&K2[2 * (r + 24)]);
181
182 SHA2_32_F(A, B, C, D, E, F, G, H, W[8]);
183 SHA2_32_F(H, A, B, C, D, E, F, G, W[9]);
184 SHA2_32_F(G, H, A, B, C, D, E, F, W[10]);
185 SHA2_32_F(F, G, H, A, B, C, D, E, W[11]);
186
187 w.store_le128(&W[8], &W2[r + 24]);
188
189 w = next_w(WS) + SIMD_8x32::load_le(&K2[2 * (r + 28)]);
190
191 SHA2_32_F(E, F, G, H, A, B, C, D, W[12]);
192 SHA2_32_F(D, E, F, G, H, A, B, C, W[13]);
193 SHA2_32_F(C, D, E, F, G, H, A, B, W[14]);
194 SHA2_32_F(B, C, D, E, F, G, H, A, W[15]);
195
196 w.store_le128(&W[12], &W2[r + 28]);
197 }
198
    // Final 16 rounds of the first block; no further expansion is needed.
199 SHA2_32_F(A, B, C, D, E, F, G, H, W[0]);
200 SHA2_32_F(H, A, B, C, D, E, F, G, W[1]);
201 SHA2_32_F(G, H, A, B, C, D, E, F, W[2]);
202 SHA2_32_F(F, G, H, A, B, C, D, E, W[3]);
203 SHA2_32_F(E, F, G, H, A, B, C, D, W[4]);
204 SHA2_32_F(D, E, F, G, H, A, B, C, W[5]);
205 SHA2_32_F(C, D, E, F, G, H, A, B, W[6]);
206 SHA2_32_F(B, C, D, E, F, G, H, A, W[7]);
207 SHA2_32_F(A, B, C, D, E, F, G, H, W[8]);
208 SHA2_32_F(H, A, B, C, D, E, F, G, W[9]);
209 SHA2_32_F(G, H, A, B, C, D, E, F, W[10]);
210 SHA2_32_F(F, G, H, A, B, C, D, E, W[11]);
211 SHA2_32_F(E, F, G, H, A, B, C, D, W[12]);
212 SHA2_32_F(D, E, F, G, H, A, B, C, W[13]);
213 SHA2_32_F(C, D, E, F, G, H, A, B, W[14]);
214 SHA2_32_F(B, C, D, E, F, G, H, A, W[15]);
215
    // Fold the first block into the digest and reload the working variables
    // from the updated chaining value for the second block.
216 A = (digest[0] += A);
217 B = (digest[1] += B);
218 C = (digest[2] += C);
219 D = (digest[3] += D);
220 E = (digest[4] += E);
221 F = (digest[5] += F);
222 G = (digest[6] += G);
223 H = (digest[7] += H);
224
225 // Now the second block, with already expanded message
226 SHA2_32_F(A, B, C, D, E, F, G, H, W2[0]);
227 SHA2_32_F(H, A, B, C, D, E, F, G, W2[1]);
228 SHA2_32_F(G, H, A, B, C, D, E, F, W2[2]);
229 SHA2_32_F(F, G, H, A, B, C, D, E, W2[3]);
230 SHA2_32_F(E, F, G, H, A, B, C, D, W2[4]);
231 SHA2_32_F(D, E, F, G, H, A, B, C, W2[5]);
232 SHA2_32_F(C, D, E, F, G, H, A, B, W2[6]);
233 SHA2_32_F(B, C, D, E, F, G, H, A, W2[7]);
234 SHA2_32_F(A, B, C, D, E, F, G, H, W2[8]);
235 SHA2_32_F(H, A, B, C, D, E, F, G, W2[9]);
236 SHA2_32_F(G, H, A, B, C, D, E, F, W2[10]);
237 SHA2_32_F(F, G, H, A, B, C, D, E, W2[11]);
238 SHA2_32_F(E, F, G, H, A, B, C, D, W2[12]);
239 SHA2_32_F(D, E, F, G, H, A, B, C, W2[13]);
240 SHA2_32_F(C, D, E, F, G, H, A, B, W2[14]);
241 SHA2_32_F(B, C, D, E, F, G, H, A, W2[15]);
242
243 SHA2_32_F(A, B, C, D, E, F, G, H, W2[16]);
244 SHA2_32_F(H, A, B, C, D, E, F, G, W2[17]);
245 SHA2_32_F(G, H, A, B, C, D, E, F, W2[18]);
246 SHA2_32_F(F, G, H, A, B, C, D, E, W2[19]);
247 SHA2_32_F(E, F, G, H, A, B, C, D, W2[20]);
248 SHA2_32_F(D, E, F, G, H, A, B, C, W2[21]);
249 SHA2_32_F(C, D, E, F, G, H, A, B, W2[22]);
250 SHA2_32_F(B, C, D, E, F, G, H, A, W2[23]);
251 SHA2_32_F(A, B, C, D, E, F, G, H, W2[24]);
252 SHA2_32_F(H, A, B, C, D, E, F, G, W2[25]);
253 SHA2_32_F(G, H, A, B, C, D, E, F, W2[26]);
254 SHA2_32_F(F, G, H, A, B, C, D, E, W2[27]);
255 SHA2_32_F(E, F, G, H, A, B, C, D, W2[28]);
256 SHA2_32_F(D, E, F, G, H, A, B, C, W2[29]);
257 SHA2_32_F(C, D, E, F, G, H, A, B, W2[30]);
258 SHA2_32_F(B, C, D, E, F, G, H, A, W2[31]);
259
260 SHA2_32_F(A, B, C, D, E, F, G, H, W2[32]);
261 SHA2_32_F(H, A, B, C, D, E, F, G, W2[33]);
262 SHA2_32_F(G, H, A, B, C, D, E, F, W2[34]);
263 SHA2_32_F(F, G, H, A, B, C, D, E, W2[35]);
264 SHA2_32_F(E, F, G, H, A, B, C, D, W2[36]);
265 SHA2_32_F(D, E, F, G, H, A, B, C, W2[37]);
266 SHA2_32_F(C, D, E, F, G, H, A, B, W2[38]);
267 SHA2_32_F(B, C, D, E, F, G, H, A, W2[39]);
268 SHA2_32_F(A, B, C, D, E, F, G, H, W2[40]);
269 SHA2_32_F(H, A, B, C, D, E, F, G, W2[41]);
270 SHA2_32_F(G, H, A, B, C, D, E, F, W2[42]);
271 SHA2_32_F(F, G, H, A, B, C, D, E, W2[43]);
272 SHA2_32_F(E, F, G, H, A, B, C, D, W2[44]);
273 SHA2_32_F(D, E, F, G, H, A, B, C, W2[45]);
274 SHA2_32_F(C, D, E, F, G, H, A, B, W2[46]);
275 SHA2_32_F(B, C, D, E, F, G, H, A, W2[47]);
276
277 SHA2_32_F(A, B, C, D, E, F, G, H, W2[48]);
278 SHA2_32_F(H, A, B, C, D, E, F, G, W2[49]);
279 SHA2_32_F(G, H, A, B, C, D, E, F, W2[50]);
280 SHA2_32_F(F, G, H, A, B, C, D, E, W2[51]);
281 SHA2_32_F(E, F, G, H, A, B, C, D, W2[52]);
282 SHA2_32_F(D, E, F, G, H, A, B, C, W2[53]);
283 SHA2_32_F(C, D, E, F, G, H, A, B, W2[54]);
284 SHA2_32_F(B, C, D, E, F, G, H, A, W2[55]);
285 SHA2_32_F(A, B, C, D, E, F, G, H, W2[56]);
286 SHA2_32_F(H, A, B, C, D, E, F, G, W2[57]);
287 SHA2_32_F(G, H, A, B, C, D, E, F, W2[58]);
288 SHA2_32_F(F, G, H, A, B, C, D, E, W2[59]);
289 SHA2_32_F(E, F, G, H, A, B, C, D, W2[60]);
290 SHA2_32_F(D, E, F, G, H, A, B, C, W2[61]);
291 SHA2_32_F(C, D, E, F, G, H, A, B, W2[62]);
292 SHA2_32_F(B, C, D, E, F, G, H, A, W2[63]);
293
    // Fold the second block into the digest; the working variables now hold
    // the chaining value for whatever block comes next.
294 A = (digest[0] += A);
295 B = (digest[1] += B);
296 C = (digest[2] += C);
297 D = (digest[3] += D);
298 E = (digest[4] += E);
299 F = (digest[5] += F);
300 G = (digest[6] += G);
301 H = (digest[7] += H);
302 }
303
    // Single-block path: runs at most once after the loop above (which leaves
    // fewer than two blocks). Same structure, using 128-bit vectors, the
    // plain K table and only the rolling W buffer.
304 while(blocks > 0) {
305 SIMD_4x32 WS[4];
306
307 for(size_t i = 0; i < 4; i++) {
308 WS[i] = SIMD_4x32::load_be(&data[16 * i]);
309 auto WK = WS[i] + SIMD_4x32::load_le(&K[4 * i]);
310 WK.store_le(&W[4 * i]);
311 }
312
313 data += 64;
314 blocks -= 1;
315
    // Rounds 0..47, interleaved with expansion of schedule words 16..63.
316 for(size_t r = 0; r != 48; r += 16) {
317 auto w = next_w(WS) + SIMD_4x32::load_le(&K[r + 16]);
318
319 SHA2_32_F(A, B, C, D, E, F, G, H, W[0]);
320 SHA2_32_F(H, A, B, C, D, E, F, G, W[1]);
321 SHA2_32_F(G, H, A, B, C, D, E, F, W[2]);
322 SHA2_32_F(F, G, H, A, B, C, D, E, W[3]);
323
324 w.store_le(&W[0]);
325
326 w = next_w(WS) + SIMD_4x32::load_le(&K[r + 20]);
327
328 SHA2_32_F(E, F, G, H, A, B, C, D, W[4]);
329 SHA2_32_F(D, E, F, G, H, A, B, C, W[5]);
330 SHA2_32_F(C, D, E, F, G, H, A, B, W[6]);
331 SHA2_32_F(B, C, D, E, F, G, H, A, W[7]);
332
333 w.store_le(&W[4]);
334
335 w = next_w(WS) + SIMD_4x32::load_le(&K[r + 24]);
336
337 SHA2_32_F(A, B, C, D, E, F, G, H, W[8]);
338 SHA2_32_F(H, A, B, C, D, E, F, G, W[9]);
339 SHA2_32_F(G, H, A, B, C, D, E, F, W[10]);
340 SHA2_32_F(F, G, H, A, B, C, D, E, W[11]);
341
342 w.store_le(&W[8]);
343
344 w = next_w(WS) + SIMD_4x32::load_le(&K[r + 28]);
345
346 SHA2_32_F(E, F, G, H, A, B, C, D, W[12]);
347 SHA2_32_F(D, E, F, G, H, A, B, C, W[13]);
348 SHA2_32_F(C, D, E, F, G, H, A, B, W[14]);
349 SHA2_32_F(B, C, D, E, F, G, H, A, W[15]);
350
351 w.store_le(&W[12]);
352 }
353
    // Rounds 48..63.
354 SHA2_32_F(A, B, C, D, E, F, G, H, W[0]);
355 SHA2_32_F(H, A, B, C, D, E, F, G, W[1]);
356 SHA2_32_F(G, H, A, B, C, D, E, F, W[2]);
357 SHA2_32_F(F, G, H, A, B, C, D, E, W[3]);
358 SHA2_32_F(E, F, G, H, A, B, C, D, W[4]);
359 SHA2_32_F(D, E, F, G, H, A, B, C, W[5]);
360 SHA2_32_F(C, D, E, F, G, H, A, B, W[6]);
361 SHA2_32_F(B, C, D, E, F, G, H, A, W[7]);
362 SHA2_32_F(A, B, C, D, E, F, G, H, W[8]);
363 SHA2_32_F(H, A, B, C, D, E, F, G, W[9]);
364 SHA2_32_F(G, H, A, B, C, D, E, F, W[10]);
365 SHA2_32_F(F, G, H, A, B, C, D, E, W[11]);
366 SHA2_32_F(E, F, G, H, A, B, C, D, W[12]);
367 SHA2_32_F(D, E, F, G, H, A, B, C, W[13]);
368 SHA2_32_F(C, D, E, F, G, H, A, B, W[14]);
369 SHA2_32_F(B, C, D, E, F, G, H, A, W[15]);
370
    // Fold this block into the digest.
371 A = (digest[0] += A);
372 B = (digest[1] += B);
373 C = (digest[2] += C);
374 D = (digest[3] += D);
375 E = (digest[4] += E);
376 F = (digest[5] += F);
377 G = (digest[6] += G);
378 H = (digest[7] += H);
379 }
380}
381
382} // namespace Botan
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 load_be(const void *in) noexcept
Definition simd_4x32.h:174
static SIMD_4x32 load_le(const void *in) noexcept
Definition simd_4x32.h:149
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_le(const uint8_t *in) noexcept
Definition simd_avx2.h:61
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_be128(const uint8_t in1[], const uint8_t in2[]) noexcept
Definition simd_avx2.h:101
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
BOTAN_FORCE_INLINE void SHA2_32_F(uint32_t A, uint32_t B, uint32_t C, uint32_t &D, uint32_t E, uint32_t F, uint32_t G, uint32_t &H, uint32_t &M1, uint32_t M2, uint32_t M3, uint32_t M4, uint32_t magic)
Definition sha2_32_f.h:19
SIMD_4x32 shl(SIMD_4x32 input)
Definition simd_4x32.h:790
#define BOTAN_SCRUB_STACK_AFTER_RETURN