Botan 3.10.0
Crypto and TLS for C++
sha1_avx2.cpp
/*
* (C) 2025 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/

#include <botan/internal/sha1.h>

#include <botan/internal/isa_extn.h>
#include <botan/internal/sha1_f.h>
#include <botan/internal/simd_avx2.h>
#include <immintrin.h>

namespace Botan {

namespace {

/*
* This is exactly the same approach as used in sha1_simd.cpp, just done
* twice in the two AVX2 "lanes" - remember that alignr and slli/srli
* here operate not across the entire register but within each 128-bit
* lane, as if there were two smaller vectors.
*/
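
/*
* For reference, each 128-bit lane computes four steps of the standard SHA-1
* message expansion,
*
*    W[t] = rotl<1>(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16])
*
* with XW0..XW3 holding W[t-16..t-1] on entry and the four new words
* W[t..t+3] returned (and written back into XW0).
*/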
BOTAN_FN_ISA_AVX2_BMI2 BOTAN_FORCE_INLINE SIMD_8x32 sha1_avx2_next_w(SIMD_8x32& XW0,
                                                                     SIMD_8x32 XW1,
                                                                     SIMD_8x32 XW2,
                                                                     SIMD_8x32 XW3) {
   SIMD_8x32 T0 = XW0;                                             // W[t-16..t-13]
   T0 ^= SIMD_8x32(_mm256_alignr_epi8(XW1.raw(), XW0.raw(), 8));   // W[t-14..t-11]
   T0 ^= XW2;                                                      // W[t-8..t-5]
   T0 ^= SIMD_8x32(_mm256_srli_si256(XW3.raw(), 4));               // W[t-3..t-1] || 0

   /* unrotated W[t]..W[t+2] in T0 ... still need W[t+3] */

   // Extract W[t+0] into T2
   auto T2 = SIMD_8x32(_mm256_slli_si256(T0.raw(), 3 * 4));

   // Main rotation
   T0 = T0.rotl<1>();

   // The W[t] contribution to W[t+3] needs rotl<2>: one rotation because T2
   // still holds the unrotated sum for W[t], plus the expansion's own rotl<1>
   T2 = T2.rotl<2>();

   // Merge rol(W[t+0], 1) into W[t+3]
   T0 ^= T2;

   XW0 = T0;
   return T0;
}
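
/*
* Since the newest four words are written back into the first argument, the
* caller below simply rotates the argument order on successive calls:
* (XW0, XW1, XW2, XW3), then (XW1, XW2, XW3, XW0), and so on.
*/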

/*
* Helper for word permutation with zeroing because AVX2 is awful
*
* Clang and GCC both compile this to a couple of stored constants plus
* a vpermd/vpand pair.
*/
template <int I0, int I1, int I2, int I3, int I4, int I5, int I6, int I7>
BOTAN_FN_ISA_AVX2_BMI2 BOTAN_FORCE_INLINE SIMD_8x32 permute_words(SIMD_8x32 v) {
   const __m256i tbl = _mm256_setr_epi32(I0, I1, I2, I3, I4, I5, I6, I7);
   const __m256i mask = _mm256_setr_epi32(I0 >= 0 ? 0xFFFFFFFF : 0,
                                          I1 >= 0 ? 0xFFFFFFFF : 0,
                                          I2 >= 0 ? 0xFFFFFFFF : 0,
                                          I3 >= 0 ? 0xFFFFFFFF : 0,
                                          I4 >= 0 ? 0xFFFFFFFF : 0,
                                          I5 >= 0 ? 0xFFFFFFFF : 0,
                                          I6 >= 0 ? 0xFFFFFFFF : 0,
                                          I7 >= 0 ? 0xFFFFFFFF : 0);

   return SIMD_8x32(_mm256_and_si256(mask, _mm256_permutevar8x32_epi32(v.raw(), tbl)));
}
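
/*
* In other words: output word i of permute_words is v[Ii] when Ii >= 0 and
* zero when Ii is negative, so for example
*
*    permute_words<5, 6, 7, -1, -1, -1, 0, 1>(v)
*       == { v[5], v[6], v[7], 0, 0, 0, v[0], v[1] }
*/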

/*
This is the same approach as the (single buffer) SHA-1 expansion in sha1_simd.cpp
except unrolled further; instead of computing 4 words of W at once, we compute 8.

However this is complicated both by the SHA-1 recurrence and AVX2
limitations; it is faster than what's done in sha1_simd.cpp but only just barely.

The basic idea here is that when computing this (8x per message block):

W[j + 0] = rotl<1>(W[j - 3] ^ W[j - 8] ^ W[j - 14] ^ W[j - 16]);
W[j + 1] = rotl<1>(W[j - 2] ^ W[j - 7] ^ W[j - 13] ^ W[j - 15]);
W[j + 2] = rotl<1>(W[j - 1] ^ W[j - 6] ^ W[j - 12] ^ W[j - 14]);
W[j + 3] = rotl<1>(W[j    ] ^ W[j - 5] ^ W[j - 11] ^ W[j - 13]);
W[j + 4] = rotl<1>(W[j + 1] ^ W[j - 4] ^ W[j - 10] ^ W[j - 12]);
W[j + 5] = rotl<1>(W[j + 2] ^ W[j - 3] ^ W[j -  9] ^ W[j - 11]);
W[j + 6] = rotl<1>(W[j + 3] ^ W[j - 2] ^ W[j -  8] ^ W[j - 10]);
W[j + 7] = rotl<1>(W[j + 4] ^ W[j - 1] ^ W[j -  7] ^ W[j -  9]);

We instead compute a partial expansion:

W[j + 0] = rotl<1>(W[j - 3] ^ W[j - 8] ^ W[j - 14] ^ W[j - 16]);
W[j + 1] = rotl<1>(W[j - 2] ^ W[j - 7] ^ W[j - 13] ^ W[j - 15]);
W[j + 2] = rotl<1>(W[j - 1] ^ W[j - 6] ^ W[j - 12] ^ W[j - 14]);
W[j + 3] = rotl<1>(           W[j - 5] ^ W[j - 11] ^ W[j - 13]);
W[j + 4] = rotl<1>(           W[j - 4] ^ W[j - 10] ^ W[j - 12]);
W[j + 5] = rotl<1>(           W[j - 3] ^ W[j -  9] ^ W[j - 11]);
W[j + 6] = rotl<1>(           W[j - 2] ^ W[j -  8] ^ W[j - 10]);
W[j + 7] = rotl<1>(           W[j - 1] ^ W[j -  7] ^ W[j -  9]);

Then update it with values that were not available until the first expansion is
completed:

W[j + 3] ^= rotl<1>(W[j    ]);
W[j + 4] ^= rotl<1>(W[j + 1]);
W[j + 5] ^= rotl<1>(W[j + 2]);

And then update again with values not available until the second expansion step
is completed:

W[j + 6] ^= rotl<1>(W[j + 3]);
W[j + 7] ^= rotl<1>(W[j + 4]);
*/
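
/*
* In sha1_avx2_next_w2 below, W0 enters holding W[j-16..j-9] and W2 holds
* W[j-8..j-1]; the eight new words W[j..j+7] are returned and also written
* back into W0, which is why the caller alternates sha1_avx2_next_w2(W0, W2)
* and sha1_avx2_next_w2(W2, W0).
*/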

BOTAN_FN_ISA_AVX2_BMI2 BOTAN_FORCE_INLINE SIMD_8x32 sha1_avx2_next_w2(SIMD_8x32& W0, SIMD_8x32 W2) {
   // W[j-16..j-9] ^ W[j-8...j-1]
   auto WN = W0 ^ W2;

   // XOR in W[j-3..j-1] || 0 || 0 || 0 || W[j-8...j-7]
   WN ^= permute_words<5, 6, 7, -1, -1, -1, 0, 1>(W2);

   // XOR in W[j-14...j-9] || 0 || 0
   WN ^= permute_words<2, 3, 4, 5, 6, 7, -1, -1>(W0);

   // Extract W[j...j+2], rotate, and XOR into W[j+3...j+5]
   auto T0 = permute_words<-1, -1, -1, 0, 1, 2, -1, -1>(WN).rotl<2>();
   WN = WN.rotl<1>();  // main block rotation

   WN ^= T0;

   // Extract W[j+3...j+4], rotate, and XOR into W[j+6...j+7]
   WN ^= permute_words<-1, -1, -1, -1, -1, -1, 3, 4>(WN).rotl<1>();

   W0 = WN;
   return WN;
}

}  // namespace

/*
* SHA-1 Compression Function using SIMD for message expansion
*/
//static
void BOTAN_FN_ISA_AVX2_BMI2 SHA_1::avx2_compress_n(digest_type& digest, std::span<const uint8_t> input, size_t blocks) {
   using namespace SHA1_F;

   const SIMD_8x32 K11 = SIMD_8x32::splat(K1);
   const SIMD_8x32 K22 = SIMD_8x32::splat(K2);
   const SIMD_8x32 K33 = SIMD_8x32::splat(K3);
   const SIMD_8x32 K44 = SIMD_8x32::splat(K4);

   const SIMD_8x32 K12(K1, K1, K1, K1, K2, K2, K2, K2);
   const SIMD_8x32 K34(K3, K3, K3, K3, K4, K4, K4, K4);

   uint32_t A = digest[0];
   uint32_t B = digest[1];
   uint32_t C = digest[2];
   uint32_t D = digest[3];
   uint32_t E = digest[4];

   BufferSlicer in(input);

   while(blocks >= 2) {
      const auto block = in.take(2 * block_bytes);
      blocks -= 2;

      uint32_t W2[80] = {0};

      uint32_t PT[4];

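      /*
      * Two-block interleaving: each load_be128 below puts words of the first
      * block in one 128-bit lane and the matching words of the second block
      * in the other, so the message expansion advances both blocks at once.
      * Each store_le128 then writes the first block's W+K values into PT
      * (consumed immediately by the round functions) and the second block's
      * into W2, which already includes the round constants and is replayed
      * once the first block's rounds are done.
      */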
      // NOLINTNEXTLINE(*-container-data-pointer)
      SIMD_8x32 XW0 = SIMD_8x32::load_be128(&block[0], &block[64]);
      SIMD_8x32 XW1 = SIMD_8x32::load_be128(&block[16], &block[80]);
      SIMD_8x32 XW2 = SIMD_8x32::load_be128(&block[32], &block[96]);
      SIMD_8x32 XW3 = SIMD_8x32::load_be128(&block[48], &block[112]);

      SIMD_8x32 P0 = XW0 + SIMD_8x32::splat(K1);
      SIMD_8x32 P1 = XW1 + SIMD_8x32::splat(K1);
      SIMD_8x32 P2 = XW2 + SIMD_8x32::splat(K1);
      SIMD_8x32 P3 = XW3 + SIMD_8x32::splat(K1);

      // NOLINTBEGIN(readability-suspicious-call-argument) XW rotation

      P0.store_le128(PT, &W2[0]);
      P0 = sha1_avx2_next_w(XW0, XW1, XW2, XW3) + SIMD_8x32::splat(K1);
      F1(A, B, C, D, E, PT[0]);
      F1(E, A, B, C, D, PT[1]);
      F1(D, E, A, B, C, PT[2]);
      F1(C, D, E, A, B, PT[3]);

      P1.store_le128(PT, &W2[4]);
      P1 = sha1_avx2_next_w(XW1, XW2, XW3, XW0) + SIMD_8x32::splat(K2);
      F1(B, C, D, E, A, PT[0]);
      F1(A, B, C, D, E, PT[1]);
      F1(E, A, B, C, D, PT[2]);
      F1(D, E, A, B, C, PT[3]);

      P2.store_le128(PT, &W2[8]);
      P2 = sha1_avx2_next_w(XW2, XW3, XW0, XW1) + SIMD_8x32::splat(K2);
      F1(C, D, E, A, B, PT[0]);
      F1(B, C, D, E, A, PT[1]);
      F1(A, B, C, D, E, PT[2]);
      F1(E, A, B, C, D, PT[3]);

      P3.store_le128(PT, &W2[12]);
      P3 = sha1_avx2_next_w(XW3, XW0, XW1, XW2) + SIMD_8x32::splat(K2);
      F1(D, E, A, B, C, PT[0]);
      F1(C, D, E, A, B, PT[1]);
      F1(B, C, D, E, A, PT[2]);
      F1(A, B, C, D, E, PT[3]);

      P0.store_le128(PT, &W2[16]);
      P0 = sha1_avx2_next_w(XW0, XW1, XW2, XW3) + SIMD_8x32::splat(K2);
      F1(E, A, B, C, D, PT[0]);
      F1(D, E, A, B, C, PT[1]);
      F1(C, D, E, A, B, PT[2]);
      F1(B, C, D, E, A, PT[3]);

      P1.store_le128(PT, &W2[20]);
      P1 = sha1_avx2_next_w(XW1, XW2, XW3, XW0) + SIMD_8x32::splat(K2);
      F2(A, B, C, D, E, PT[0]);
      F2(E, A, B, C, D, PT[1]);
      F2(D, E, A, B, C, PT[2]);
      F2(C, D, E, A, B, PT[3]);

      P2.store_le128(PT, &W2[24]);
      P2 = sha1_avx2_next_w(XW2, XW3, XW0, XW1) + SIMD_8x32::splat(K3);
      F2(B, C, D, E, A, PT[0]);
      F2(A, B, C, D, E, PT[1]);
      F2(E, A, B, C, D, PT[2]);
      F2(D, E, A, B, C, PT[3]);

      P3.store_le128(PT, &W2[28]);
      P3 = sha1_avx2_next_w(XW3, XW0, XW1, XW2) + SIMD_8x32::splat(K3);
      F2(C, D, E, A, B, PT[0]);
      F2(B, C, D, E, A, PT[1]);
      F2(A, B, C, D, E, PT[2]);
      F2(E, A, B, C, D, PT[3]);

      P0.store_le128(PT, &W2[32]);
      P0 = sha1_avx2_next_w(XW0, XW1, XW2, XW3) + SIMD_8x32::splat(K3);
      F2(D, E, A, B, C, PT[0]);
      F2(C, D, E, A, B, PT[1]);
      F2(B, C, D, E, A, PT[2]);
      F2(A, B, C, D, E, PT[3]);

      P1.store_le128(PT, &W2[36]);
      P1 = sha1_avx2_next_w(XW1, XW2, XW3, XW0) + SIMD_8x32::splat(K3);
      F2(E, A, B, C, D, PT[0]);
      F2(D, E, A, B, C, PT[1]);
      F2(C, D, E, A, B, PT[2]);
      F2(B, C, D, E, A, PT[3]);

      P2.store_le128(PT, &W2[40]);
      P2 = sha1_avx2_next_w(XW2, XW3, XW0, XW1) + SIMD_8x32::splat(K3);
      F3(A, B, C, D, E, PT[0]);
      F3(E, A, B, C, D, PT[1]);
      F3(D, E, A, B, C, PT[2]);
      F3(C, D, E, A, B, PT[3]);

      P3.store_le128(PT, &W2[44]);
      P3 = sha1_avx2_next_w(XW3, XW0, XW1, XW2) + SIMD_8x32::splat(K4);
      F3(B, C, D, E, A, PT[0]);
      F3(A, B, C, D, E, PT[1]);
      F3(E, A, B, C, D, PT[2]);
      F3(D, E, A, B, C, PT[3]);

      P0.store_le128(PT, &W2[48]);
      P0 = sha1_avx2_next_w(XW0, XW1, XW2, XW3) + SIMD_8x32::splat(K4);
      F3(C, D, E, A, B, PT[0]);
      F3(B, C, D, E, A, PT[1]);
      F3(A, B, C, D, E, PT[2]);
      F3(E, A, B, C, D, PT[3]);

      P1.store_le128(PT, &W2[52]);
      P1 = sha1_avx2_next_w(XW1, XW2, XW3, XW0) + SIMD_8x32::splat(K4);
      F3(D, E, A, B, C, PT[0]);
      F3(C, D, E, A, B, PT[1]);
      F3(B, C, D, E, A, PT[2]);
      F3(A, B, C, D, E, PT[3]);

      P2.store_le128(PT, &W2[56]);
      P2 = sha1_avx2_next_w(XW2, XW3, XW0, XW1) + SIMD_8x32::splat(K4);
      F3(E, A, B, C, D, PT[0]);
      F3(D, E, A, B, C, PT[1]);
      F3(C, D, E, A, B, PT[2]);
      F3(B, C, D, E, A, PT[3]);

      P3.store_le128(PT, &W2[60]);
      P3 = sha1_avx2_next_w(XW3, XW0, XW1, XW2) + SIMD_8x32::splat(K4);
      F4(A, B, C, D, E, PT[0]);
      F4(E, A, B, C, D, PT[1]);
      F4(D, E, A, B, C, PT[2]);
      F4(C, D, E, A, B, PT[3]);

      P0.store_le128(PT, &W2[64]);
      F4(B, C, D, E, A, PT[0]);
      F4(A, B, C, D, E, PT[1]);
      F4(E, A, B, C, D, PT[2]);
      F4(D, E, A, B, C, PT[3]);

      P1.store_le128(PT, &W2[68]);
      F4(C, D, E, A, B, PT[0]);
      F4(B, C, D, E, A, PT[1]);
      F4(A, B, C, D, E, PT[2]);
      F4(E, A, B, C, D, PT[3]);

      P2.store_le128(PT, &W2[72]);
      F4(D, E, A, B, C, PT[0]);
      F4(C, D, E, A, B, PT[1]);
      F4(B, C, D, E, A, PT[2]);
      F4(A, B, C, D, E, PT[3]);

      P3.store_le128(PT, &W2[76]);
      F4(E, A, B, C, D, PT[0]);
      F4(D, E, A, B, C, PT[1]);
      F4(C, D, E, A, B, PT[2]);
      F4(B, C, D, E, A, PT[3]);

      // NOLINTEND(readability-suspicious-call-argument)

      A = (digest[0] += A);
      B = (digest[1] += B);
      C = (digest[2] += C);
      D = (digest[3] += D);
      E = (digest[4] += E);

      // Second block with pre-expanded message
      F1(A, B, C, D, E, W2[0]);
      F1(E, A, B, C, D, W2[1]);
      F1(D, E, A, B, C, W2[2]);
      F1(C, D, E, A, B, W2[3]);
      F1(B, C, D, E, A, W2[4]);
      F1(A, B, C, D, E, W2[5]);
      F1(E, A, B, C, D, W2[6]);
      F1(D, E, A, B, C, W2[7]);
      F1(C, D, E, A, B, W2[8]);
      F1(B, C, D, E, A, W2[9]);
      F1(A, B, C, D, E, W2[10]);
      F1(E, A, B, C, D, W2[11]);
      F1(D, E, A, B, C, W2[12]);
      F1(C, D, E, A, B, W2[13]);
      F1(B, C, D, E, A, W2[14]);
      F1(A, B, C, D, E, W2[15]);
      F1(E, A, B, C, D, W2[16]);
      F1(D, E, A, B, C, W2[17]);
      F1(C, D, E, A, B, W2[18]);
      F1(B, C, D, E, A, W2[19]);
      F2(A, B, C, D, E, W2[20]);
      F2(E, A, B, C, D, W2[21]);
      F2(D, E, A, B, C, W2[22]);
      F2(C, D, E, A, B, W2[23]);
      F2(B, C, D, E, A, W2[24]);
      F2(A, B, C, D, E, W2[25]);
      F2(E, A, B, C, D, W2[26]);
      F2(D, E, A, B, C, W2[27]);
      F2(C, D, E, A, B, W2[28]);
      F2(B, C, D, E, A, W2[29]);
      F2(A, B, C, D, E, W2[30]);
      F2(E, A, B, C, D, W2[31]);
      F2(D, E, A, B, C, W2[32]);
      F2(C, D, E, A, B, W2[33]);
      F2(B, C, D, E, A, W2[34]);
      F2(A, B, C, D, E, W2[35]);
      F2(E, A, B, C, D, W2[36]);
      F2(D, E, A, B, C, W2[37]);
      F2(C, D, E, A, B, W2[38]);
      F2(B, C, D, E, A, W2[39]);
      F3(A, B, C, D, E, W2[40]);
      F3(E, A, B, C, D, W2[41]);
      F3(D, E, A, B, C, W2[42]);
      F3(C, D, E, A, B, W2[43]);
      F3(B, C, D, E, A, W2[44]);
      F3(A, B, C, D, E, W2[45]);
      F3(E, A, B, C, D, W2[46]);
      F3(D, E, A, B, C, W2[47]);
      F3(C, D, E, A, B, W2[48]);
      F3(B, C, D, E, A, W2[49]);
      F3(A, B, C, D, E, W2[50]);
      F3(E, A, B, C, D, W2[51]);
      F3(D, E, A, B, C, W2[52]);
      F3(C, D, E, A, B, W2[53]);
      F3(B, C, D, E, A, W2[54]);
      F3(A, B, C, D, E, W2[55]);
      F3(E, A, B, C, D, W2[56]);
      F3(D, E, A, B, C, W2[57]);
      F3(C, D, E, A, B, W2[58]);
      F3(B, C, D, E, A, W2[59]);
      F4(A, B, C, D, E, W2[60]);
      F4(E, A, B, C, D, W2[61]);
      F4(D, E, A, B, C, W2[62]);
      F4(C, D, E, A, B, W2[63]);
      F4(B, C, D, E, A, W2[64]);
      F4(A, B, C, D, E, W2[65]);
      F4(E, A, B, C, D, W2[66]);
      F4(D, E, A, B, C, W2[67]);
      F4(C, D, E, A, B, W2[68]);
      F4(B, C, D, E, A, W2[69]);
      F4(A, B, C, D, E, W2[70]);
      F4(E, A, B, C, D, W2[71]);
      F4(D, E, A, B, C, W2[72]);
      F4(C, D, E, A, B, W2[73]);
      F4(B, C, D, E, A, W2[74]);
      F4(A, B, C, D, E, W2[75]);
      F4(E, A, B, C, D, W2[76]);
      F4(D, E, A, B, C, W2[77]);
      F4(C, D, E, A, B, W2[78]);
      F4(B, C, D, E, A, W2[79]);

      A = (digest[0] += A);
      B = (digest[1] += B);
      C = (digest[2] += C);
      D = (digest[3] += D);
      E = (digest[4] += E);
   }

   for(size_t i = 0; i != blocks; ++i) {
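      // Single-block tail: a full 256-bit register now holds 8 consecutive
      // words of one schedule, so sha1_avx2_next_w2 produces 8 new words per
      // call, and the mixed constants K12/K34 cover the 8-round groups that
      // straddle a round constant boundary.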
      uint32_t PT[8];

      const auto block = in.take(block_bytes);

      SIMD_8x32 W0 = SIMD_8x32::load_be(&block[0]);  // NOLINT(*-container-data-pointer)
      SIMD_8x32 W2 = SIMD_8x32::load_be(&block[32]);

      SIMD_8x32 P0 = W0 + K11;
      SIMD_8x32 P2 = W2 + K11;

      P0.store_le(PT);
      P0 = sha1_avx2_next_w2(W0, W2) + K12;

      F1(A, B, C, D, E, PT[0]);
      F1(E, A, B, C, D, PT[1]);
      F1(D, E, A, B, C, PT[2]);
      F1(C, D, E, A, B, PT[3]);
      F1(B, C, D, E, A, PT[4]);
      F1(A, B, C, D, E, PT[5]);
      F1(E, A, B, C, D, PT[6]);
      F1(D, E, A, B, C, PT[7]);

      P2.store_le(PT);
      P2 = sha1_avx2_next_w2(W2, W0) + K22;

      F1(C, D, E, A, B, PT[0]);
      F1(B, C, D, E, A, PT[1]);
      F1(A, B, C, D, E, PT[2]);
      F1(E, A, B, C, D, PT[3]);
      F1(D, E, A, B, C, PT[4]);
      F1(C, D, E, A, B, PT[5]);
      F1(B, C, D, E, A, PT[6]);
      F1(A, B, C, D, E, PT[7]);

      P0.store_le(PT);
      P0 = sha1_avx2_next_w2(W0, W2) + K22;

      F1(E, A, B, C, D, PT[0]);
      F1(D, E, A, B, C, PT[1]);
      F1(C, D, E, A, B, PT[2]);
      F1(B, C, D, E, A, PT[3]);
      F2(A, B, C, D, E, PT[4]);
      F2(E, A, B, C, D, PT[5]);
      F2(D, E, A, B, C, PT[6]);
      F2(C, D, E, A, B, PT[7]);

      P2.store_le(PT);
      P2 = sha1_avx2_next_w2(W2, W0) + K33;

      F2(B, C, D, E, A, PT[0]);
      F2(A, B, C, D, E, PT[1]);
      F2(E, A, B, C, D, PT[2]);
      F2(D, E, A, B, C, PT[3]);
      F2(C, D, E, A, B, PT[4]);
      F2(B, C, D, E, A, PT[5]);
      F2(A, B, C, D, E, PT[6]);
      F2(E, A, B, C, D, PT[7]);

      P0.store_le(PT);
      P0 = sha1_avx2_next_w2(W0, W2) + K33;

      F2(D, E, A, B, C, PT[0]);
      F2(C, D, E, A, B, PT[1]);
      F2(B, C, D, E, A, PT[2]);
      F2(A, B, C, D, E, PT[3]);
      F2(E, A, B, C, D, PT[4]);
      F2(D, E, A, B, C, PT[5]);
      F2(C, D, E, A, B, PT[6]);
      F2(B, C, D, E, A, PT[7]);

      P2.store_le(PT);
      P2 = sha1_avx2_next_w2(W2, W0) + K34;

      F3(A, B, C, D, E, PT[0]);
      F3(E, A, B, C, D, PT[1]);
      F3(D, E, A, B, C, PT[2]);
      F3(C, D, E, A, B, PT[3]);
      F3(B, C, D, E, A, PT[4]);
      F3(A, B, C, D, E, PT[5]);
      F3(E, A, B, C, D, PT[6]);
      F3(D, E, A, B, C, PT[7]);

      P0.store_le(PT);
      P0 = sha1_avx2_next_w2(W0, W2) + K44;

      F3(C, D, E, A, B, PT[0]);
      F3(B, C, D, E, A, PT[1]);
      F3(A, B, C, D, E, PT[2]);
      F3(E, A, B, C, D, PT[3]);
      F3(D, E, A, B, C, PT[4]);
      F3(C, D, E, A, B, PT[5]);
      F3(B, C, D, E, A, PT[6]);
      F3(A, B, C, D, E, PT[7]);

      P2.store_le(PT);
      P2 = sha1_avx2_next_w2(W2, W0) + K44;

      F3(E, A, B, C, D, PT[0]);
      F3(D, E, A, B, C, PT[1]);
      F3(C, D, E, A, B, PT[2]);
      F3(B, C, D, E, A, PT[3]);
      F4(A, B, C, D, E, PT[4]);
      F4(E, A, B, C, D, PT[5]);
      F4(D, E, A, B, C, PT[6]);
      F4(C, D, E, A, B, PT[7]);

      P0.store_le(PT);

      F4(B, C, D, E, A, PT[0]);
      F4(A, B, C, D, E, PT[1]);
      F4(E, A, B, C, D, PT[2]);
      F4(D, E, A, B, C, PT[3]);
      F4(C, D, E, A, B, PT[4]);
      F4(B, C, D, E, A, PT[5]);
      F4(A, B, C, D, E, PT[6]);
      F4(E, A, B, C, D, PT[7]);

      P2.store_le(PT);

      F4(D, E, A, B, C, PT[0]);
      F4(C, D, E, A, B, PT[1]);
      F4(B, C, D, E, A, PT[2]);
      F4(A, B, C, D, E, PT[3]);
      F4(E, A, B, C, D, PT[4]);
      F4(D, E, A, B, C, PT[5]);
      F4(C, D, E, A, B, PT[6]);
      F4(B, C, D, E, A, PT[7]);

      A = (digest[0] += A);
      B = (digest[1] += B);
      C = (digest[2] += C);
      D = (digest[3] += D);
      E = (digest[4] += E);
   }
}

}  // namespace Botan