Botan 3.9.0
Crypto and TLS for C++
sha1_avx2.cpp
Go to the documentation of this file.
1/*
2* (C) 2025 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/sha1.h>
8
9#include <botan/internal/bit_ops.h>
10#include <botan/internal/isa_extn.h>
11#include <botan/internal/rotate.h>
12#include <botan/internal/sha1_f.h>
13#include <botan/internal/simd_4x32.h>
14#include <botan/internal/simd_avx2.h>
15#include <immintrin.h>
16
17namespace Botan {
18
19namespace {
20
21/*
22* This is exactly the same approach as used in sha1_simd.cpp, just done
23* twice in the two AVX2 "lanes" - remember that alignr and slli/srli
24* here are working not across the entire register but instead as if
25* there were two smaller vectors.
26*/
27BOTAN_FN_ISA_AVX2_BMI2 BOTAN_FORCE_INLINE SIMD_8x32 sha1_avx2_next_w(SIMD_8x32& XW0,
28 SIMD_8x32 XW1,
29 SIMD_8x32 XW2,
30 SIMD_8x32 XW3) {
31 SIMD_8x32 T0 = XW0; // W[t-16..t-13]
32 T0 ^= SIMD_8x32(_mm256_alignr_epi8(XW1.raw(), XW0.raw(), 8));
33 T0 ^= XW2; // W[t-8..t-5]
34 T0 ^= SIMD_8x32(_mm256_srli_si256(XW3.raw(), 4)); // W[t-3..t-1] || 0
35
36 /* unrotated W[t]..W[t+2] in T0 ... still need W[t+3] */
37
38 // Extract w[t+0] into T2
39 auto T2 = SIMD_8x32(_mm256_slli_si256(T0.raw(), 3 * 4));
40
41 // Main rotation
42 T0 = T0.rotl<1>();
43
44 // Rotation of W[t+3] has rot by 2 to account for us working on non-rotated words
45 T2 = T2.rotl<2>();
46
47 // Merge rol(W[t+0], 1) into W[t+3]
48 T0 ^= T2;
49
50 XW0 = T0;
51 return T0;
52}
53
54/*
55* Helper for word permutation with zeroing because AVX2 is awful
56*
57* Clang and GCC both compile this to a couple of stored constants plus
58* a vpermd/vpand pair.
59*/
60template <int I0, int I1, int I2, int I3, int I4, int I5, int I6, int I7>
61BOTAN_FN_ISA_AVX2_BMI2 BOTAN_FORCE_INLINE SIMD_8x32 permute_words(SIMD_8x32 v) {
62 const __m256i tbl = _mm256_setr_epi32(I0, I1, I2, I3, I4, I5, I6, I7);
63 const __m256i mask = _mm256_setr_epi32(I0 >= 0 ? 0xFFFFFFFF : 0,
64 I1 >= 0 ? 0xFFFFFFFF : 0,
65 I2 >= 0 ? 0xFFFFFFFF : 0,
66 I3 >= 0 ? 0xFFFFFFFF : 0,
67 I4 >= 0 ? 0xFFFFFFFF : 0,
68 I5 >= 0 ? 0xFFFFFFFF : 0,
69 I6 >= 0 ? 0xFFFFFFFF : 0,
70 I7 >= 0 ? 0xFFFFFFFF : 0);
71
72 return SIMD_8x32(_mm256_and_si256(mask, _mm256_permutevar8x32_epi32(v.raw(), tbl)));
73}
74
75/*
76This is the same approach as the (single buffer) SHA-1 expansion in sha1_simd.cpp
77except unrolled further; instead of computing 4 words of W at once, we compute 8.
78
79However this is complicated both by the SHA-1 recurrence and AVX2
80limitations; it is faster than what's done in sha1_simd.cpp but only just barely.
81
82The basic idea here is that when computing this (8x per message block):
83
84W[j + 0] = rotl<1>(W[j - 3] ^ W[j - 8] ^ W[j - 14] ^ W[j - 16]);
85W[j + 1] = rotl<1>(W[j - 2] ^ W[j - 7] ^ W[j - 13] ^ W[j - 15]);
86W[j + 2] = rotl<1>(W[j - 1] ^ W[j - 6] ^ W[j - 12] ^ W[j - 14]);
87W[j + 3] = rotl<1>(W[j ] ^ W[j - 5] ^ W[j - 11] ^ W[j - 13]);
88W[j + 4] = rotl<1>(W[j + 1] ^ W[j - 4] ^ W[j - 10] ^ W[j - 12]);
89W[j + 5] = rotl<1>(W[j + 2] ^ W[j - 3] ^ W[j - 9] ^ W[j - 11]);
90W[j + 6] = rotl<1>(W[j + 3] ^ W[j - 2] ^ W[j - 8] ^ W[j - 10]);
91W[j + 7] = rotl<1>(W[j + 4] ^ W[j - 1] ^ W[j - 7] ^ W[j - 9]);
92
93We instead compute a partial expansion:
94
95W[j + 0] = rotl<1>(W[j - 3] ^ W[j - 8] ^ W[j - 14] ^ W[j - 16]);
96W[j + 1] = rotl<1>(W[j - 2] ^ W[j - 7] ^ W[j - 13] ^ W[j - 15]);
97W[j + 2] = rotl<1>(W[j - 1] ^ W[j - 6] ^ W[j - 12] ^ W[j - 14]);
98W[j + 3] = rotl<1>( W[j - 5] ^ W[j - 11] ^ W[j - 13]);
99W[j + 4] = rotl<1>( W[j - 4] ^ W[j - 10] ^ W[j - 12]);
100W[j + 5] = rotl<1>( W[j - 3] ^ W[j - 9] ^ W[j - 11]);
101W[j + 6] = rotl<1>( W[j - 2] ^ W[j - 8] ^ W[j - 10]);
102W[j + 7] = rotl<1>( W[j - 1] ^ W[j - 7] ^ W[j - 9]);
103
104Then update it with values that were not available until the first expansion is
105completed:
106
107W[j + 3] ^= rotl<1>(W[j ]);
108W[j + 4] ^= rotl<1>(W[j + 1]);
109W[j + 5] ^= rotl<1>(W[j + 2]);
110
111And then update again with values not available until the second expansion step
112is completed:
113
114W[j + 6] ^= rotl<1>(W[j + 3]);
115W[j + 7] ^= rotl<1>(W[j + 4]);
116*/
117
118BOTAN_FN_ISA_AVX2_BMI2 BOTAN_FORCE_INLINE SIMD_8x32 sha1_avx2_next_w2(SIMD_8x32& W0, SIMD_8x32 W2) {
119 // W[j-16..j-9] ^ W[j-8...j-1]
120 auto WN = W0 ^ W2;
121
122 // XOR in W[j-3..j-1] || 0 || 0 || 0 || W[j-8...j-7]
123 WN ^= permute_words<5, 6, 7, -1, -1, -1, 0, 1>(W2);
124
125 // XOR in W[j-14...j-9] || 0 || 0
126 WN ^= permute_words<2, 3, 4, 5, 6, 7, -1, -1>(W0);
127
128 // Extract W[j...j+2], rotate, and XOR into W[j+3...j+5]
129 auto T0 = permute_words<-1, -1, -1, 0, 1, 2, -1, -1>(WN).rotl<2>();
130 WN = WN.rotl<1>(); // main block rotation
131
132 WN ^= T0;
133
134 // Extract W[j+3...j+4], rotate, and XOR into W[j+6...j+7]
135 WN ^= permute_words<-1, -1, -1, -1, -1, -1, 3, 4>(WN).rotl<1>();
136
137 W0 = WN;
138 return WN;
139}
140
141} // namespace
142
/*
* SHA-1 Compression Function using SIMD for message expansion
*
* Consumes `blocks` 64-byte message blocks from `input` and updates the
* five-word chaining state in `digest` in place.
*
* Blocks are processed two at a time where possible: the two blocks are
* loaded into the two 128-bit lanes of AVX2 registers and their message
* schedules expanded together. The first block's schedule words (with
* the round constant already added) are consumed immediately out of PT,
* while the second block's are spilled into W2 and replayed with scalar
* rounds after the first block's chaining update. A trailing odd block
* is handled by the loop at the end, which expands 8 schedule words per
* step across the full 256-bit register (see sha1_avx2_next_w2).
*/
//static
void BOTAN_FN_ISA_AVX2_BMI2 SHA_1::avx2_compress_n(digest_type& digest, std::span<const uint8_t> input, size_t blocks) {
   using namespace SHA1_F;

   // Round constants splatted to all 8 words (used in the single-block loop)
   const SIMD_8x32 K11 = SIMD_8x32::splat(K1);
   const SIMD_8x32 K22 = SIMD_8x32::splat(K2);
   const SIMD_8x32 K33 = SIMD_8x32::splat(K3);
   const SIMD_8x32 K44 = SIMD_8x32::splat(K4);

   // Mixed constants for the 8-word expansion steps that straddle a
   // round-constant boundary (words 16..23 and 56..63)
   const SIMD_8x32 K12(K1, K1, K1, K1, K2, K2, K2, K2);
   const SIMD_8x32 K34(K3, K3, K3, K3, K4, K4, K4, K4);

   uint32_t A = digest[0];
   uint32_t B = digest[1];
   uint32_t C = digest[2];
   uint32_t D = digest[3];
   uint32_t E = digest[4];

   BufferSlicer in(input);

   while(blocks >= 2) {
      const auto block = in.take(2 * block_bytes);
      blocks -= 2;

      // Expanded schedule of the *second* block (round constants
      // pre-added), filled in as a side effect of processing the first
      uint32_t W2[80] = {0};

      // Staging area for 4 ready schedule words of the first block
      uint32_t PT[4];

      // Each register holds 4 words of the first block in one 128-bit
      // lane and the corresponding 4 words of the second in the other
      // NOLINTNEXTLINE(*-container-data-pointer)
      SIMD_8x32 XW0 = SIMD_8x32::load_be128(&block[0], &block[64]);
      SIMD_8x32 XW1 = SIMD_8x32::load_be128(&block[16], &block[80]);
      SIMD_8x32 XW2 = SIMD_8x32::load_be128(&block[32], &block[96]);
      SIMD_8x32 XW3 = SIMD_8x32::load_be128(&block[48], &block[112]);

      // W[0..15] + K1 for both blocks at once
      SIMD_8x32 P0 = XW0 + SIMD_8x32::splat(K1);
      SIMD_8x32 P1 = XW1 + SIMD_8x32::splat(K1);
      SIMD_8x32 P2 = XW2 + SIMD_8x32::splat(K1);
      SIMD_8x32 P3 = XW3 + SIMD_8x32::splat(K1);

      /*
      Each step below stores 4 ready schedule words (the first block's
      half into PT for immediate use, the second block's half into W2),
      then expands the next 4 words. Note that the constant added to a
      freshly expanded P is the one for the rounds in which it will be
      consumed - 16 rounds later - so the K constant "lags" the F
      function currently in use.
      */

      // NOLINTBEGIN(readability-suspicious-call-argument) XW rotation

      P0.store_le128(PT, &W2[0]);
      P0 = sha1_avx2_next_w(XW0, XW1, XW2, XW3) + SIMD_8x32::splat(K1);
      F1(A, B, C, D, E, PT[0]);
      F1(E, A, B, C, D, PT[1]);
      F1(D, E, A, B, C, PT[2]);
      F1(C, D, E, A, B, PT[3]);

      P1.store_le128(PT, &W2[4]);
      P1 = sha1_avx2_next_w(XW1, XW2, XW3, XW0) + SIMD_8x32::splat(K2);
      F1(B, C, D, E, A, PT[0]);
      F1(A, B, C, D, E, PT[1]);
      F1(E, A, B, C, D, PT[2]);
      F1(D, E, A, B, C, PT[3]);

      P2.store_le128(PT, &W2[8]);
      P2 = sha1_avx2_next_w(XW2, XW3, XW0, XW1) + SIMD_8x32::splat(K2);
      F1(C, D, E, A, B, PT[0]);
      F1(B, C, D, E, A, PT[1]);
      F1(A, B, C, D, E, PT[2]);
      F1(E, A, B, C, D, PT[3]);

      P3.store_le128(PT, &W2[12]);
      P3 = sha1_avx2_next_w(XW3, XW0, XW1, XW2) + SIMD_8x32::splat(K2);
      F1(D, E, A, B, C, PT[0]);
      F1(C, D, E, A, B, PT[1]);
      F1(B, C, D, E, A, PT[2]);
      F1(A, B, C, D, E, PT[3]);

      P0.store_le128(PT, &W2[16]);
      P0 = sha1_avx2_next_w(XW0, XW1, XW2, XW3) + SIMD_8x32::splat(K2);
      F1(E, A, B, C, D, PT[0]);
      F1(D, E, A, B, C, PT[1]);
      F1(C, D, E, A, B, PT[2]);
      F1(B, C, D, E, A, PT[3]);

      P1.store_le128(PT, &W2[20]);
      P1 = sha1_avx2_next_w(XW1, XW2, XW3, XW0) + SIMD_8x32::splat(K2);
      F2(A, B, C, D, E, PT[0]);
      F2(E, A, B, C, D, PT[1]);
      F2(D, E, A, B, C, PT[2]);
      F2(C, D, E, A, B, PT[3]);

      P2.store_le128(PT, &W2[24]);
      P2 = sha1_avx2_next_w(XW2, XW3, XW0, XW1) + SIMD_8x32::splat(K3);
      F2(B, C, D, E, A, PT[0]);
      F2(A, B, C, D, E, PT[1]);
      F2(E, A, B, C, D, PT[2]);
      F2(D, E, A, B, C, PT[3]);

      P3.store_le128(PT, &W2[28]);
      P3 = sha1_avx2_next_w(XW3, XW0, XW1, XW2) + SIMD_8x32::splat(K3);
      F2(C, D, E, A, B, PT[0]);
      F2(B, C, D, E, A, PT[1]);
      F2(A, B, C, D, E, PT[2]);
      F2(E, A, B, C, D, PT[3]);

      P0.store_le128(PT, &W2[32]);
      P0 = sha1_avx2_next_w(XW0, XW1, XW2, XW3) + SIMD_8x32::splat(K3);
      F2(D, E, A, B, C, PT[0]);
      F2(C, D, E, A, B, PT[1]);
      F2(B, C, D, E, A, PT[2]);
      F2(A, B, C, D, E, PT[3]);

      P1.store_le128(PT, &W2[36]);
      P1 = sha1_avx2_next_w(XW1, XW2, XW3, XW0) + SIMD_8x32::splat(K3);
      F2(E, A, B, C, D, PT[0]);
      F2(D, E, A, B, C, PT[1]);
      F2(C, D, E, A, B, PT[2]);
      F2(B, C, D, E, A, PT[3]);

      P2.store_le128(PT, &W2[40]);
      P2 = sha1_avx2_next_w(XW2, XW3, XW0, XW1) + SIMD_8x32::splat(K3);
      F3(A, B, C, D, E, PT[0]);
      F3(E, A, B, C, D, PT[1]);
      F3(D, E, A, B, C, PT[2]);
      F3(C, D, E, A, B, PT[3]);

      P3.store_le128(PT, &W2[44]);
      P3 = sha1_avx2_next_w(XW3, XW0, XW1, XW2) + SIMD_8x32::splat(K4);
      F3(B, C, D, E, A, PT[0]);
      F3(A, B, C, D, E, PT[1]);
      F3(E, A, B, C, D, PT[2]);
      F3(D, E, A, B, C, PT[3]);

      P0.store_le128(PT, &W2[48]);
      P0 = sha1_avx2_next_w(XW0, XW1, XW2, XW3) + SIMD_8x32::splat(K4);
      F3(C, D, E, A, B, PT[0]);
      F3(B, C, D, E, A, PT[1]);
      F3(A, B, C, D, E, PT[2]);
      F3(E, A, B, C, D, PT[3]);

      P1.store_le128(PT, &W2[52]);
      P1 = sha1_avx2_next_w(XW1, XW2, XW3, XW0) + SIMD_8x32::splat(K4);
      F3(D, E, A, B, C, PT[0]);
      F3(C, D, E, A, B, PT[1]);
      F3(B, C, D, E, A, PT[2]);
      F3(A, B, C, D, E, PT[3]);

      P2.store_le128(PT, &W2[56]);
      P2 = sha1_avx2_next_w(XW2, XW3, XW0, XW1) + SIMD_8x32::splat(K4);
      F3(E, A, B, C, D, PT[0]);
      F3(D, E, A, B, C, PT[1]);
      F3(C, D, E, A, B, PT[2]);
      F3(B, C, D, E, A, PT[3]);

      // The schedule is fully expanded after this step; the remaining
      // steps just drain the four pending P values
      P3.store_le128(PT, &W2[60]);
      P3 = sha1_avx2_next_w(XW3, XW0, XW1, XW2) + SIMD_8x32::splat(K4);
      F4(A, B, C, D, E, PT[0]);
      F4(E, A, B, C, D, PT[1]);
      F4(D, E, A, B, C, PT[2]);
      F4(C, D, E, A, B, PT[3]);

      P0.store_le128(PT, &W2[64]);
      F4(B, C, D, E, A, PT[0]);
      F4(A, B, C, D, E, PT[1]);
      F4(E, A, B, C, D, PT[2]);
      F4(D, E, A, B, C, PT[3]);

      P1.store_le128(PT, &W2[68]);
      F4(C, D, E, A, B, PT[0]);
      F4(B, C, D, E, A, PT[1]);
      F4(A, B, C, D, E, PT[2]);
      F4(E, A, B, C, D, PT[3]);

      P2.store_le128(PT, &W2[72]);
      F4(D, E, A, B, C, PT[0]);
      F4(C, D, E, A, B, PT[1]);
      F4(B, C, D, E, A, PT[2]);
      F4(A, B, C, D, E, PT[3]);

      P3.store_le128(PT, &W2[76]);
      F4(E, A, B, C, D, PT[0]);
      F4(D, E, A, B, C, PT[1]);
      F4(C, D, E, A, B, PT[2]);
      F4(B, C, D, E, A, PT[3]);

      // NOLINTEND(readability-suspicious-call-argument)

      // First block's chaining value update
      A = (digest[0] += A);
      B = (digest[1] += B);
      C = (digest[2] += C);
      D = (digest[3] += D);
      E = (digest[4] += E);

      // Second block, replayed with scalar rounds from the
      // pre-expanded schedule in W2 (round constants already included)
      F1(A, B, C, D, E, W2[0]);
      F1(E, A, B, C, D, W2[1]);
      F1(D, E, A, B, C, W2[2]);
      F1(C, D, E, A, B, W2[3]);
      F1(B, C, D, E, A, W2[4]);
      F1(A, B, C, D, E, W2[5]);
      F1(E, A, B, C, D, W2[6]);
      F1(D, E, A, B, C, W2[7]);
      F1(C, D, E, A, B, W2[8]);
      F1(B, C, D, E, A, W2[9]);
      F1(A, B, C, D, E, W2[10]);
      F1(E, A, B, C, D, W2[11]);
      F1(D, E, A, B, C, W2[12]);
      F1(C, D, E, A, B, W2[13]);
      F1(B, C, D, E, A, W2[14]);
      F1(A, B, C, D, E, W2[15]);
      F1(E, A, B, C, D, W2[16]);
      F1(D, E, A, B, C, W2[17]);
      F1(C, D, E, A, B, W2[18]);
      F1(B, C, D, E, A, W2[19]);
      F2(A, B, C, D, E, W2[20]);
      F2(E, A, B, C, D, W2[21]);
      F2(D, E, A, B, C, W2[22]);
      F2(C, D, E, A, B, W2[23]);
      F2(B, C, D, E, A, W2[24]);
      F2(A, B, C, D, E, W2[25]);
      F2(E, A, B, C, D, W2[26]);
      F2(D, E, A, B, C, W2[27]);
      F2(C, D, E, A, B, W2[28]);
      F2(B, C, D, E, A, W2[29]);
      F2(A, B, C, D, E, W2[30]);
      F2(E, A, B, C, D, W2[31]);
      F2(D, E, A, B, C, W2[32]);
      F2(C, D, E, A, B, W2[33]);
      F2(B, C, D, E, A, W2[34]);
      F2(A, B, C, D, E, W2[35]);
      F2(E, A, B, C, D, W2[36]);
      F2(D, E, A, B, C, W2[37]);
      F2(C, D, E, A, B, W2[38]);
      F2(B, C, D, E, A, W2[39]);
      F3(A, B, C, D, E, W2[40]);
      F3(E, A, B, C, D, W2[41]);
      F3(D, E, A, B, C, W2[42]);
      F3(C, D, E, A, B, W2[43]);
      F3(B, C, D, E, A, W2[44]);
      F3(A, B, C, D, E, W2[45]);
      F3(E, A, B, C, D, W2[46]);
      F3(D, E, A, B, C, W2[47]);
      F3(C, D, E, A, B, W2[48]);
      F3(B, C, D, E, A, W2[49]);
      F3(A, B, C, D, E, W2[50]);
      F3(E, A, B, C, D, W2[51]);
      F3(D, E, A, B, C, W2[52]);
      F3(C, D, E, A, B, W2[53]);
      F3(B, C, D, E, A, W2[54]);
      F3(A, B, C, D, E, W2[55]);
      F3(E, A, B, C, D, W2[56]);
      F3(D, E, A, B, C, W2[57]);
      F3(C, D, E, A, B, W2[58]);
      F3(B, C, D, E, A, W2[59]);
      F4(A, B, C, D, E, W2[60]);
      F4(E, A, B, C, D, W2[61]);
      F4(D, E, A, B, C, W2[62]);
      F4(C, D, E, A, B, W2[63]);
      F4(B, C, D, E, A, W2[64]);
      F4(A, B, C, D, E, W2[65]);
      F4(E, A, B, C, D, W2[66]);
      F4(D, E, A, B, C, W2[67]);
      F4(C, D, E, A, B, W2[68]);
      F4(B, C, D, E, A, W2[69]);
      F4(A, B, C, D, E, W2[70]);
      F4(E, A, B, C, D, W2[71]);
      F4(D, E, A, B, C, W2[72]);
      F4(C, D, E, A, B, W2[73]);
      F4(B, C, D, E, A, W2[74]);
      F4(A, B, C, D, E, W2[75]);
      F4(E, A, B, C, D, W2[76]);
      F4(D, E, A, B, C, W2[77]);
      F4(C, D, E, A, B, W2[78]);
      F4(B, C, D, E, A, W2[79]);

      // Second block's chaining value update
      A = (digest[0] += A);
      B = (digest[1] += B);
      C = (digest[2] += C);
      D = (digest[3] += D);
      E = (digest[4] += E);
   }

   // Remaining single block (blocks is now 0 or 1), using the 8-wide
   // expansion; here each P holds 8 consecutive schedule words + K
   for(size_t i = 0; i != blocks; ++i) {
      uint32_t PT[8];

      const auto block = in.take(block_bytes);

      SIMD_8x32 W0 = SIMD_8x32::load_be(&block[0]);  // NOLINT(*-container-data-pointer)
      SIMD_8x32 W2 = SIMD_8x32::load_be(&block[32]);

      // W[0..7] + K1 and W[8..15] + K1
      SIMD_8x32 P0 = W0 + K11;
      SIMD_8x32 P2 = W2 + K11;

      // As in the two-block loop, the K added to a freshly expanded P
      // is for the rounds 16 steps ahead, where it will be consumed;
      // K12/K34 cover the groups straddling a constant boundary
      P0.store_le(PT);
      P0 = sha1_avx2_next_w2(W0, W2) + K12;

      F1(A, B, C, D, E, PT[0]);
      F1(E, A, B, C, D, PT[1]);
      F1(D, E, A, B, C, PT[2]);
      F1(C, D, E, A, B, PT[3]);
      F1(B, C, D, E, A, PT[4]);
      F1(A, B, C, D, E, PT[5]);
      F1(E, A, B, C, D, PT[6]);
      F1(D, E, A, B, C, PT[7]);

      P2.store_le(PT);
      P2 = sha1_avx2_next_w2(W2, W0) + K22;

      F1(C, D, E, A, B, PT[0]);
      F1(B, C, D, E, A, PT[1]);
      F1(A, B, C, D, E, PT[2]);
      F1(E, A, B, C, D, PT[3]);
      F1(D, E, A, B, C, PT[4]);
      F1(C, D, E, A, B, PT[5]);
      F1(B, C, D, E, A, PT[6]);
      F1(A, B, C, D, E, PT[7]);

      P0.store_le(PT);
      P0 = sha1_avx2_next_w2(W0, W2) + K22;

      F1(E, A, B, C, D, PT[0]);
      F1(D, E, A, B, C, PT[1]);
      F1(C, D, E, A, B, PT[2]);
      F1(B, C, D, E, A, PT[3]);
      F2(A, B, C, D, E, PT[4]);
      F2(E, A, B, C, D, PT[5]);
      F2(D, E, A, B, C, PT[6]);
      F2(C, D, E, A, B, PT[7]);

      P2.store_le(PT);
      P2 = sha1_avx2_next_w2(W2, W0) + K33;

      F2(B, C, D, E, A, PT[0]);
      F2(A, B, C, D, E, PT[1]);
      F2(E, A, B, C, D, PT[2]);
      F2(D, E, A, B, C, PT[3]);
      F2(C, D, E, A, B, PT[4]);
      F2(B, C, D, E, A, PT[5]);
      F2(A, B, C, D, E, PT[6]);
      F2(E, A, B, C, D, PT[7]);

      P0.store_le(PT);
      P0 = sha1_avx2_next_w2(W0, W2) + K33;

      F2(D, E, A, B, C, PT[0]);
      F2(C, D, E, A, B, PT[1]);
      F2(B, C, D, E, A, PT[2]);
      F2(A, B, C, D, E, PT[3]);
      F2(E, A, B, C, D, PT[4]);
      F2(D, E, A, B, C, PT[5]);
      F2(C, D, E, A, B, PT[6]);
      F2(B, C, D, E, A, PT[7]);

      P2.store_le(PT);
      P2 = sha1_avx2_next_w2(W2, W0) + K34;

      F3(A, B, C, D, E, PT[0]);
      F3(E, A, B, C, D, PT[1]);
      F3(D, E, A, B, C, PT[2]);
      F3(C, D, E, A, B, PT[3]);
      F3(B, C, D, E, A, PT[4]);
      F3(A, B, C, D, E, PT[5]);
      F3(E, A, B, C, D, PT[6]);
      F3(D, E, A, B, C, PT[7]);

      P0.store_le(PT);
      P0 = sha1_avx2_next_w2(W0, W2) + K44;

      F3(C, D, E, A, B, PT[0]);
      F3(B, C, D, E, A, PT[1]);
      F3(A, B, C, D, E, PT[2]);
      F3(E, A, B, C, D, PT[3]);
      F3(D, E, A, B, C, PT[4]);
      F3(C, D, E, A, B, PT[5]);
      F3(B, C, D, E, A, PT[6]);
      F3(A, B, C, D, E, PT[7]);

      P2.store_le(PT);
      P2 = sha1_avx2_next_w2(W2, W0) + K44;

      F3(E, A, B, C, D, PT[0]);
      F3(D, E, A, B, C, PT[1]);
      F3(C, D, E, A, B, PT[2]);
      F3(B, C, D, E, A, PT[3]);
      F4(A, B, C, D, E, PT[4]);
      F4(E, A, B, C, D, PT[5]);
      F4(D, E, A, B, C, PT[6]);
      F4(C, D, E, A, B, PT[7]);

      // Schedule fully expanded; drain the last two P values
      P0.store_le(PT);

      F4(B, C, D, E, A, PT[0]);
      F4(A, B, C, D, E, PT[1]);
      F4(E, A, B, C, D, PT[2]);
      F4(D, E, A, B, C, PT[3]);
      F4(C, D, E, A, B, PT[4]);
      F4(B, C, D, E, A, PT[5]);
      F4(A, B, C, D, E, PT[6]);
      F4(E, A, B, C, D, PT[7]);

      P2.store_le(PT);

      F4(D, E, A, B, C, PT[0]);
      F4(C, D, E, A, B, PT[1]);
      F4(B, C, D, E, A, PT[2]);
      F4(A, B, C, D, E, PT[3]);
      F4(E, A, B, C, D, PT[4]);
      F4(D, E, A, B, C, PT[5]);
      F4(C, D, E, A, B, PT[6]);
      F4(B, C, D, E, A, PT[7]);

      // Chaining value update
      A = (digest[0] += A);
      B = (digest[1] += B);
      C = (digest[2] += C);
      D = (digest[3] += D);
      E = (digest[4] += E);
   }
}
556
557} // namespace Botan
static constexpr size_t block_bytes
Definition sha1.h:24
static BOTAN_FN_ISA_AVX2 SIMD_8x32 splat(uint32_t B) noexcept
Definition simd_avx2.h:58
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_be128(const uint8_t in1[], const uint8_t in2[]) noexcept
Definition simd_avx2.h:101
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_be(const uint8_t *in) noexcept
Definition simd_avx2.h:81
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
void F2(uint32_t A, uint32_t &B, uint32_t C, uint32_t D, uint32_t &E, uint32_t M)
Definition sha1_f.h:26
void F4(uint32_t A, uint32_t &B, uint32_t C, uint32_t D, uint32_t &E, uint32_t M)
Definition sha1_f.h:37
void F3(uint32_t A, uint32_t &B, uint32_t C, uint32_t D, uint32_t &E, uint32_t M)
Definition sha1_f.h:31
void F1(uint32_t A, uint32_t &B, uint32_t C, uint32_t D, uint32_t &E, uint32_t M)
Definition sha1_f.h:21
BOTAN_FORCE_INLINE constexpr T rotl(T input)
Definition rotate.h:23