7#include <botan/internal/sha1.h>
9#include <botan/internal/bit_ops.h>
10#include <botan/internal/isa_extn.h>
11#include <botan/internal/rotate.h>
12#include <botan/internal/sha1_f.h>
13#include <botan/internal/simd_4x32.h>
14#include <botan/internal/simd_avx2.h>
// Fragment of a message-schedule step (the enclosing function header is not
// visible in this excerpt).
// Within each 128-bit lane, alignr by 8 bytes yields the upper 64 bits of XW0
// followed by the lower 64 bits of XW1; that value is XORed into T0.
32 T0 ^=
SIMD_8x32(_mm256_alignr_epi8(XW1.raw(), XW0.raw(), 8));
// XOR in XW3 shifted right by one 32-bit word within each 128-bit lane
// (zeros are shifted in at the top of each lane).
34 T0 ^=
SIMD_8x32(_mm256_srli_si256(XW3.raw(), 4));
// T2 is T0 shifted left by three 32-bit words within each 128-bit lane: the
// lowest word of each lane ends up in the highest word, the rest are zero.
39 auto T2 =
SIMD_8x32(_mm256_slli_si256(T0.raw(), 3 * 4));
// Compile-time word permutation of an 8 x 32-bit vector with optional zeroing.
// NOTE(review): the function signature line is not visible in this excerpt;
// presumably this is the permute_words helper used further down - confirm.
// Template arguments I0..I7 give, for each output word, the index of the input
// word to copy; a negative value makes that output word zero.
60template <
int I0,
int I1,
int I2,
int I3,
int I4,
int I5,
int I6,
int I7>
// Index vector handed to the cross-lane permute.
62 const __m256i tbl = _mm256_setr_epi32(I0, I1, I2, I3, I4, I5, I6, I7);
// All-ones for every output word with a non-negative index, zero otherwise.
// NOTE(review): 0xFFFFFFFF is an unsigned literal that is implicitly converted
// to the int parameter of _mm256_setr_epi32; -1 would express the same bit
// pattern without the conversion.
63 const __m256i mask = _mm256_setr_epi32(I0 >= 0 ? 0xFFFFFFFF : 0,
64 I1 >= 0 ? 0xFFFFFFFF : 0,
65 I2 >= 0 ? 0xFFFFFFFF : 0,
66 I3 >= 0 ? 0xFFFFFFFF : 0,
67 I4 >= 0 ? 0xFFFFFFFF : 0,
68 I5 >= 0 ? 0xFFFFFFFF : 0,
69 I6 >= 0 ? 0xFFFFFFFF : 0,
70 I7 >= 0 ? 0xFFFFFFFF : 0);
// The permute only looks at the low 3 bits of each index, so a negative index
// still selects some word; the AND with mask is what forces it to zero.
72 return SIMD_8x32(_mm256_and_si256(mask, _mm256_permutevar8x32_epi32(v.raw(), tbl)));
// Fragment of a schedule-expansion step (enclosing function header not visible).
// The notes below assume permute_words<I0..I7>(v) places input word Ii in
// output word i and zeroes output words whose index is -1 - confirm against
// its definition.
// W2 words 5,6,7 -> output words 0,1,2; W2 words 0,1 -> output words 6,7.
123 WN ^= permute_words<5, 6, 7, -1, -1, -1, 0, 1>(W2);
// W0 words 2..7 -> output words 0..5; output words 6,7 are zero.
126 WN ^= permute_words<2, 3, 4, 5, 6, 7, -1, -1>(W0);
// WN words 0,1,2 -> output words 3,4,5, then rotl<2> is applied to the result.
129 auto T0 = permute_words<-1, -1, -1, 0, 1, 2, -1, -1>(WN).
rotl<2>();
// WN words 3,4 -> output words 6,7, rotl<1> applied, and the result is XORed
// back into WN (so WN's last two words depend on its own words 3 and 4).
135 WN ^= permute_words<-1, -1, -1, -1, -1, -1, 3, 4>(WN).
rotl<1>();
// AVX2 code path for the SHA-1 compression function.
//   digest - five 32-bit state words; read into A..E below and updated in place
//            after every 80 rounds.
//   input  - message bytes, consumed through a BufferSlicer.
//   blocks - block count; used as the bound of the per-block loop further down.
// NOTE(review): this excerpt omits several lines of the function (loads,
// some stores, loop headers), so the comments here describe only what is shown.
147void BOTAN_FN_ISA_AVX2_BMI2 SHA_1::avx2_compress_n(digest_type& digest, std::span<const uint8_t> input,
size_t blocks) {
// Presumably what brings F1..F4 and K1..K4 into scope unqualified - confirm.
148 using namespace SHA1_F;
// Mixed round-constant vectors: first four arguments are one constant, last
// four the next one.
155 const SIMD_8x32 K12(K1, K1, K1, K1, K2, K2, K2, K2);
156 const SIMD_8x32 K34(K3, K3, K3, K3, K4, K4, K4, K4);
// Working variables start out as a copy of the current digest.
158 uint32_t A = digest[0];
159 uint32_t B = digest[1];
160 uint32_t C = digest[2];
161 uint32_t D = digest[3];
162 uint32_t E = digest[4];
164 BufferSlicer in(input);
// 80 round inputs saved by the first 80-round pass and consumed by the second.
170 uint32_t W2[80] = {0};
// First 80-round pass. Each store_le128 call writes to two destinations: PT,
// whose four words feed the four rounds that follow, and a 4-word slot of W2,
// which is read back by the second pass further down.
// NOTE(review): presumably the two destinations receive the two 128-bit halves
// of the vector, i.e. the round inputs of two consecutive blocks - confirm
// against store_le128.
// The five working variables are passed in rotated order from round to round
// instead of being shuffled between rounds.
// Rounds 0-19: F1.
187 P0.store_le128(PT, &W2[0]);
189 F1(A, B, C, D, E, PT[0]);
190 F1(E, A, B, C, D, PT[1]);
191 F1(D, E, A, B, C, PT[2]);
192 F1(C, D, E, A, B, PT[3]);
194 P1.store_le128(PT, &W2[4]);
196 F1(B, C, D, E, A, PT[0]);
197 F1(A, B, C, D, E, PT[1]);
198 F1(E, A, B, C, D, PT[2]);
199 F1(D, E, A, B, C, PT[3]);
201 P2.store_le128(PT, &W2[8]);
203 F1(C, D, E, A, B, PT[0]);
204 F1(B, C, D, E, A, PT[1]);
205 F1(A, B, C, D, E, PT[2]);
206 F1(E, A, B, C, D, PT[3]);
208 P3.store_le128(PT, &W2[12]);
210 F1(D, E, A, B, C, PT[0]);
211 F1(C, D, E, A, B, PT[1]);
212 F1(B, C, D, E, A, PT[2]);
213 F1(A, B, C, D, E, PT[3]);
215 P0.store_le128(PT, &W2[16]);
217 F1(E, A, B, C, D, PT[0]);
218 F1(D, E, A, B, C, PT[1]);
219 F1(C, D, E, A, B, PT[2]);
220 F1(B, C, D, E, A, PT[3]);
// Rounds 20-39: F2.
222 P1.store_le128(PT, &W2[20]);
224 F2(A, B, C, D, E, PT[0]);
225 F2(E, A, B, C, D, PT[1]);
226 F2(D, E, A, B, C, PT[2]);
227 F2(C, D, E, A, B, PT[3]);
229 P2.store_le128(PT, &W2[24]);
231 F2(B, C, D, E, A, PT[0]);
232 F2(A, B, C, D, E, PT[1]);
233 F2(E, A, B, C, D, PT[2]);
234 F2(D, E, A, B, C, PT[3]);
236 P3.store_le128(PT, &W2[28]);
238 F2(C, D, E, A, B, PT[0]);
239 F2(B, C, D, E, A, PT[1]);
240 F2(A, B, C, D, E, PT[2]);
241 F2(E, A, B, C, D, PT[3]);
243 P0.store_le128(PT, &W2[32]);
245 F2(D, E, A, B, C, PT[0]);
246 F2(C, D, E, A, B, PT[1]);
247 F2(B, C, D, E, A, PT[2]);
248 F2(A, B, C, D, E, PT[3]);
250 P1.store_le128(PT, &W2[36]);
252 F2(E, A, B, C, D, PT[0]);
253 F2(D, E, A, B, C, PT[1]);
254 F2(C, D, E, A, B, PT[2]);
255 F2(B, C, D, E, A, PT[3]);
// Rounds 40-59: F3.
257 P2.store_le128(PT, &W2[40]);
259 F3(A, B, C, D, E, PT[0]);
260 F3(E, A, B, C, D, PT[1]);
261 F3(D, E, A, B, C, PT[2]);
262 F3(C, D, E, A, B, PT[3]);
264 P3.store_le128(PT, &W2[44]);
266 F3(B, C, D, E, A, PT[0]);
267 F3(A, B, C, D, E, PT[1]);
268 F3(E, A, B, C, D, PT[2]);
269 F3(D, E, A, B, C, PT[3]);
271 P0.store_le128(PT, &W2[48]);
273 F3(C, D, E, A, B, PT[0]);
274 F3(B, C, D, E, A, PT[1]);
275 F3(A, B, C, D, E, PT[2]);
276 F3(E, A, B, C, D, PT[3]);
278 P1.store_le128(PT, &W2[52]);
280 F3(D, E, A, B, C, PT[0]);
281 F3(C, D, E, A, B, PT[1]);
282 F3(B, C, D, E, A, PT[2]);
283 F3(A, B, C, D, E, PT[3]);
285 P2.store_le128(PT, &W2[56]);
287 F3(E, A, B, C, D, PT[0]);
288 F3(D, E, A, B, C, PT[1]);
289 F3(C, D, E, A, B, PT[2]);
290 F3(B, C, D, E, A, PT[3]);
// Rounds 60-79: F4.
292 P3.store_le128(PT, &W2[60]);
294 F4(A, B, C, D, E, PT[0]);
295 F4(E, A, B, C, D, PT[1]);
296 F4(D, E, A, B, C, PT[2]);
297 F4(C, D, E, A, B, PT[3]);
299 P0.store_le128(PT, &W2[64]);
300 F4(B, C, D, E, A, PT[0]);
301 F4(A, B, C, D, E, PT[1]);
302 F4(E, A, B, C, D, PT[2]);
303 F4(D, E, A, B, C, PT[3]);
305 P1.store_le128(PT, &W2[68]);
306 F4(C, D, E, A, B, PT[0]);
307 F4(B, C, D, E, A, PT[1]);
308 F4(A, B, C, D, E, PT[2]);
309 F4(E, A, B, C, D, PT[3]);
311 P2.store_le128(PT, &W2[72]);
312 F4(D, E, A, B, C, PT[0]);
313 F4(C, D, E, A, B, PT[1]);
314 F4(B, C, D, E, A, PT[2]);
315 F4(A, B, C, D, E, PT[3]);
317 P3.store_le128(PT, &W2[76]);
318 F4(E, A, B, C, D, PT[0]);
319 F4(D, E, A, B, C, PT[1]);
320 F4(C, D, E, A, B, PT[2]);
321 F4(B, C, D, E, A, PT[3]);
// Feed-forward: add each working variable into the digest, then reload the
// working variable with the updated digest word for the next 80 rounds.
325 A = (digest[0] += A);
326 B = (digest[1] += B);
327 C = (digest[2] += C);
328 D = (digest[3] += D);
329 E = (digest[4] += E);
// Second 80-round pass: same round sequence as the pass above, but the round
// inputs are read directly from W2[0..79], which that pass filled in.
// Rounds 0-19: F1.
332 F1(A, B, C, D, E, W2[0]);
333 F1(E, A, B, C, D, W2[1]);
334 F1(D, E, A, B, C, W2[2]);
335 F1(C, D, E, A, B, W2[3]);
336 F1(B, C, D, E, A, W2[4]);
337 F1(A, B, C, D, E, W2[5]);
338 F1(E, A, B, C, D, W2[6]);
339 F1(D, E, A, B, C, W2[7]);
340 F1(C, D, E, A, B, W2[8]);
341 F1(B, C, D, E, A, W2[9]);
342 F1(A, B, C, D, E, W2[10]);
343 F1(E, A, B, C, D, W2[11]);
344 F1(D, E, A, B, C, W2[12]);
345 F1(C, D, E, A, B, W2[13]);
346 F1(B, C, D, E, A, W2[14]);
347 F1(A, B, C, D, E, W2[15]);
348 F1(E, A, B, C, D, W2[16]);
349 F1(D, E, A, B, C, W2[17]);
350 F1(C, D, E, A, B, W2[18]);
351 F1(B, C, D, E, A, W2[19]);
// Rounds 20-39: F2.
352 F2(A, B, C, D, E, W2[20]);
353 F2(E, A, B, C, D, W2[21]);
354 F2(D, E, A, B, C, W2[22]);
355 F2(C, D, E, A, B, W2[23]);
356 F2(B, C, D, E, A, W2[24]);
357 F2(A, B, C, D, E, W2[25]);
358 F2(E, A, B, C, D, W2[26]);
359 F2(D, E, A, B, C, W2[27]);
360 F2(C, D, E, A, B, W2[28]);
361 F2(B, C, D, E, A, W2[29]);
362 F2(A, B, C, D, E, W2[30]);
363 F2(E, A, B, C, D, W2[31]);
364 F2(D, E, A, B, C, W2[32]);
365 F2(C, D, E, A, B, W2[33]);
366 F2(B, C, D, E, A, W2[34]);
367 F2(A, B, C, D, E, W2[35]);
368 F2(E, A, B, C, D, W2[36]);
369 F2(D, E, A, B, C, W2[37]);
370 F2(C, D, E, A, B, W2[38]);
371 F2(B, C, D, E, A, W2[39]);
// Rounds 40-59: F3.
372 F3(A, B, C, D, E, W2[40]);
373 F3(E, A, B, C, D, W2[41]);
374 F3(D, E, A, B, C, W2[42]);
375 F3(C, D, E, A, B, W2[43]);
376 F3(B, C, D, E, A, W2[44]);
377 F3(A, B, C, D, E, W2[45]);
378 F3(E, A, B, C, D, W2[46]);
379 F3(D, E, A, B, C, W2[47]);
380 F3(C, D, E, A, B, W2[48]);
381 F3(B, C, D, E, A, W2[49]);
382 F3(A, B, C, D, E, W2[50]);
383 F3(E, A, B, C, D, W2[51]);
384 F3(D, E, A, B, C, W2[52]);
385 F3(C, D, E, A, B, W2[53]);
386 F3(B, C, D, E, A, W2[54]);
387 F3(A, B, C, D, E, W2[55]);
388 F3(E, A, B, C, D, W2[56]);
389 F3(D, E, A, B, C, W2[57]);
390 F3(C, D, E, A, B, W2[58]);
391 F3(B, C, D, E, A, W2[59]);
// Rounds 60-79: F4.
392 F4(A, B, C, D, E, W2[60]);
393 F4(E, A, B, C, D, W2[61]);
394 F4(D, E, A, B, C, W2[62]);
395 F4(C, D, E, A, B, W2[63]);
396 F4(B, C, D, E, A, W2[64]);
397 F4(A, B, C, D, E, W2[65]);
398 F4(E, A, B, C, D, W2[66]);
399 F4(D, E, A, B, C, W2[67]);
400 F4(C, D, E, A, B, W2[68]);
401 F4(B, C, D, E, A, W2[69]);
402 F4(A, B, C, D, E, W2[70]);
403 F4(E, A, B, C, D, W2[71]);
404 F4(D, E, A, B, C, W2[72]);
405 F4(C, D, E, A, B, W2[73]);
406 F4(B, C, D, E, A, W2[74]);
407 F4(A, B, C, D, E, W2[75]);
408 F4(E, A, B, C, D, W2[76]);
409 F4(D, E, A, B, C, W2[77]);
410 F4(C, D, E, A, B, W2[78]);
411 F4(B, C, D, E, A, W2[79]);
// Feed-forward for this pass: fold A..E into the digest and reload them.
413 A = (digest[0] += A);
414 B = (digest[1] += B);
415 C = (digest[2] += C);
416 D = (digest[3] += D);
417 E = (digest[4] += E);
// Per-block loop: one iteration runs 80 rounds and one digest feed-forward.
// Here PT is indexed 0..7, so each fill of PT feeds eight rounds. The lines
// that load the block and store P0/P2 into PT are not visible in this excerpt.
420 for(
size_t i = 0; i != blocks; ++i) {
// Round inputs are the schedule words with the round constant already added.
428 SIMD_8x32 P0 = W0 + K11;
429 SIMD_8x32 P2 = W2 + K11;
// P0 and P2 are recomputed alternately with the arguments of
// sha1_avx2_next_w2 swapped each time.
// NOTE(review): the constant added appears to belong to the eight rounds two
// groups later (e.g. K12 lines up with the group that runs four F1 then four
// F2 rounds, K34 with four F3 then four F4) - confirm against the omitted
// store lines.
432 P0 = sha1_avx2_next_w2(W0, W2) + K12;
// Rounds 0-7 (F1).
434 F1(A, B, C, D, E, PT[0]);
435 F1(E, A, B, C, D, PT[1]);
436 F1(D, E, A, B, C, PT[2]);
437 F1(C, D, E, A, B, PT[3]);
438 F1(B, C, D, E, A, PT[4]);
439 F1(A, B, C, D, E, PT[5]);
440 F1(E, A, B, C, D, PT[6]);
441 F1(D, E, A, B, C, PT[7]);
444 P2 = sha1_avx2_next_w2(W2, W0) + K22;
// Rounds 8-15 (F1).
446 F1(C, D, E, A, B, PT[0]);
447 F1(B, C, D, E, A, PT[1]);
448 F1(A, B, C, D, E, PT[2]);
449 F1(E, A, B, C, D, PT[3]);
450 F1(D, E, A, B, C, PT[4]);
451 F1(C, D, E, A, B, PT[5]);
452 F1(B, C, D, E, A, PT[6]);
453 F1(A, B, C, D, E, PT[7]);
456 P0 = sha1_avx2_next_w2(W0, W2) + K22;
// Rounds 16-23: the round function switches from F1 to F2 mid-group.
458 F1(E, A, B, C, D, PT[0]);
459 F1(D, E, A, B, C, PT[1]);
460 F1(C, D, E, A, B, PT[2]);
461 F1(B, C, D, E, A, PT[3]);
462 F2(A, B, C, D, E, PT[4]);
463 F2(E, A, B, C, D, PT[5]);
464 F2(D, E, A, B, C, PT[6]);
465 F2(C, D, E, A, B, PT[7]);
468 P2 = sha1_avx2_next_w2(W2, W0) + K33;
// Rounds 24-31 (F2).
470 F2(B, C, D, E, A, PT[0]);
471 F2(A, B, C, D, E, PT[1]);
472 F2(E, A, B, C, D, PT[2]);
473 F2(D, E, A, B, C, PT[3]);
474 F2(C, D, E, A, B, PT[4]);
475 F2(B, C, D, E, A, PT[5]);
476 F2(A, B, C, D, E, PT[6]);
477 F2(E, A, B, C, D, PT[7]);
480 P0 = sha1_avx2_next_w2(W0, W2) + K33;
// Rounds 32-39 (F2).
482 F2(D, E, A, B, C, PT[0]);
483 F2(C, D, E, A, B, PT[1]);
484 F2(B, C, D, E, A, PT[2]);
485 F2(A, B, C, D, E, PT[3]);
486 F2(E, A, B, C, D, PT[4]);
487 F2(D, E, A, B, C, PT[5]);
488 F2(C, D, E, A, B, PT[6]);
489 F2(B, C, D, E, A, PT[7]);
492 P2 = sha1_avx2_next_w2(W2, W0) + K34;
// Rounds 40-47 (F3).
494 F3(A, B, C, D, E, PT[0]);
495 F3(E, A, B, C, D, PT[1]);
496 F3(D, E, A, B, C, PT[2]);
497 F3(C, D, E, A, B, PT[3]);
498 F3(B, C, D, E, A, PT[4]);
499 F3(A, B, C, D, E, PT[5]);
500 F3(E, A, B, C, D, PT[6]);
501 F3(D, E, A, B, C, PT[7]);
504 P0 = sha1_avx2_next_w2(W0, W2) + K44;
// Rounds 48-55 (F3).
506 F3(C, D, E, A, B, PT[0]);
507 F3(B, C, D, E, A, PT[1]);
508 F3(A, B, C, D, E, PT[2]);
509 F3(E, A, B, C, D, PT[3]);
510 F3(D, E, A, B, C, PT[4]);
511 F3(C, D, E, A, B, PT[5]);
512 F3(B, C, D, E, A, PT[6]);
513 F3(A, B, C, D, E, PT[7]);
516 P2 = sha1_avx2_next_w2(W2, W0) + K44;
// Rounds 56-63: the round function switches from F3 to F4 mid-group.
518 F3(E, A, B, C, D, PT[0]);
519 F3(D, E, A, B, C, PT[1]);
520 F3(C, D, E, A, B, PT[2]);
521 F3(B, C, D, E, A, PT[3]);
522 F4(A, B, C, D, E, PT[4]);
523 F4(E, A, B, C, D, PT[5]);
524 F4(D, E, A, B, C, PT[6]);
525 F4(C, D, E, A, B, PT[7]);
// Rounds 64-71 (F4); no further schedule vectors are computed from here on.
529 F4(B, C, D, E, A, PT[0]);
530 F4(A, B, C, D, E, PT[1]);
531 F4(E, A, B, C, D, PT[2]);
532 F4(D, E, A, B, C, PT[3]);
533 F4(C, D, E, A, B, PT[4]);
534 F4(B, C, D, E, A, PT[5]);
535 F4(A, B, C, D, E, PT[6]);
536 F4(E, A, B, C, D, PT[7]);
// Rounds 72-79 (F4).
540 F4(D, E, A, B, C, PT[0]);
541 F4(C, D, E, A, B, PT[1]);
542 F4(B, C, D, E, A, PT[2]);
543 F4(A, B, C, D, E, PT[3]);
544 F4(E, A, B, C, D, PT[4]);
545 F4(D, E, A, B, C, PT[5]);
546 F4(C, D, E, A, B, PT[6]);
547 F4(B, C, D, E, A, PT[7]);
// Feed-forward: fold A..E into the digest and reload them for the next block.
549 A = (digest[0] += A);
550 B = (digest[1] += B);
551 C = (digest[2] += C);
552 D = (digest[3] += D);
553 E = (digest[4] += E);
static constexpr size_t block_bytes
static BOTAN_FN_ISA_AVX2 SIMD_8x32 splat(uint32_t B) noexcept
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_be128(const uint8_t in1[], const uint8_t in2[]) noexcept
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_be(const uint8_t *in) noexcept
#define BOTAN_FORCE_INLINE
void F2(uint32_t A, uint32_t &B, uint32_t C, uint32_t D, uint32_t &E, uint32_t M)
void F4(uint32_t A, uint32_t &B, uint32_t C, uint32_t D, uint32_t &E, uint32_t M)
void F3(uint32_t A, uint32_t &B, uint32_t C, uint32_t D, uint32_t &E, uint32_t M)
void F1(uint32_t A, uint32_t &B, uint32_t C, uint32_t D, uint32_t &E, uint32_t M)
BOTAN_FORCE_INLINE constexpr T rotl(T input)