7#include <botan/internal/sha1.h>
9#include <botan/internal/isa_extn.h>
10#include <botan/internal/sha1_f.h>
11#include <botan/internal/simd_avx2.h>
29 T0 ^=
SIMD_8x32(_mm256_alignr_epi8(XW1.raw(), XW0.raw(), 8));
31 T0 ^=
SIMD_8x32(_mm256_srli_si256(XW3.raw(), 4));
36 auto T2 =
SIMD_8x32(_mm256_slli_si256(T0.raw(), 3 * 4));
// Permute the eight 32-bit words of a vector according to the compile-time
// indices I0..I7, zeroing any output word whose index is negative.
//
// Output word n is input word In when In >= 0, and 0 otherwise.
57template <
int I0,
int I1,
int I2,
int I3,
int I4,
int I5,
int I6,
int I7>
// Index table handed to the cross-lane permute; one index per output word.
59 const __m256i tbl = _mm256_setr_epi32(I0, I1, I2, I3, I4, I5, I6, I7);
// All-ones for words with a non-negative index, zero for the rest. The permute
// itself cannot produce a zero word, so negative indices are handled by
// ANDing with this mask afterwards.
60 const __m256i mask = _mm256_setr_epi32(I0 >= 0 ? 0xFFFFFFFF : 0,
61 I1 >= 0 ? 0xFFFFFFFF : 0,
62 I2 >= 0 ? 0xFFFFFFFF : 0,
63 I3 >= 0 ? 0xFFFFFFFF : 0,
64 I4 >= 0 ? 0xFFFFFFFF : 0,
65 I5 >= 0 ? 0xFFFFFFFF : 0,
66 I6 >= 0 ? 0xFFFFFFFF : 0,
67 I7 >= 0 ? 0xFFFFFFFF : 0);
// Permute first, then clear the words that were requested as "empty".
69 return SIMD_8x32(_mm256_and_si256(mask, _mm256_permutevar8x32_epi32(v.raw(), tbl)));
120 WN ^= permute_words<5, 6, 7, -1, -1, -1, 0, 1>(W2);
123 WN ^= permute_words<2, 3, 4, 5, 6, 7, -1, -1>(W0);
126 auto T0 = permute_words<-1, -1, -1, 0, 1, 2, -1, -1>(WN).
rotl<2>();
132 WN ^= permute_words<-1, -1, -1, -1, -1, -1, 3, 4>(WN).
rotl<1>();
// Run the SHA-1 compression function over `input`, updating `digest` in place.
//
// The working variables A..E start out as digest[0..4]. After every 80 rounds
// they are added back into the digest and reloaded from it, so each block's
// result chains into the next one.
144void BOTAN_FN_ISA_AVX2_BMI2 SHA_1::avx2_compress_n(digest_type& digest, std::span<const uint8_t> input,
size_t blocks) {
145 using namespace SHA1_F;
// Mixed constant vectors: the first four arguments carry one round constant
// and the last four carry the next one.
152 const SIMD_8x32 K12(K1, K1, K1, K1, K2, K2, K2, K2);
153 const SIMD_8x32 K34(K3, K3, K3, K3, K4, K4, K4, K4);
// Load the current chaining state into the working variables.
155 uint32_t A = digest[0];
156 uint32_t B = digest[1];
157 uint32_t C = digest[2];
158 uint32_t D = digest[3];
159 uint32_t E = digest[4];
161 BufferSlicer in(input);
// Round inputs for a second block. Filled four words at a time by the
// store_le128 calls below and consumed only after the first block's 80 rounds.
167 uint32_t W2[80] = {0};
// First block, rounds 0-19 (F1). Each store_le128 writes four words to PT,
// which are used immediately, and four words to W2 for later.
// NOTE(review): presumably the two 128-bit halves of the vector - confirm
// against SIMD_8x32::store_le128.
184 P0.store_le128(PT, &W2[0]);
186 F1(A, B, C, D, E, PT[0]);
187 F1(E, A, B, C, D, PT[1]);
188 F1(D, E, A, B, C, PT[2]);
189 F1(C, D, E, A, B, PT[3]);
191 P1.store_le128(PT, &W2[4]);
193 F1(B, C, D, E, A, PT[0]);
194 F1(A, B, C, D, E, PT[1]);
195 F1(E, A, B, C, D, PT[2]);
196 F1(D, E, A, B, C, PT[3]);
198 P2.store_le128(PT, &W2[8]);
200 F1(C, D, E, A, B, PT[0]);
201 F1(B, C, D, E, A, PT[1]);
202 F1(A, B, C, D, E, PT[2]);
203 F1(E, A, B, C, D, PT[3]);
205 P3.store_le128(PT, &W2[12]);
207 F1(D, E, A, B, C, PT[0]);
208 F1(C, D, E, A, B, PT[1]);
209 F1(B, C, D, E, A, PT[2]);
210 F1(A, B, C, D, E, PT[3]);
212 P0.store_le128(PT, &W2[16]);
214 F1(E, A, B, C, D, PT[0]);
215 F1(D, E, A, B, C, PT[1]);
216 F1(C, D, E, A, B, PT[2]);
217 F1(B, C, D, E, A, PT[3]);
// First block, rounds 20-39 (F2).
219 P1.store_le128(PT, &W2[20]);
221 F2(A, B, C, D, E, PT[0]);
222 F2(E, A, B, C, D, PT[1]);
223 F2(D, E, A, B, C, PT[2]);
224 F2(C, D, E, A, B, PT[3]);
226 P2.store_le128(PT, &W2[24]);
228 F2(B, C, D, E, A, PT[0]);
229 F2(A, B, C, D, E, PT[1]);
230 F2(E, A, B, C, D, PT[2]);
231 F2(D, E, A, B, C, PT[3]);
233 P3.store_le128(PT, &W2[28]);
235 F2(C, D, E, A, B, PT[0]);
236 F2(B, C, D, E, A, PT[1]);
237 F2(A, B, C, D, E, PT[2]);
238 F2(E, A, B, C, D, PT[3]);
240 P0.store_le128(PT, &W2[32]);
242 F2(D, E, A, B, C, PT[0]);
243 F2(C, D, E, A, B, PT[1]);
244 F2(B, C, D, E, A, PT[2]);
245 F2(A, B, C, D, E, PT[3]);
247 P1.store_le128(PT, &W2[36]);
249 F2(E, A, B, C, D, PT[0]);
250 F2(D, E, A, B, C, PT[1]);
251 F2(C, D, E, A, B, PT[2]);
252 F2(B, C, D, E, A, PT[3]);
// First block, rounds 40-59 (F3).
254 P2.store_le128(PT, &W2[40]);
256 F3(A, B, C, D, E, PT[0]);
257 F3(E, A, B, C, D, PT[1]);
258 F3(D, E, A, B, C, PT[2]);
259 F3(C, D, E, A, B, PT[3]);
261 P3.store_le128(PT, &W2[44]);
263 F3(B, C, D, E, A, PT[0]);
264 F3(A, B, C, D, E, PT[1]);
265 F3(E, A, B, C, D, PT[2]);
266 F3(D, E, A, B, C, PT[3]);
268 P0.store_le128(PT, &W2[48]);
270 F3(C, D, E, A, B, PT[0]);
271 F3(B, C, D, E, A, PT[1]);
272 F3(A, B, C, D, E, PT[2]);
273 F3(E, A, B, C, D, PT[3]);
275 P1.store_le128(PT, &W2[52]);
277 F3(D, E, A, B, C, PT[0]);
278 F3(C, D, E, A, B, PT[1]);
279 F3(B, C, D, E, A, PT[2]);
280 F3(A, B, C, D, E, PT[3]);
282 P2.store_le128(PT, &W2[56]);
284 F3(E, A, B, C, D, PT[0]);
285 F3(D, E, A, B, C, PT[1]);
286 F3(C, D, E, A, B, PT[2]);
287 F3(B, C, D, E, A, PT[3]);
// First block, rounds 60-79 (F4).
289 P3.store_le128(PT, &W2[60]);
291 F4(A, B, C, D, E, PT[0]);
292 F4(E, A, B, C, D, PT[1]);
293 F4(D, E, A, B, C, PT[2]);
294 F4(C, D, E, A, B, PT[3]);
296 P0.store_le128(PT, &W2[64]);
297 F4(B, C, D, E, A, PT[0]);
298 F4(A, B, C, D, E, PT[1]);
299 F4(E, A, B, C, D, PT[2]);
300 F4(D, E, A, B, C, PT[3]);
302 P1.store_le128(PT, &W2[68]);
303 F4(C, D, E, A, B, PT[0]);
304 F4(B, C, D, E, A, PT[1]);
305 F4(A, B, C, D, E, PT[2]);
306 F4(E, A, B, C, D, PT[3]);
308 P2.store_le128(PT, &W2[72]);
309 F4(D, E, A, B, C, PT[0]);
310 F4(C, D, E, A, B, PT[1]);
311 F4(B, C, D, E, A, PT[2]);
312 F4(A, B, C, D, E, PT[3]);
314 P3.store_le128(PT, &W2[76]);
315 F4(E, A, B, C, D, PT[0]);
316 F4(D, E, A, B, C, PT[1]);
317 F4(C, D, E, A, B, PT[2]);
318 F4(B, C, D, E, A, PT[3]);
// Fold the first block's result into the digest and reload the working
// variables from it for the second block.
322 A = (digest[0] += A);
323 B = (digest[1] += B);
324 C = (digest[2] += C);
325 D = (digest[3] += D);
326 E = (digest[4] += E);
// Second block: all 80 round inputs were saved in W2 above, so these rounds
// are pure scalar work. F1 covers rounds 0-19, F2 20-39, F3 40-59, F4 60-79.
329 F1(A, B, C, D, E, W2[0]);
330 F1(E, A, B, C, D, W2[1]);
331 F1(D, E, A, B, C, W2[2]);
332 F1(C, D, E, A, B, W2[3]);
333 F1(B, C, D, E, A, W2[4]);
334 F1(A, B, C, D, E, W2[5]);
335 F1(E, A, B, C, D, W2[6]);
336 F1(D, E, A, B, C, W2[7]);
337 F1(C, D, E, A, B, W2[8]);
338 F1(B, C, D, E, A, W2[9]);
339 F1(A, B, C, D, E, W2[10]);
340 F1(E, A, B, C, D, W2[11]);
341 F1(D, E, A, B, C, W2[12]);
342 F1(C, D, E, A, B, W2[13]);
343 F1(B, C, D, E, A, W2[14]);
344 F1(A, B, C, D, E, W2[15]);
345 F1(E, A, B, C, D, W2[16]);
346 F1(D, E, A, B, C, W2[17]);
347 F1(C, D, E, A, B, W2[18]);
348 F1(B, C, D, E, A, W2[19]);
349 F2(A, B, C, D, E, W2[20]);
350 F2(E, A, B, C, D, W2[21]);
351 F2(D, E, A, B, C, W2[22]);
352 F2(C, D, E, A, B, W2[23]);
353 F2(B, C, D, E, A, W2[24]);
354 F2(A, B, C, D, E, W2[25]);
355 F2(E, A, B, C, D, W2[26]);
356 F2(D, E, A, B, C, W2[27]);
357 F2(C, D, E, A, B, W2[28]);
358 F2(B, C, D, E, A, W2[29]);
359 F2(A, B, C, D, E, W2[30]);
360 F2(E, A, B, C, D, W2[31]);
361 F2(D, E, A, B, C, W2[32]);
362 F2(C, D, E, A, B, W2[33]);
363 F2(B, C, D, E, A, W2[34]);
364 F2(A, B, C, D, E, W2[35]);
365 F2(E, A, B, C, D, W2[36]);
366 F2(D, E, A, B, C, W2[37]);
367 F2(C, D, E, A, B, W2[38]);
368 F2(B, C, D, E, A, W2[39]);
369 F3(A, B, C, D, E, W2[40]);
370 F3(E, A, B, C, D, W2[41]);
371 F3(D, E, A, B, C, W2[42]);
372 F3(C, D, E, A, B, W2[43]);
373 F3(B, C, D, E, A, W2[44]);
374 F3(A, B, C, D, E, W2[45]);
375 F3(E, A, B, C, D, W2[46]);
376 F3(D, E, A, B, C, W2[47]);
377 F3(C, D, E, A, B, W2[48]);
378 F3(B, C, D, E, A, W2[49]);
379 F3(A, B, C, D, E, W2[50]);
380 F3(E, A, B, C, D, W2[51]);
381 F3(D, E, A, B, C, W2[52]);
382 F3(C, D, E, A, B, W2[53]);
383 F3(B, C, D, E, A, W2[54]);
384 F3(A, B, C, D, E, W2[55]);
385 F3(E, A, B, C, D, W2[56]);
386 F3(D, E, A, B, C, W2[57]);
387 F3(C, D, E, A, B, W2[58]);
388 F3(B, C, D, E, A, W2[59]);
389 F4(A, B, C, D, E, W2[60]);
390 F4(E, A, B, C, D, W2[61]);
391 F4(D, E, A, B, C, W2[62]);
392 F4(C, D, E, A, B, W2[63]);
393 F4(B, C, D, E, A, W2[64]);
394 F4(A, B, C, D, E, W2[65]);
395 F4(E, A, B, C, D, W2[66]);
396 F4(D, E, A, B, C, W2[67]);
397 F4(C, D, E, A, B, W2[68]);
398 F4(B, C, D, E, A, W2[69]);
399 F4(A, B, C, D, E, W2[70]);
400 F4(E, A, B, C, D, W2[71]);
401 F4(D, E, A, B, C, W2[72]);
402 F4(C, D, E, A, B, W2[73]);
403 F4(B, C, D, E, A, W2[74]);
404 F4(A, B, C, D, E, W2[75]);
405 F4(E, A, B, C, D, W2[76]);
406 F4(D, E, A, B, C, W2[77]);
407 F4(C, D, E, A, B, W2[78]);
408 F4(B, C, D, E, A, W2[79]);
// Fold the second block's result into the digest.
410 A = (digest[0] += A);
411 B = (digest[1] += B);
412 C = (digest[2] += C);
413 D = (digest[3] += D);
414 E = (digest[4] += E);
// Per-block loop: runs `blocks` iterations, each performing 80 rounds and one
// digest update. PT supplies eight round inputs per group here.
// NOTE(review): presumably `blocks` only counts what the two-block path above
// left over - confirm where it is decremented.
417 for(
size_t i = 0; i != blocks; ++i) {
// W0 and W2 are used as vectors here (added to a SIMD_8x32 constant).
// NOTE(review): presumably loop-local vectors distinct from the uint32_t
// W2[80] array above - confirm.
//
// The constant added to each schedule vector follows the round function that
// consumes those eight words: K11 for rounds 0-15, K22 for 24-39, K33 for
// 40-55, K44 for 64-79. The mixed K12 and K34 cover the groups that straddle
// the F1/F2 boundary (16-23) and the F3/F4 boundary (56-63).
425 SIMD_8x32 P0 = W0 + K11;
426 SIMD_8x32 P2 = W2 + K11;
429 P0 = sha1_avx2_next_w2(W0, W2) + K12;
// Rounds 0-7.
431 F1(A, B, C, D, E, PT[0]);
432 F1(E, A, B, C, D, PT[1]);
433 F1(D, E, A, B, C, PT[2]);
434 F1(C, D, E, A, B, PT[3]);
435 F1(B, C, D, E, A, PT[4]);
436 F1(A, B, C, D, E, PT[5]);
437 F1(E, A, B, C, D, PT[6]);
438 F1(D, E, A, B, C, PT[7]);
441 P2 = sha1_avx2_next_w2(W2, W0) + K22;
// Rounds 8-15.
443 F1(C, D, E, A, B, PT[0]);
444 F1(B, C, D, E, A, PT[1]);
445 F1(A, B, C, D, E, PT[2]);
446 F1(E, A, B, C, D, PT[3]);
447 F1(D, E, A, B, C, PT[4]);
448 F1(C, D, E, A, B, PT[5]);
449 F1(B, C, D, E, A, PT[6]);
450 F1(A, B, C, D, E, PT[7]);
453 P0 = sha1_avx2_next_w2(W0, W2) + K22;
// Rounds 16-23: the round function switches from F1 to F2 after round 19.
455 F1(E, A, B, C, D, PT[0]);
456 F1(D, E, A, B, C, PT[1]);
457 F1(C, D, E, A, B, PT[2]);
458 F1(B, C, D, E, A, PT[3]);
459 F2(A, B, C, D, E, PT[4]);
460 F2(E, A, B, C, D, PT[5]);
461 F2(D, E, A, B, C, PT[6]);
462 F2(C, D, E, A, B, PT[7]);
465 P2 = sha1_avx2_next_w2(W2, W0) + K33;
// Rounds 24-31.
467 F2(B, C, D, E, A, PT[0]);
468 F2(A, B, C, D, E, PT[1]);
469 F2(E, A, B, C, D, PT[2]);
470 F2(D, E, A, B, C, PT[3]);
471 F2(C, D, E, A, B, PT[4]);
472 F2(B, C, D, E, A, PT[5]);
473 F2(A, B, C, D, E, PT[6]);
474 F2(E, A, B, C, D, PT[7]);
477 P0 = sha1_avx2_next_w2(W0, W2) + K33;
// Rounds 32-39.
479 F2(D, E, A, B, C, PT[0]);
480 F2(C, D, E, A, B, PT[1]);
481 F2(B, C, D, E, A, PT[2]);
482 F2(A, B, C, D, E, PT[3]);
483 F2(E, A, B, C, D, PT[4]);
484 F2(D, E, A, B, C, PT[5]);
485 F2(C, D, E, A, B, PT[6]);
486 F2(B, C, D, E, A, PT[7]);
489 P2 = sha1_avx2_next_w2(W2, W0) + K34;
// Rounds 40-47.
491 F3(A, B, C, D, E, PT[0]);
492 F3(E, A, B, C, D, PT[1]);
493 F3(D, E, A, B, C, PT[2]);
494 F3(C, D, E, A, B, PT[3]);
495 F3(B, C, D, E, A, PT[4]);
496 F3(A, B, C, D, E, PT[5]);
497 F3(E, A, B, C, D, PT[6]);
498 F3(D, E, A, B, C, PT[7]);
501 P0 = sha1_avx2_next_w2(W0, W2) + K44;
// Rounds 48-55.
503 F3(C, D, E, A, B, PT[0]);
504 F3(B, C, D, E, A, PT[1]);
505 F3(A, B, C, D, E, PT[2]);
506 F3(E, A, B, C, D, PT[3]);
507 F3(D, E, A, B, C, PT[4]);
508 F3(C, D, E, A, B, PT[5]);
509 F3(B, C, D, E, A, PT[6]);
510 F3(A, B, C, D, E, PT[7]);
513 P2 = sha1_avx2_next_w2(W2, W0) + K44;
// Rounds 56-63: the round function switches from F3 to F4 after round 59.
515 F3(E, A, B, C, D, PT[0]);
516 F3(D, E, A, B, C, PT[1]);
517 F3(C, D, E, A, B, PT[2]);
518 F3(B, C, D, E, A, PT[3]);
519 F4(A, B, C, D, E, PT[4]);
520 F4(E, A, B, C, D, PT[5]);
521 F4(D, E, A, B, C, PT[6]);
522 F4(C, D, E, A, B, PT[7]);
// Rounds 64-71.
526 F4(B, C, D, E, A, PT[0]);
527 F4(A, B, C, D, E, PT[1]);
528 F4(E, A, B, C, D, PT[2]);
529 F4(D, E, A, B, C, PT[3]);
530 F4(C, D, E, A, B, PT[4]);
531 F4(B, C, D, E, A, PT[5]);
532 F4(A, B, C, D, E, PT[6]);
533 F4(E, A, B, C, D, PT[7]);
// Rounds 72-79.
537 F4(D, E, A, B, C, PT[0]);
538 F4(C, D, E, A, B, PT[1]);
539 F4(B, C, D, E, A, PT[2]);
540 F4(A, B, C, D, E, PT[3]);
541 F4(E, A, B, C, D, PT[4]);
542 F4(D, E, A, B, C, PT[5]);
543 F4(C, D, E, A, B, PT[6]);
544 F4(B, C, D, E, A, PT[7]);
// Fold this block's result into the digest and reload the working variables
// from it for the next iteration.
546 A = (digest[0] += A);
547 B = (digest[1] += B);
548 C = (digest[2] += C);
549 D = (digest[3] += D);
550 E = (digest[4] += E);
static constexpr size_t block_bytes
static BOTAN_FN_ISA_AVX2 SIMD_8x32 splat(uint32_t B) noexcept
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_be128(const uint8_t in1[], const uint8_t in2[]) noexcept
static BOTAN_FN_ISA_AVX2 SIMD_8x32 load_be(const uint8_t *in) noexcept
#define BOTAN_FORCE_INLINE
void F2(uint32_t A, uint32_t &B, uint32_t C, uint32_t D, uint32_t &E, uint32_t M)
void F4(uint32_t A, uint32_t &B, uint32_t C, uint32_t D, uint32_t &E, uint32_t M)
void F3(uint32_t A, uint32_t &B, uint32_t C, uint32_t D, uint32_t &E, uint32_t M)
void F1(uint32_t A, uint32_t &B, uint32_t C, uint32_t D, uint32_t &E, uint32_t M)
BOTAN_FORCE_INLINE constexpr T rotl(T input)