7#include <botan/internal/sha2_64.h>
9#include <botan/internal/isa_extn.h>
10#include <botan/internal/sha2_64_f.h>
11#include <botan/internal/simd_2x64.h>
12#include <botan/internal/simd_4x64.h>
// Helper templated over the vector type SIMD_T. The statements visible here
// evaluate one step of the SHA-512 message-schedule recurrence on SIMD_T values.
// The function's name, its signature and the remainder of its body are not part
// of this excerpt.
18template <
typename SIMD_T>
// t0 / t1 are built from neighbouring elements of x via SIMD_T::alignr8.
// NOTE(review): presumably these are the one-word-shifted views that supply the
// sigma0 input and the plain additive term of the recurrence -- confirm against
// the alignr8 definition, which is not shown here.
20 auto t0 = SIMD_T::alignr8(x[1], x[0]);
21 auto t1 = SIMD_T::alignr8(x[5], x[4]);
// s0 = rotr(1) ^ rotr(8) ^ shr(7) of t0: the rotate/shift amounts of SHA-512's
// small sigma0 function (FIPS 180-4).
23 auto s0 = t0.template
rotr<1>() ^ t0.template
rotr<8>() ^ t0.template shr<7>();
// s1 = rotr(19) ^ rotr(61) ^ shr(6) of x[7]: the amounts of SHA-512's small
// sigma1 function.
24 auto s1 = x[7].template
rotr<19>() ^ x[7].template
rotr<61>() ^ x[7].template shr<6>();
// Sum of the four terms computed above.
26 auto nx = x[0] + s0 + s1 + t1;
// SHA-512 compression routine tagged with the BOTAN_FN_ISA_AVX2_BMI2 function
// attribute. It reads and updates the eight 64-bit words of `digest` and takes
// its message bytes from `input`. The remainder of the parameter list is not
// visible in this excerpt.
42BOTAN_FN_ISA_AVX2_BMI2
void SHA_512::compress_digest_x86_avx2(digest_type& digest,
43 std::span<const uint8_t> input,
// K: the 80 SHA-512 round constants (FIPS 180-4, section 4.2.3), four per row,
// stored with 64-byte alignment.
46 alignas(64)
const uint64_t K[80] = {
47 0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
48 0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
49 0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
50 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
51 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
52 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
53 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
54 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
55 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
56 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
57 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
58 0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
59 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
60 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
61 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
62 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
63 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
64 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
65 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
66 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817,
// K2: a 2 * 80 entry table, 64-byte aligned, holding the SHA-512 round constants
// with every consecutive pair written twice in a row, so each row of four is
// { k[2i], k[2i+1], k[2i], k[2i+1] }.
// NOTE(review): presumably this layout lets one four-lane load supply the same
// two constants to both halves of a vector when two blocks are processed
// together -- confirm at the use site, which is not visible in this excerpt.
71 alignas(64)
const uint64_t K2[2 * 80] = {
72 0x428A2F98D728AE22, 0x7137449123EF65CD, 0x428A2F98D728AE22, 0x7137449123EF65CD,
73 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
74 0x3956C25BF348B538, 0x59F111F1B605D019, 0x3956C25BF348B538, 0x59F111F1B605D019,
75 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
76 0xD807AA98A3030242, 0x12835B0145706FBE, 0xD807AA98A3030242, 0x12835B0145706FBE,
77 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
78 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1,
79 0x9BDC06A725C71235, 0xC19BF174CF692694, 0x9BDC06A725C71235, 0xC19BF174CF692694,
80 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3,
81 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
82 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483,
83 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
84 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0x983E5152EE66DFAB, 0xA831C66D2DB43210,
85 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
86 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0xC6E00BF33DA88FC2, 0xD5A79147930AA725,
87 0x06CA6351E003826F, 0x142929670A0E6E70, 0x06CA6351E003826F, 0x142929670A0E6E70,
88 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x27B70A8546D22FFC, 0x2E1B21385C26C926,
89 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
90 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8,
91 0x81C2C92E47EDAEE6, 0x92722C851482353B, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
92 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xA2BFE8A14CF10364, 0xA81A664BBC423001,
93 0xC24B8B70D0F89791, 0xC76C51A30654BE30, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
94 0xD192E819D6EF5218, 0xD69906245565A910, 0xD192E819D6EF5218, 0xD69906245565A910,
95 0xF40E35855771202A, 0x106AA07032BBD1B8, 0xF40E35855771202A, 0x106AA07032BBD1B8,
96 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x19A4C116B8D2D0C8, 0x1E376C085141AB53,
97 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
98 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB,
99 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
100 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x748F82EE5DEFB2FC, 0x78A5636F43172F60,
101 0x84C87814A1F0AB72, 0x8CC702081A6439EC, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
102 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9,
103 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
104 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xCA273ECEEA26619C, 0xD186B8C721C0C207,
105 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
106 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x06F067AA72176FBA, 0x0A637DC5A2C898A6,
107 0x113F9804BEF90DAE, 0x1B710B35131C471B, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
108 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x28DB77F523047D84, 0x32CAAB7B40C72493,
109 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
110 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A,
111 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817,
// Scratch buffers, both 64-byte aligned. W has 16 entries and is zero-initialised.
// W2 has 80 entries and is left uninitialised here.
// NOTE(review): W2 appears to be fully populated by the store_le2 calls further
// down before any W2[...] round reads it -- confirm against the full source.
115 alignas(64) uint64_t W[16] = {0};
116 alignas(64) uint64_t W2[80];
// Copy the eight digest words into the working variables A..H.
118 uint64_t A = digest[0];
119 uint64_t B = digest[1];
120 uint64_t C = digest[2];
121 uint64_t D = digest[3];
122 uint64_t E = digest[4];
123 uint64_t F = digest[5];
124 uint64_t G = digest[6];
125 uint64_t H = digest[7];
// Raw pointer to the first byte of the input span.
127 const uint8_t* data = input.data();
// Eight iterations, each storing through store_le2 to W[2*i] and W2[2*i].
// WK is defined on lines that are not part of this excerpt.
// NOTE(review): going by the name, WK presumably holds message words with the
// round constant already added, with one half going to W and the other to W2 --
// confirm against the full source and the store_le2 definition.
132 for(
size_t i = 0; i < 8; i++) {
135 WK.store_le2(&W[2 * i], &W2[2 * i]);
// Main loop: r takes the values 0, 16, 32 and 48. Each store_le2 refreshes one
// pair of W entries and writes a pair at W2[r + 16 ...]. The definition of `w`
// and the rounds that consume W[0]..W[9] are on lines not shown here.
142 for(
size_t r = 0; r != 64; r += 16) {
146 w.store_le2(&W[0], &W2[r + 16]);
151 w.store_le2(&W[2], &W2[r + 18]);
156 w.store_le2(&W[4], &W2[r + 20]);
161 w.store_le2(&W[6], &W2[r + 22]);
166 w.store_le2(&W[8], &W2[r + 24]);
// Each SHA2_64_F call takes the working variables shifted by one position
// relative to the previous call, so the variables are rotated through the
// argument list rather than copied between rounds. The nine-argument SHA2_64_F
// form used here is declared outside this excerpt.
169 SHA2_64_F(G, H, A, B, C, D, E, F, W[10]);
170 SHA2_64_F(F, G, H, A, B, C, D, E, W[11]);
171 w.store_le2(&W[10], &W2[r + 26]);
174 SHA2_64_F(E, F, G, H, A, B, C, D, W[12]);
175 SHA2_64_F(D, E, F, G, H, A, B, C, W[13]);
176 w.store_le2(&W[12], &W2[r + 28]);
179 SHA2_64_F(C, D, E, F, G, H, A, B, W[14]);
180 SHA2_64_F(B, C, D, E, F, G, H, A, W[15]);
181 w.store_le2(&W[14], &W2[r + 30]);
// Rounds over W[10]..W[15] with no further stores into W or W2 in between.
// NOTE(review): presumably the tail of the final 16-round group after the loop;
// the rounds for W[0]..W[9] of this group are not visible.
195 SHA2_64_F(G, H, A, B, C, D, E, F, W[10]);
196 SHA2_64_F(F, G, H, A, B, C, D, E, W[11]);
197 SHA2_64_F(E, F, G, H, A, B, C, D, W[12]);
198 SHA2_64_F(D, E, F, G, H, A, B, C, W[13]);
199 SHA2_64_F(C, D, E, F, G, H, A, B, W[14]);
200 SHA2_64_F(B, C, D, E, F, G, H, A, W[15]);
// Feed-forward: add each working variable into its digest word, then reload the
// working variable with the updated digest word.
202 A = (digest[0] += A);
203 B = (digest[1] += B);
204 C = (digest[2] += C);
205 D = (digest[3] += D);
206 E = (digest[4] += E);
207 F = (digest[5] += F);
208 G = (digest[6] += G);
209 H = (digest[7] += H);
// 80 fully unrolled rounds that consume W2[0] through W2[79] in order. No
// store into W2 is visible between these rounds. The working variables are
// rotated through the argument list, returning to the order
// (A, B, C, D, E, F, G, H) every eight rounds.
// NOTE(review): W2 presumably holds the precomputed per-round inputs for a
// second message block, written by earlier store_le2 calls -- confirm against
// the full source.
212 SHA2_64_F(A, B, C, D, E, F, G, H, W2[0]);
213 SHA2_64_F(H, A, B, C, D, E, F, G, W2[1]);
214 SHA2_64_F(G, H, A, B, C, D, E, F, W2[2]);
215 SHA2_64_F(F, G, H, A, B, C, D, E, W2[3]);
216 SHA2_64_F(E, F, G, H, A, B, C, D, W2[4]);
217 SHA2_64_F(D, E, F, G, H, A, B, C, W2[5]);
218 SHA2_64_F(C, D, E, F, G, H, A, B, W2[6]);
219 SHA2_64_F(B, C, D, E, F, G, H, A, W2[7]);
220 SHA2_64_F(A, B, C, D, E, F, G, H, W2[8]);
221 SHA2_64_F(H, A, B, C, D, E, F, G, W2[9]);
222 SHA2_64_F(G, H, A, B, C, D, E, F, W2[10]);
223 SHA2_64_F(F, G, H, A, B, C, D, E, W2[11]);
224 SHA2_64_F(E, F, G, H, A, B, C, D, W2[12]);
225 SHA2_64_F(D, E, F, G, H, A, B, C, W2[13]);
226 SHA2_64_F(C, D, E, F, G, H, A, B, W2[14]);
227 SHA2_64_F(B, C, D, E, F, G, H, A, W2[15]);
// Rounds 16..31.
229 SHA2_64_F(A, B, C, D, E, F, G, H, W2[16]);
230 SHA2_64_F(H, A, B, C, D, E, F, G, W2[17]);
231 SHA2_64_F(G, H, A, B, C, D, E, F, W2[18]);
232 SHA2_64_F(F, G, H, A, B, C, D, E, W2[19]);
233 SHA2_64_F(E, F, G, H, A, B, C, D, W2[20]);
234 SHA2_64_F(D, E, F, G, H, A, B, C, W2[21]);
235 SHA2_64_F(C, D, E, F, G, H, A, B, W2[22]);
236 SHA2_64_F(B, C, D, E, F, G, H, A, W2[23]);
237 SHA2_64_F(A, B, C, D, E, F, G, H, W2[24]);
238 SHA2_64_F(H, A, B, C, D, E, F, G, W2[25]);
239 SHA2_64_F(G, H, A, B, C, D, E, F, W2[26]);
240 SHA2_64_F(F, G, H, A, B, C, D, E, W2[27]);
241 SHA2_64_F(E, F, G, H, A, B, C, D, W2[28]);
242 SHA2_64_F(D, E, F, G, H, A, B, C, W2[29]);
243 SHA2_64_F(C, D, E, F, G, H, A, B, W2[30]);
244 SHA2_64_F(B, C, D, E, F, G, H, A, W2[31]);
// Rounds 32..47.
246 SHA2_64_F(A, B, C, D, E, F, G, H, W2[32]);
247 SHA2_64_F(H, A, B, C, D, E, F, G, W2[33]);
248 SHA2_64_F(G, H, A, B, C, D, E, F, W2[34]);
249 SHA2_64_F(F, G, H, A, B, C, D, E, W2[35]);
250 SHA2_64_F(E, F, G, H, A, B, C, D, W2[36]);
251 SHA2_64_F(D, E, F, G, H, A, B, C, W2[37]);
252 SHA2_64_F(C, D, E, F, G, H, A, B, W2[38]);
253 SHA2_64_F(B, C, D, E, F, G, H, A, W2[39]);
254 SHA2_64_F(A, B, C, D, E, F, G, H, W2[40]);
255 SHA2_64_F(H, A, B, C, D, E, F, G, W2[41]);
256 SHA2_64_F(G, H, A, B, C, D, E, F, W2[42]);
257 SHA2_64_F(F, G, H, A, B, C, D, E, W2[43]);
258 SHA2_64_F(E, F, G, H, A, B, C, D, W2[44]);
259 SHA2_64_F(D, E, F, G, H, A, B, C, W2[45]);
260 SHA2_64_F(C, D, E, F, G, H, A, B, W2[46]);
261 SHA2_64_F(B, C, D, E, F, G, H, A, W2[47]);
// Rounds 48..63.
263 SHA2_64_F(A, B, C, D, E, F, G, H, W2[48]);
264 SHA2_64_F(H, A, B, C, D, E, F, G, W2[49]);
265 SHA2_64_F(G, H, A, B, C, D, E, F, W2[50]);
266 SHA2_64_F(F, G, H, A, B, C, D, E, W2[51]);
267 SHA2_64_F(E, F, G, H, A, B, C, D, W2[52]);
268 SHA2_64_F(D, E, F, G, H, A, B, C, W2[53]);
269 SHA2_64_F(C, D, E, F, G, H, A, B, W2[54]);
270 SHA2_64_F(B, C, D, E, F, G, H, A, W2[55]);
271 SHA2_64_F(A, B, C, D, E, F, G, H, W2[56]);
272 SHA2_64_F(H, A, B, C, D, E, F, G, W2[57]);
273 SHA2_64_F(G, H, A, B, C, D, E, F, W2[58]);
274 SHA2_64_F(F, G, H, A, B, C, D, E, W2[59]);
275 SHA2_64_F(E, F, G, H, A, B, C, D, W2[60]);
276 SHA2_64_F(D, E, F, G, H, A, B, C, W2[61]);
277 SHA2_64_F(C, D, E, F, G, H, A, B, W2[62]);
278 SHA2_64_F(B, C, D, E, F, G, H, A, W2[63]);
// Rounds 64..79.
280 SHA2_64_F(A, B, C, D, E, F, G, H, W2[64]);
281 SHA2_64_F(H, A, B, C, D, E, F, G, W2[65]);
282 SHA2_64_F(G, H, A, B, C, D, E, F, W2[66]);
283 SHA2_64_F(F, G, H, A, B, C, D, E, W2[67]);
284 SHA2_64_F(E, F, G, H, A, B, C, D, W2[68]);
285 SHA2_64_F(D, E, F, G, H, A, B, C, W2[69]);
286 SHA2_64_F(C, D, E, F, G, H, A, B, W2[70]);
287 SHA2_64_F(B, C, D, E, F, G, H, A, W2[71]);
288 SHA2_64_F(A, B, C, D, E, F, G, H, W2[72]);
289 SHA2_64_F(H, A, B, C, D, E, F, G, W2[73]);
290 SHA2_64_F(G, H, A, B, C, D, E, F, W2[74]);
291 SHA2_64_F(F, G, H, A, B, C, D, E, W2[75]);
292 SHA2_64_F(E, F, G, H, A, B, C, D, W2[76]);
293 SHA2_64_F(D, E, F, G, H, A, B, C, W2[77]);
294 SHA2_64_F(C, D, E, F, G, H, A, B, W2[78]);
295 SHA2_64_F(B, C, D, E, F, G, H, A, W2[79]);
// Feed-forward: add each working variable into its digest word, then reload the
// working variable with the updated digest word.
297 A = (digest[0] += A);
298 B = (digest[1] += B);
299 C = (digest[2] += C);
300 D = (digest[3] += D);
301 E = (digest[4] += E);
302 F = (digest[5] += F);
303 G = (digest[6] += G);
304 H = (digest[7] += H);
// Variant that writes only to W: store_le takes a single destination and W2 is
// not touched anywhere in this section.
// NOTE(review): presumably the path for a remaining single block after the
// paired-block processing -- the enclosing control flow is not visible here.
310 for(
size_t i = 0; i < 8; i++) {
313 WK.store_le(&W[2 * i]);
// r takes the values 0, 16, 32 and 48. Only the rounds over W[10]..W[15] are
// visible; the schedule updates and the rounds for W[0]..W[9] are on lines not
// shown in this excerpt.
320 for(
size_t r = 0; r != 64; r += 16) {
347 SHA2_64_F(G, H, A, B, C, D, E, F, W[10]);
348 SHA2_64_F(F, G, H, A, B, C, D, E, W[11]);
352 SHA2_64_F(E, F, G, H, A, B, C, D, W[12]);
353 SHA2_64_F(D, E, F, G, H, A, B, C, W[13]);
357 SHA2_64_F(C, D, E, F, G, H, A, B, W[14]);
358 SHA2_64_F(B, C, D, E, F, G, H, A, W[15]);
// Rounds over W[10]..W[15] listed back to back.
// NOTE(review): presumably the tail of the final 16-round group after the loop.
373 SHA2_64_F(G, H, A, B, C, D, E, F, W[10]);
374 SHA2_64_F(F, G, H, A, B, C, D, E, W[11]);
375 SHA2_64_F(E, F, G, H, A, B, C, D, W[12]);
376 SHA2_64_F(D, E, F, G, H, A, B, C, W[13]);
377 SHA2_64_F(C, D, E, F, G, H, A, B, W[14]);
378 SHA2_64_F(B, C, D, E, F, G, H, A, W[15]);
// Feed-forward: add each working variable into its digest word, then reload the
// working variable with the updated digest word.
380 A = (digest[0] += A);
381 B = (digest[1] += B);
382 C = (digest[2] += C);
383 D = (digest[3] += D);
384 E = (digest[4] += E);
385 F = (digest[5] += F);
386 G = (digest[6] += G);
387 H = (digest[7] += H);
static SIMD_2x64 load_le(const void *in)
static SIMD_2x64 load_be(const void *in)
static BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 load_be2(const void *lo, const void *hi)
static BOTAN_FN_ISA_SIMD_4X64 SIMD_4x64 load_le(const void *in)
#define BOTAN_FORCE_INLINE
BOTAN_FORCE_INLINE void SHA2_64_F(uint64_t A, uint64_t B, uint64_t C, uint64_t &D, uint64_t E, uint64_t F, uint64_t G, uint64_t &H, uint64_t &M1, uint64_t M2, uint64_t M3, uint64_t M4, uint64_t magic)
BOTAN_FORCE_INLINE constexpr T rotr(T input)