Botan 3.10.0
Crypto and TLS for C++
sha2_64_avx512.cpp
Go to the documentation of this file.
1/*
2* (C) 2025 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/sha2_64.h>
8
9#include <botan/internal/isa_extn.h>
10#include <botan/internal/sha2_64_f.h>
11#include <botan/internal/simd_2x64.h>
12#include <botan/internal/simd_8x64.h>
13
14namespace Botan {
15
16namespace {
17
18template <typename SIMD_T>
19BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_BMI2 SIMD_T sha512_next_w_avx512(SIMD_T x[8]) {
20 auto t0 = SIMD_T::alignr8(x[1], x[0]);
21 auto t1 = SIMD_T::alignr8(x[5], x[4]);
22
23 auto s0 = t0.template rotr<1>() ^ t0.template rotr<8>() ^ t0.template shr<7>();
24 auto s1 = x[7].template rotr<19>() ^ x[7].template rotr<61>() ^ x[7].template shr<6>();
25
26 auto nx = x[0] + s0 + s1 + t1;
27
28 x[0] = x[1];
29 x[1] = x[2];
30 x[2] = x[3];
31 x[3] = x[4];
32 x[4] = x[5];
33 x[5] = x[6];
34 x[6] = x[7];
35 x[7] = nx;
36
37 return x[7];
38}
39
40} // namespace
41
/*
* SHA-512 compression using 512-bit vectors (SIMD_8x64), with a 128-bit
* (SIMD_2x64) path for leftover blocks.
*
* Folds `blocks` consecutive 128-byte message blocks, read starting at
* input.data(), into the eight 64-bit words of `digest`.
*
* Batches of four blocks are handled together: the message schedules of all
* four are expanded in SIMD_8x64 vectors while the rounds of the first block
* are being executed, and the schedules of the other three are parked in WN
* and compressed afterwards. Whatever remains (fewer than four blocks) is
* processed one block at a time with SIMD_2x64 vectors.
*
* @param digest chaining state; updated in place after every block
* @param input message data. Only input.data() is used here, so nothing in
*        this function checks that it really holds blocks * 128 bytes
*        (NOTE(review): presumably guaranteed by the caller - confirm)
* @param blocks number of 128-byte blocks to process
*/
42BOTAN_FN_ISA_AVX512_BMI2 void SHA_512::compress_digest_x86_avx512(digest_type& digest,
43 std::span<const uint8_t> input,
44 size_t blocks) {
45 // clang-format off
 // The 80 SHA-512 round constants, one per round; used by the single-block path
46 alignas(64) const uint64_t K[80] = {
47 0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
48 0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
49 0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
50 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
51 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
52 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
53 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
54 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
55 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
56 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
57 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
58 0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
59 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
60 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
61 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
62 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
63 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
64 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
65 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
66 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817,
67 };
68
69 // K4 is each pair of elements in K repeated 4 times, since we are performing
70 // 4 parallel message expansions
 // Each pair of rounds therefore occupies 8 entries, so the entries for an
 // even round j start at K4[4 * j].
71 alignas(64) const uint64_t K4[4 * 80] = {
72 0x428A2F98D728AE22, 0x7137449123EF65CD, 0x428A2F98D728AE22, 0x7137449123EF65CD,
73 0x428A2F98D728AE22, 0x7137449123EF65CD, 0x428A2F98D728AE22, 0x7137449123EF65CD,
74 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
75 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
76 0x3956C25BF348B538, 0x59F111F1B605D019, 0x3956C25BF348B538, 0x59F111F1B605D019,
77 0x3956C25BF348B538, 0x59F111F1B605D019, 0x3956C25BF348B538, 0x59F111F1B605D019,
78 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
79 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
80 0xD807AA98A3030242, 0x12835B0145706FBE, 0xD807AA98A3030242, 0x12835B0145706FBE,
81 0xD807AA98A3030242, 0x12835B0145706FBE, 0xD807AA98A3030242, 0x12835B0145706FBE,
82 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
83 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
84 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1,
85 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1,
86 0x9BDC06A725C71235, 0xC19BF174CF692694, 0x9BDC06A725C71235, 0xC19BF174CF692694,
87 0x9BDC06A725C71235, 0xC19BF174CF692694, 0x9BDC06A725C71235, 0xC19BF174CF692694,
88 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3,
89 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3,
90 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
91 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
92 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483,
93 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483,
94 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
95 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
96 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0x983E5152EE66DFAB, 0xA831C66D2DB43210,
97 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0x983E5152EE66DFAB, 0xA831C66D2DB43210,
98 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
99 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
100 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0xC6E00BF33DA88FC2, 0xD5A79147930AA725,
101 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0xC6E00BF33DA88FC2, 0xD5A79147930AA725,
102 0x06CA6351E003826F, 0x142929670A0E6E70, 0x06CA6351E003826F, 0x142929670A0E6E70,
103 0x06CA6351E003826F, 0x142929670A0E6E70, 0x06CA6351E003826F, 0x142929670A0E6E70,
104 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x27B70A8546D22FFC, 0x2E1B21385C26C926,
105 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x27B70A8546D22FFC, 0x2E1B21385C26C926,
106 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
107 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
108 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8,
109 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8,
110 0x81C2C92E47EDAEE6, 0x92722C851482353B, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
111 0x81C2C92E47EDAEE6, 0x92722C851482353B, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
112 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xA2BFE8A14CF10364, 0xA81A664BBC423001,
113 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xA2BFE8A14CF10364, 0xA81A664BBC423001,
114 0xC24B8B70D0F89791, 0xC76C51A30654BE30, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
115 0xC24B8B70D0F89791, 0xC76C51A30654BE30, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
116 0xD192E819D6EF5218, 0xD69906245565A910, 0xD192E819D6EF5218, 0xD69906245565A910,
117 0xD192E819D6EF5218, 0xD69906245565A910, 0xD192E819D6EF5218, 0xD69906245565A910,
118 0xF40E35855771202A, 0x106AA07032BBD1B8, 0xF40E35855771202A, 0x106AA07032BBD1B8,
119 0xF40E35855771202A, 0x106AA07032BBD1B8, 0xF40E35855771202A, 0x106AA07032BBD1B8,
120 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x19A4C116B8D2D0C8, 0x1E376C085141AB53,
121 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x19A4C116B8D2D0C8, 0x1E376C085141AB53,
122 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
123 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
124 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB,
125 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB,
126 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
127 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
128 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x748F82EE5DEFB2FC, 0x78A5636F43172F60,
129 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x748F82EE5DEFB2FC, 0x78A5636F43172F60,
130 0x84C87814A1F0AB72, 0x8CC702081A6439EC, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
131 0x84C87814A1F0AB72, 0x8CC702081A6439EC, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
132 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9,
133 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9,
134 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
135 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
136 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xCA273ECEEA26619C, 0xD186B8C721C0C207,
137 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xCA273ECEEA26619C, 0xD186B8C721C0C207,
138 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
139 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
140 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x06F067AA72176FBA, 0x0A637DC5A2C898A6,
141 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x06F067AA72176FBA, 0x0A637DC5A2C898A6,
142 0x113F9804BEF90DAE, 0x1B710B35131C471B, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
143 0x113F9804BEF90DAE, 0x1B710B35131C471B, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
144 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x28DB77F523047D84, 0x32CAAB7B40C72493,
145 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x28DB77F523047D84, 0x32CAAB7B40C72493,
146 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
147 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
148 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A,
149 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A,
150 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817,
151 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817,
152 };
153 // clang-format on
154
 // W: 16-entry buffer of (message word + round constant) values for the block
 // whose rounds are currently being executed.
 // WN[b]: all 80 such values for the other three blocks of a four-block batch.
155 alignas(64) uint64_t W[16] = {0};
156 alignas(64) uint64_t WN[3][80];
157
 // Working variables start out as the current chaining state
158 uint64_t A = digest[0];
159 uint64_t B = digest[1];
160 uint64_t C = digest[2];
161 uint64_t D = digest[3];
162 uint64_t E = digest[4];
163 uint64_t F = digest[5];
164 uint64_t G = digest[6];
165 uint64_t H = digest[7];
166
167 const uint8_t* data = input.data();
168
 // Main path: consume four 128-byte blocks per iteration
169 while(blocks >= 4) {
170 SIMD_8x64 WS[8];
171
 // Load 16 bytes (two words) at offset 16*i from each of the four blocks,
 // add the matching round constants and split the result: the first block's
 // values go to W, those of blocks 2-4 to WN[0..2].
 // NOTE(review): assumes load_be4/store_le4 map one 128-bit lane to each
 // pointer argument, in order - confirm against simd_8x64.h
172 for(size_t i = 0; i < 8; i++) {
173 WS[i] = SIMD_8x64::load_be4(
174 &data[16 * i], &data[1 * 128 + 16 * i], &data[2 * 128 + 16 * i], &data[3 * 128 + 16 * i]);
175 auto WK = WS[i] + SIMD_8x64::load_le(&K4[8 * i]);
176 WK.store_le4(&W[2 * i], &WN[0][2 * i], &WN[1][2 * i], &WN[2][2 * i]);
177 }
178
179 data += 4 * 128;
180 blocks -= 4;
181
182 // First 64 rounds of SHA-512
 // Each group below expands the schedule for two rounds 16 ahead (for all
 // four blocks), runs two rounds of the first block, and only then lets
 // store_le4 overwrite the two W slots that were just passed to SHA2_64_F.
 // The working variables are passed in rotated order on each successive
 // call instead of being moved between variables.
183 for(size_t r = 0; r != 64; r += 16) {
184 auto w = sha512_next_w_avx512(WS) + SIMD_8x64::load_le(&K4[4 * (r + 16)]);
185 SHA2_64_F(A, B, C, D, E, F, G, H, W[0]);
186 SHA2_64_F(H, A, B, C, D, E, F, G, W[1]);
187 w.store_le4(&W[0], &WN[0][r + 16], &WN[1][r + 16], &WN[2][r + 16]);
188
189 w = sha512_next_w_avx512(WS) + SIMD_8x64::load_le(&K4[4 * (r + 18)]);
190 SHA2_64_F(G, H, A, B, C, D, E, F, W[2]);
191 SHA2_64_F(F, G, H, A, B, C, D, E, W[3]);
192 w.store_le4(&W[2], &WN[0][r + 18], &WN[1][r + 18], &WN[2][r + 18]);
193
194 w = sha512_next_w_avx512(WS) + SIMD_8x64::load_le(&K4[4 * (r + 20)]);
195 SHA2_64_F(E, F, G, H, A, B, C, D, W[4]);
196 SHA2_64_F(D, E, F, G, H, A, B, C, W[5]);
197 w.store_le4(&W[4], &WN[0][r + 20], &WN[1][r + 20], &WN[2][r + 20]);
198
199 w = sha512_next_w_avx512(WS) + SIMD_8x64::load_le(&K4[4 * (r + 22)]);
200 SHA2_64_F(C, D, E, F, G, H, A, B, W[6]);
201 SHA2_64_F(B, C, D, E, F, G, H, A, W[7]);
202 w.store_le4(&W[6], &WN[0][r + 22], &WN[1][r + 22], &WN[2][r + 22]);
203
204 w = sha512_next_w_avx512(WS) + SIMD_8x64::load_le(&K4[4 * (r + 24)]);
205 SHA2_64_F(A, B, C, D, E, F, G, H, W[8]);
206 SHA2_64_F(H, A, B, C, D, E, F, G, W[9]);
207 w.store_le4(&W[8], &WN[0][r + 24], &WN[1][r + 24], &WN[2][r + 24]);
208
209 w = sha512_next_w_avx512(WS) + SIMD_8x64::load_le(&K4[4 * (r + 26)]);
210 SHA2_64_F(G, H, A, B, C, D, E, F, W[10]);
211 SHA2_64_F(F, G, H, A, B, C, D, E, W[11]);
212 w.store_le4(&W[10], &WN[0][r + 26], &WN[1][r + 26], &WN[2][r + 26]);
213
214 w = sha512_next_w_avx512(WS) + SIMD_8x64::load_le(&K4[4 * (r + 28)]);
215 SHA2_64_F(E, F, G, H, A, B, C, D, W[12]);
216 SHA2_64_F(D, E, F, G, H, A, B, C, W[13]);
217 w.store_le4(&W[12], &WN[0][r + 28], &WN[1][r + 28], &WN[2][r + 28]);
218
219 w = sha512_next_w_avx512(WS) + SIMD_8x64::load_le(&K4[4 * (r + 30)]);
220 SHA2_64_F(C, D, E, F, G, H, A, B, W[14]);
221 SHA2_64_F(B, C, D, E, F, G, H, A, W[15]);
222 w.store_le4(&W[14], &WN[0][r + 30], &WN[1][r + 30], &WN[2][r + 30]);
223 }
224
225 // Final 16 rounds of SHA-512
 // No further expansion is needed: W already holds the values for rounds 64..79
226 SHA2_64_F(A, B, C, D, E, F, G, H, W[0]);
227 SHA2_64_F(H, A, B, C, D, E, F, G, W[1]);
228 SHA2_64_F(G, H, A, B, C, D, E, F, W[2]);
229 SHA2_64_F(F, G, H, A, B, C, D, E, W[3]);
230 SHA2_64_F(E, F, G, H, A, B, C, D, W[4]);
231 SHA2_64_F(D, E, F, G, H, A, B, C, W[5]);
232 SHA2_64_F(C, D, E, F, G, H, A, B, W[6]);
233 SHA2_64_F(B, C, D, E, F, G, H, A, W[7]);
234 SHA2_64_F(A, B, C, D, E, F, G, H, W[8]);
235 SHA2_64_F(H, A, B, C, D, E, F, G, W[9]);
236 SHA2_64_F(G, H, A, B, C, D, E, F, W[10]);
237 SHA2_64_F(F, G, H, A, B, C, D, E, W[11]);
238 SHA2_64_F(E, F, G, H, A, B, C, D, W[12]);
239 SHA2_64_F(D, E, F, G, H, A, B, C, W[13]);
240 SHA2_64_F(C, D, E, F, G, H, A, B, W[14]);
241 SHA2_64_F(B, C, D, E, F, G, H, A, W[15]);
242
 // Add this block's result into the chaining state and restart the working
 // variables from the updated state for the next block
243 A = (digest[0] += A);
244 B = (digest[1] += B);
245 C = (digest[2] += C);
246 D = (digest[3] += D);
247 E = (digest[4] += E);
248 F = (digest[5] += F);
249 G = (digest[6] += G);
250 H = (digest[7] += H);
251
252 // Block 2,3,4 of SHA-512 compression, with pre-expanded messages
 // WN[b] already contains message word + round constant for all 80 rounds,
 // so these blocks only need the scalar round function.
253 for(size_t b = 0; b != 3; ++b) { // NOLINT(*-loop-convert)
254 for(size_t r = 0; r != 80; r += 16) {
255 SHA2_64_F(A, B, C, D, E, F, G, H, WN[b][r + 0]);
256 SHA2_64_F(H, A, B, C, D, E, F, G, WN[b][r + 1]);
257 SHA2_64_F(G, H, A, B, C, D, E, F, WN[b][r + 2]);
258 SHA2_64_F(F, G, H, A, B, C, D, E, WN[b][r + 3]);
259 SHA2_64_F(E, F, G, H, A, B, C, D, WN[b][r + 4]);
260 SHA2_64_F(D, E, F, G, H, A, B, C, WN[b][r + 5]);
261 SHA2_64_F(C, D, E, F, G, H, A, B, WN[b][r + 6]);
262 SHA2_64_F(B, C, D, E, F, G, H, A, WN[b][r + 7]);
263 SHA2_64_F(A, B, C, D, E, F, G, H, WN[b][r + 8]);
264 SHA2_64_F(H, A, B, C, D, E, F, G, WN[b][r + 9]);
265 SHA2_64_F(G, H, A, B, C, D, E, F, WN[b][r + 10]);
266 SHA2_64_F(F, G, H, A, B, C, D, E, WN[b][r + 11]);
267 SHA2_64_F(E, F, G, H, A, B, C, D, WN[b][r + 12]);
268 SHA2_64_F(D, E, F, G, H, A, B, C, WN[b][r + 13]);
269 SHA2_64_F(C, D, E, F, G, H, A, B, WN[b][r + 14]);
270 SHA2_64_F(B, C, D, E, F, G, H, A, WN[b][r + 15]);
271 }
272
273 A = (digest[0] += A);
274 B = (digest[1] += B);
275 C = (digest[2] += C);
276 D = (digest[3] += D);
277 E = (digest[4] += E);
278 F = (digest[5] += F);
279 G = (digest[6] += G);
280 H = (digest[7] += H);
281 }
282 }
283
 // Tail path: the remaining blocks (fewer than four when coming from the loop
 // above) are processed one at a time with SIMD_2x64 vectors and the plain K table
284 while(blocks > 0) {
285 SIMD_2x64 WS[8];
286
 // Load the 16 message words two at a time (big-endian) and store
 // word + round constant for rounds 0..15 into W
287 for(size_t i = 0; i < 8; i++) {
288 WS[i] = SIMD_2x64::load_be(&data[16 * i]);
289 auto WK = WS[i] + SIMD_2x64::load_le(&K[2 * i]);
290 WK.store_le(&W[2 * i]);
291 }
292
293 data += 128;
294 blocks -= 1;
295
296 // First 64 rounds of SHA-512
 // Same interleaving as in the four-block path: expand two words 16 rounds
 // ahead, run two rounds, then overwrite the two W slots just consumed
297 for(size_t r = 0; r != 64; r += 16) {
298 auto w = sha512_next_w_avx512(WS) + SIMD_2x64::load_le(&K[r + 16]);
299 SHA2_64_F(A, B, C, D, E, F, G, H, W[0]);
300 SHA2_64_F(H, A, B, C, D, E, F, G, W[1]);
301 w.store_le(&W[0]);
302
303 w = sha512_next_w_avx512(WS) + SIMD_2x64::load_le(&K[r + 18]);
304 SHA2_64_F(G, H, A, B, C, D, E, F, W[2]);
305 SHA2_64_F(F, G, H, A, B, C, D, E, W[3]);
306 w.store_le(&W[2]);
307
308 w = sha512_next_w_avx512(WS) + SIMD_2x64::load_le(&K[r + 20]);
309 SHA2_64_F(E, F, G, H, A, B, C, D, W[4]);
310 SHA2_64_F(D, E, F, G, H, A, B, C, W[5]);
311 w.store_le(&W[4]);
312
313 w = sha512_next_w_avx512(WS) + SIMD_2x64::load_le(&K[r + 22]);
314 SHA2_64_F(C, D, E, F, G, H, A, B, W[6]);
315 SHA2_64_F(B, C, D, E, F, G, H, A, W[7]);
316 w.store_le(&W[6]);
317
318 w = sha512_next_w_avx512(WS) + SIMD_2x64::load_le(&K[r + 24]);
319 SHA2_64_F(A, B, C, D, E, F, G, H, W[8]);
320 SHA2_64_F(H, A, B, C, D, E, F, G, W[9]);
321 w.store_le(&W[8]);
322
323 w = sha512_next_w_avx512(WS) + SIMD_2x64::load_le(&K[r + 26]);
324 SHA2_64_F(G, H, A, B, C, D, E, F, W[10]);
325 SHA2_64_F(F, G, H, A, B, C, D, E, W[11]);
326 w.store_le(&W[10]);
327
328 w = sha512_next_w_avx512(WS) + SIMD_2x64::load_le(&K[r + 28]);
329 SHA2_64_F(E, F, G, H, A, B, C, D, W[12]);
330 SHA2_64_F(D, E, F, G, H, A, B, C, W[13]);
331 w.store_le(&W[12]);
332
333 w = sha512_next_w_avx512(WS) + SIMD_2x64::load_le(&K[r + 30]);
334 SHA2_64_F(C, D, E, F, G, H, A, B, W[14]);
335 SHA2_64_F(B, C, D, E, F, G, H, A, W[15]);
336 w.store_le(&W[14]);
337 }
338
339 // Final 16 rounds of SHA-512
340 SHA2_64_F(A, B, C, D, E, F, G, H, W[0]);
341 SHA2_64_F(H, A, B, C, D, E, F, G, W[1]);
342 SHA2_64_F(G, H, A, B, C, D, E, F, W[2]);
343 SHA2_64_F(F, G, H, A, B, C, D, E, W[3]);
344 SHA2_64_F(E, F, G, H, A, B, C, D, W[4]);
345 SHA2_64_F(D, E, F, G, H, A, B, C, W[5]);
346 SHA2_64_F(C, D, E, F, G, H, A, B, W[6]);
347 SHA2_64_F(B, C, D, E, F, G, H, A, W[7]);
348 SHA2_64_F(A, B, C, D, E, F, G, H, W[8]);
349 SHA2_64_F(H, A, B, C, D, E, F, G, W[9]);
350 SHA2_64_F(G, H, A, B, C, D, E, F, W[10]);
351 SHA2_64_F(F, G, H, A, B, C, D, E, W[11]);
352 SHA2_64_F(E, F, G, H, A, B, C, D, W[12]);
353 SHA2_64_F(D, E, F, G, H, A, B, C, W[13]);
354 SHA2_64_F(C, D, E, F, G, H, A, B, W[14]);
355 SHA2_64_F(B, C, D, E, F, G, H, A, W[15]);
356
 // Fold the block into the chaining state and reload the working variables
357 A = (digest[0] += A);
358 B = (digest[1] += B);
359 C = (digest[2] += C);
360 D = (digest[3] += D);
361 E = (digest[4] += E);
362 F = (digest[5] += F);
363 G = (digest[6] += G);
364 H = (digest[7] += H);
365 }
366}
367
368} // namespace Botan
static SIMD_2x64 load_le(const void *in)
Definition simd_2x64.h:38
static SIMD_2x64 load_be(const void *in)
Definition simd_2x64.h:42
static BOTAN_FN_ISA_SIMD_8X64 SIMD_8x64 load_be4(const void *in0, const void *in1, const void *in2, const void *in3)
Definition simd_8x64.h:46
static BOTAN_FN_ISA_SIMD_8X64 SIMD_8x64 load_le(const void *in)
Definition simd_8x64.h:53
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
BOTAN_FORCE_INLINE void SHA2_64_F(uint64_t A, uint64_t B, uint64_t C, uint64_t &D, uint64_t E, uint64_t F, uint64_t G, uint64_t &H, uint64_t &M1, uint64_t M2, uint64_t M3, uint64_t M4, uint64_t magic)
Definition sha2_64_f.h:19
BOTAN_FORCE_INLINE constexpr T rotr(T input)
Definition rotate.h:35