Botan 3.9.0
Crypto and TLS for C++
sha2_64_avx512.cpp
Go to the documentation of this file.
1/*
2* (C) 2025 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/sha2_64.h>
8
9#include <botan/internal/bit_ops.h>
10#include <botan/internal/isa_extn.h>
11#include <botan/internal/rotate.h>
12#include <botan/internal/sha2_64_f.h>
13#include <botan/internal/simd_2x64.h>
14#include <botan/internal/simd_8x64.h>
15
16namespace Botan {
17
18namespace {
19
20template <typename SIMD_T>
21BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_BMI2 SIMD_T sha512_next_w_avx512(SIMD_T x[8]) {
22 auto t0 = SIMD_T::alignr8(x[1], x[0]);
23 auto t1 = SIMD_T::alignr8(x[5], x[4]);
24
25 auto s0 = t0.template rotr<1>() ^ t0.template rotr<8>() ^ t0.template shr<7>();
26 auto s1 = x[7].template rotr<19>() ^ x[7].template rotr<61>() ^ x[7].template shr<6>();
27
28 auto nx = x[0] + s0 + s1 + t1;
29
30 x[0] = x[1];
31 x[1] = x[2];
32 x[2] = x[3];
33 x[3] = x[4];
34 x[4] = x[5];
35 x[5] = x[6];
36 x[6] = x[7];
37 x[7] = nx;
38
39 return x[7];
40}
41
42} // namespace
43
44BOTAN_FN_ISA_AVX512_BMI2 void SHA_512::compress_digest_x86_avx512(digest_type& digest,
45 std::span<const uint8_t> input,
46 size_t blocks) {
47 // clang-format off
48 alignas(64) const uint64_t K[80] = {
49 0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
50 0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
51 0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
52 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
53 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
54 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
55 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
56 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
57 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
58 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
59 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
60 0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
61 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
62 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
63 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
64 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
65 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
66 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
67 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
68 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817,
69 };
70
71 // K4 is each pair of elements in K repeated 4 times, since we are performing
72 // 4 parallel message expansions
73 alignas(64) const uint64_t K4[4 * 80] = {
74 0x428A2F98D728AE22, 0x7137449123EF65CD, 0x428A2F98D728AE22, 0x7137449123EF65CD,
75 0x428A2F98D728AE22, 0x7137449123EF65CD, 0x428A2F98D728AE22, 0x7137449123EF65CD,
76 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
77 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
78 0x3956C25BF348B538, 0x59F111F1B605D019, 0x3956C25BF348B538, 0x59F111F1B605D019,
79 0x3956C25BF348B538, 0x59F111F1B605D019, 0x3956C25BF348B538, 0x59F111F1B605D019,
80 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
81 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
82 0xD807AA98A3030242, 0x12835B0145706FBE, 0xD807AA98A3030242, 0x12835B0145706FBE,
83 0xD807AA98A3030242, 0x12835B0145706FBE, 0xD807AA98A3030242, 0x12835B0145706FBE,
84 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
85 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
86 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1,
87 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1,
88 0x9BDC06A725C71235, 0xC19BF174CF692694, 0x9BDC06A725C71235, 0xC19BF174CF692694,
89 0x9BDC06A725C71235, 0xC19BF174CF692694, 0x9BDC06A725C71235, 0xC19BF174CF692694,
90 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3,
91 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3,
92 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
93 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
94 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483,
95 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483,
96 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
97 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
98 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0x983E5152EE66DFAB, 0xA831C66D2DB43210,
99 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0x983E5152EE66DFAB, 0xA831C66D2DB43210,
100 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
101 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
102 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0xC6E00BF33DA88FC2, 0xD5A79147930AA725,
103 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0xC6E00BF33DA88FC2, 0xD5A79147930AA725,
104 0x06CA6351E003826F, 0x142929670A0E6E70, 0x06CA6351E003826F, 0x142929670A0E6E70,
105 0x06CA6351E003826F, 0x142929670A0E6E70, 0x06CA6351E003826F, 0x142929670A0E6E70,
106 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x27B70A8546D22FFC, 0x2E1B21385C26C926,
107 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x27B70A8546D22FFC, 0x2E1B21385C26C926,
108 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
109 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
110 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8,
111 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8,
112 0x81C2C92E47EDAEE6, 0x92722C851482353B, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
113 0x81C2C92E47EDAEE6, 0x92722C851482353B, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
114 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xA2BFE8A14CF10364, 0xA81A664BBC423001,
115 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xA2BFE8A14CF10364, 0xA81A664BBC423001,
116 0xC24B8B70D0F89791, 0xC76C51A30654BE30, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
117 0xC24B8B70D0F89791, 0xC76C51A30654BE30, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
118 0xD192E819D6EF5218, 0xD69906245565A910, 0xD192E819D6EF5218, 0xD69906245565A910,
119 0xD192E819D6EF5218, 0xD69906245565A910, 0xD192E819D6EF5218, 0xD69906245565A910,
120 0xF40E35855771202A, 0x106AA07032BBD1B8, 0xF40E35855771202A, 0x106AA07032BBD1B8,
121 0xF40E35855771202A, 0x106AA07032BBD1B8, 0xF40E35855771202A, 0x106AA07032BBD1B8,
122 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x19A4C116B8D2D0C8, 0x1E376C085141AB53,
123 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x19A4C116B8D2D0C8, 0x1E376C085141AB53,
124 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
125 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
126 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB,
127 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB,
128 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
129 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
130 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x748F82EE5DEFB2FC, 0x78A5636F43172F60,
131 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x748F82EE5DEFB2FC, 0x78A5636F43172F60,
132 0x84C87814A1F0AB72, 0x8CC702081A6439EC, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
133 0x84C87814A1F0AB72, 0x8CC702081A6439EC, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
134 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9,
135 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9,
136 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
137 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
138 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xCA273ECEEA26619C, 0xD186B8C721C0C207,
139 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xCA273ECEEA26619C, 0xD186B8C721C0C207,
140 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
141 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
142 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x06F067AA72176FBA, 0x0A637DC5A2C898A6,
143 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x06F067AA72176FBA, 0x0A637DC5A2C898A6,
144 0x113F9804BEF90DAE, 0x1B710B35131C471B, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
145 0x113F9804BEF90DAE, 0x1B710B35131C471B, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
146 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x28DB77F523047D84, 0x32CAAB7B40C72493,
147 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x28DB77F523047D84, 0x32CAAB7B40C72493,
148 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
149 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
150 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A,
151 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A,
152 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817,
153 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817,
154 };
155 // clang-format on
156
157 alignas(64) uint64_t W[16] = {0};
158 alignas(64) uint64_t WN[3][80];
159
160 uint64_t A = digest[0];
161 uint64_t B = digest[1];
162 uint64_t C = digest[2];
163 uint64_t D = digest[3];
164 uint64_t E = digest[4];
165 uint64_t F = digest[5];
166 uint64_t G = digest[6];
167 uint64_t H = digest[7];
168
169 const uint8_t* data = input.data();
170
171 while(blocks >= 4) {
172 SIMD_8x64 WS[8];
173
174 for(size_t i = 0; i < 8; i++) {
175 WS[i] = SIMD_8x64::load_be4(
176 &data[16 * i], &data[1 * 128 + 16 * i], &data[2 * 128 + 16 * i], &data[3 * 128 + 16 * i]);
177 auto WK = WS[i] + SIMD_8x64::load_le(&K4[8 * i]);
178 WK.store_le4(&W[2 * i], &WN[0][2 * i], &WN[1][2 * i], &WN[2][2 * i]);
179 }
180
181 data += 4 * 128;
182 blocks -= 4;
183
184 // First 64 rounds of SHA-512
185 for(size_t r = 0; r != 64; r += 16) {
186 auto w = sha512_next_w_avx512(WS) + SIMD_8x64::load_le(&K4[4 * (r + 16)]);
187 SHA2_64_F(A, B, C, D, E, F, G, H, W[0]);
188 SHA2_64_F(H, A, B, C, D, E, F, G, W[1]);
189 w.store_le4(&W[0], &WN[0][r + 16], &WN[1][r + 16], &WN[2][r + 16]);
190
191 w = sha512_next_w_avx512(WS) + SIMD_8x64::load_le(&K4[4 * (r + 18)]);
192 SHA2_64_F(G, H, A, B, C, D, E, F, W[2]);
193 SHA2_64_F(F, G, H, A, B, C, D, E, W[3]);
194 w.store_le4(&W[2], &WN[0][r + 18], &WN[1][r + 18], &WN[2][r + 18]);
195
196 w = sha512_next_w_avx512(WS) + SIMD_8x64::load_le(&K4[4 * (r + 20)]);
197 SHA2_64_F(E, F, G, H, A, B, C, D, W[4]);
198 SHA2_64_F(D, E, F, G, H, A, B, C, W[5]);
199 w.store_le4(&W[4], &WN[0][r + 20], &WN[1][r + 20], &WN[2][r + 20]);
200
201 w = sha512_next_w_avx512(WS) + SIMD_8x64::load_le(&K4[4 * (r + 22)]);
202 SHA2_64_F(C, D, E, F, G, H, A, B, W[6]);
203 SHA2_64_F(B, C, D, E, F, G, H, A, W[7]);
204 w.store_le4(&W[6], &WN[0][r + 22], &WN[1][r + 22], &WN[2][r + 22]);
205
206 w = sha512_next_w_avx512(WS) + SIMD_8x64::load_le(&K4[4 * (r + 24)]);
207 SHA2_64_F(A, B, C, D, E, F, G, H, W[8]);
208 SHA2_64_F(H, A, B, C, D, E, F, G, W[9]);
209 w.store_le4(&W[8], &WN[0][r + 24], &WN[1][r + 24], &WN[2][r + 24]);
210
211 w = sha512_next_w_avx512(WS) + SIMD_8x64::load_le(&K4[4 * (r + 26)]);
212 SHA2_64_F(G, H, A, B, C, D, E, F, W[10]);
213 SHA2_64_F(F, G, H, A, B, C, D, E, W[11]);
214 w.store_le4(&W[10], &WN[0][r + 26], &WN[1][r + 26], &WN[2][r + 26]);
215
216 w = sha512_next_w_avx512(WS) + SIMD_8x64::load_le(&K4[4 * (r + 28)]);
217 SHA2_64_F(E, F, G, H, A, B, C, D, W[12]);
218 SHA2_64_F(D, E, F, G, H, A, B, C, W[13]);
219 w.store_le4(&W[12], &WN[0][r + 28], &WN[1][r + 28], &WN[2][r + 28]);
220
221 w = sha512_next_w_avx512(WS) + SIMD_8x64::load_le(&K4[4 * (r + 30)]);
222 SHA2_64_F(C, D, E, F, G, H, A, B, W[14]);
223 SHA2_64_F(B, C, D, E, F, G, H, A, W[15]);
224 w.store_le4(&W[14], &WN[0][r + 30], &WN[1][r + 30], &WN[2][r + 30]);
225 }
226
227 // Final 16 rounds of SHA-512
228 SHA2_64_F(A, B, C, D, E, F, G, H, W[0]);
229 SHA2_64_F(H, A, B, C, D, E, F, G, W[1]);
230 SHA2_64_F(G, H, A, B, C, D, E, F, W[2]);
231 SHA2_64_F(F, G, H, A, B, C, D, E, W[3]);
232 SHA2_64_F(E, F, G, H, A, B, C, D, W[4]);
233 SHA2_64_F(D, E, F, G, H, A, B, C, W[5]);
234 SHA2_64_F(C, D, E, F, G, H, A, B, W[6]);
235 SHA2_64_F(B, C, D, E, F, G, H, A, W[7]);
236 SHA2_64_F(A, B, C, D, E, F, G, H, W[8]);
237 SHA2_64_F(H, A, B, C, D, E, F, G, W[9]);
238 SHA2_64_F(G, H, A, B, C, D, E, F, W[10]);
239 SHA2_64_F(F, G, H, A, B, C, D, E, W[11]);
240 SHA2_64_F(E, F, G, H, A, B, C, D, W[12]);
241 SHA2_64_F(D, E, F, G, H, A, B, C, W[13]);
242 SHA2_64_F(C, D, E, F, G, H, A, B, W[14]);
243 SHA2_64_F(B, C, D, E, F, G, H, A, W[15]);
244
245 A = (digest[0] += A);
246 B = (digest[1] += B);
247 C = (digest[2] += C);
248 D = (digest[3] += D);
249 E = (digest[4] += E);
250 F = (digest[5] += F);
251 G = (digest[6] += G);
252 H = (digest[7] += H);
253
254 // Block 2,3,4 of SHA-512 compression, with pre-expanded messages
255 for(size_t b = 0; b != 3; ++b) { // NOLINT(*-loop-convert)
256 for(size_t r = 0; r != 80; r += 16) {
257 SHA2_64_F(A, B, C, D, E, F, G, H, WN[b][r + 0]);
258 SHA2_64_F(H, A, B, C, D, E, F, G, WN[b][r + 1]);
259 SHA2_64_F(G, H, A, B, C, D, E, F, WN[b][r + 2]);
260 SHA2_64_F(F, G, H, A, B, C, D, E, WN[b][r + 3]);
261 SHA2_64_F(E, F, G, H, A, B, C, D, WN[b][r + 4]);
262 SHA2_64_F(D, E, F, G, H, A, B, C, WN[b][r + 5]);
263 SHA2_64_F(C, D, E, F, G, H, A, B, WN[b][r + 6]);
264 SHA2_64_F(B, C, D, E, F, G, H, A, WN[b][r + 7]);
265 SHA2_64_F(A, B, C, D, E, F, G, H, WN[b][r + 8]);
266 SHA2_64_F(H, A, B, C, D, E, F, G, WN[b][r + 9]);
267 SHA2_64_F(G, H, A, B, C, D, E, F, WN[b][r + 10]);
268 SHA2_64_F(F, G, H, A, B, C, D, E, WN[b][r + 11]);
269 SHA2_64_F(E, F, G, H, A, B, C, D, WN[b][r + 12]);
270 SHA2_64_F(D, E, F, G, H, A, B, C, WN[b][r + 13]);
271 SHA2_64_F(C, D, E, F, G, H, A, B, WN[b][r + 14]);
272 SHA2_64_F(B, C, D, E, F, G, H, A, WN[b][r + 15]);
273 }
274
275 A = (digest[0] += A);
276 B = (digest[1] += B);
277 C = (digest[2] += C);
278 D = (digest[3] += D);
279 E = (digest[4] += E);
280 F = (digest[5] += F);
281 G = (digest[6] += G);
282 H = (digest[7] += H);
283 }
284 }
285
286 while(blocks > 0) {
287 SIMD_2x64 WS[8];
288
289 for(size_t i = 0; i < 8; i++) {
290 WS[i] = SIMD_2x64::load_be(&data[16 * i]);
291 auto WK = WS[i] + SIMD_2x64::load_le(&K[2 * i]);
292 WK.store_le(&W[2 * i]);
293 }
294
295 data += 128;
296 blocks -= 1;
297
298 // First 64 rounds of SHA-512
299 for(size_t r = 0; r != 64; r += 16) {
300 auto w = sha512_next_w_avx512(WS) + SIMD_2x64::load_le(&K[r + 16]);
301 SHA2_64_F(A, B, C, D, E, F, G, H, W[0]);
302 SHA2_64_F(H, A, B, C, D, E, F, G, W[1]);
303 w.store_le(&W[0]);
304
305 w = sha512_next_w_avx512(WS) + SIMD_2x64::load_le(&K[r + 18]);
306 SHA2_64_F(G, H, A, B, C, D, E, F, W[2]);
307 SHA2_64_F(F, G, H, A, B, C, D, E, W[3]);
308 w.store_le(&W[2]);
309
310 w = sha512_next_w_avx512(WS) + SIMD_2x64::load_le(&K[r + 20]);
311 SHA2_64_F(E, F, G, H, A, B, C, D, W[4]);
312 SHA2_64_F(D, E, F, G, H, A, B, C, W[5]);
313 w.store_le(&W[4]);
314
315 w = sha512_next_w_avx512(WS) + SIMD_2x64::load_le(&K[r + 22]);
316 SHA2_64_F(C, D, E, F, G, H, A, B, W[6]);
317 SHA2_64_F(B, C, D, E, F, G, H, A, W[7]);
318 w.store_le(&W[6]);
319
320 w = sha512_next_w_avx512(WS) + SIMD_2x64::load_le(&K[r + 24]);
321 SHA2_64_F(A, B, C, D, E, F, G, H, W[8]);
322 SHA2_64_F(H, A, B, C, D, E, F, G, W[9]);
323 w.store_le(&W[8]);
324
325 w = sha512_next_w_avx512(WS) + SIMD_2x64::load_le(&K[r + 26]);
326 SHA2_64_F(G, H, A, B, C, D, E, F, W[10]);
327 SHA2_64_F(F, G, H, A, B, C, D, E, W[11]);
328 w.store_le(&W[10]);
329
330 w = sha512_next_w_avx512(WS) + SIMD_2x64::load_le(&K[r + 28]);
331 SHA2_64_F(E, F, G, H, A, B, C, D, W[12]);
332 SHA2_64_F(D, E, F, G, H, A, B, C, W[13]);
333 w.store_le(&W[12]);
334
335 w = sha512_next_w_avx512(WS) + SIMD_2x64::load_le(&K[r + 30]);
336 SHA2_64_F(C, D, E, F, G, H, A, B, W[14]);
337 SHA2_64_F(B, C, D, E, F, G, H, A, W[15]);
338 w.store_le(&W[14]);
339 }
340
341 // Final 16 rounds of SHA-512
342 SHA2_64_F(A, B, C, D, E, F, G, H, W[0]);
343 SHA2_64_F(H, A, B, C, D, E, F, G, W[1]);
344 SHA2_64_F(G, H, A, B, C, D, E, F, W[2]);
345 SHA2_64_F(F, G, H, A, B, C, D, E, W[3]);
346 SHA2_64_F(E, F, G, H, A, B, C, D, W[4]);
347 SHA2_64_F(D, E, F, G, H, A, B, C, W[5]);
348 SHA2_64_F(C, D, E, F, G, H, A, B, W[6]);
349 SHA2_64_F(B, C, D, E, F, G, H, A, W[7]);
350 SHA2_64_F(A, B, C, D, E, F, G, H, W[8]);
351 SHA2_64_F(H, A, B, C, D, E, F, G, W[9]);
352 SHA2_64_F(G, H, A, B, C, D, E, F, W[10]);
353 SHA2_64_F(F, G, H, A, B, C, D, E, W[11]);
354 SHA2_64_F(E, F, G, H, A, B, C, D, W[12]);
355 SHA2_64_F(D, E, F, G, H, A, B, C, W[13]);
356 SHA2_64_F(C, D, E, F, G, H, A, B, W[14]);
357 SHA2_64_F(B, C, D, E, F, G, H, A, W[15]);
358
359 A = (digest[0] += A);
360 B = (digest[1] += B);
361 C = (digest[2] += C);
362 D = (digest[3] += D);
363 E = (digest[4] += E);
364 F = (digest[5] += F);
365 G = (digest[6] += G);
366 H = (digest[7] += H);
367 }
368}
369
370} // namespace Botan
static SIMD_2x64 load_le(const void *in)
Definition simd_2x64.h:38
static SIMD_2x64 load_be(const void *in)
Definition simd_2x64.h:42
static BOTAN_FN_ISA_SIMD_8X64 SIMD_8x64 load_be4(const void *in0, const void *in1, const void *in2, const void *in3)
Definition simd_8x64.h:44
static BOTAN_FN_ISA_SIMD_8X64 SIMD_8x64 load_le(const void *in)
Definition simd_8x64.h:51
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
BOTAN_FORCE_INLINE void SHA2_64_F(uint64_t A, uint64_t B, uint64_t C, uint64_t &D, uint64_t E, uint64_t F, uint64_t G, uint64_t &H, uint64_t &M1, uint64_t M2, uint64_t M3, uint64_t M4, uint64_t magic)
Definition sha2_64_f.h:19
BOTAN_FORCE_INLINE constexpr T rotr(T input)
Definition rotate.h:35