Botan 3.10.0
Crypto and TLS for C++
sha2_64_armv8.cpp
Go to the documentation of this file.
/*
* SHA-512 using CPU instructions in ARMv8
*
* (C) 2023 René Fischer
*
* Botan is released under the Simplified BSD License (see license.txt)
*/

#include <botan/internal/sha2_64.h>

#include <botan/internal/isa_extn.h>
#include <arm_neon.h>

namespace Botan {

/*
* SHA-512 compression function using the ARMv8.2 SHA-512 extension
* (SHA512H / SHA512H2 / SHA512SU0 / SHA512SU1, via ACLE intrinsics).
*
* The eight 64-bit state words are held in four 128-bit NEON registers:
*   STATE0 = {a,b}, STATE1 = {c,d}, STATE2 = {e,f}, STATE3 = {g,h}
* Each intrinsic step advances the hash by two rounds, so one 80-round
* block is 40 two-round groups. Rather than physically rotating the
* working variables every round, the groups cycle through a period-4
* rotation of the state registers; the code is fully unrolled on
* purpose (the target-attributed intrinsics must stay inside this
* function body, and the unrolled form keeps everything in registers).
*
* @param digest the eight-word SHA-512 chaining state, updated in place
* @param input8 the message data; must contain at least `blocks` full
*               128-byte blocks
* @param blocks number of 128-byte blocks to compress
*/
void BOTAN_FN_ISA_SHA512 SHA_512::compress_digest_armv8(digest_type& digest,
                                                        std::span<const uint8_t> input8,
                                                        size_t blocks) {
   // SHA-512 round constants K[0..79] (FIPS 180-4); loaded two at a time,
   // one pair per two-round group
   alignas(128) static const uint64_t K[] = {
      0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, 0x3956C25BF348B538,
      0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, 0xD807AA98A3030242, 0x12835B0145706FBE,
      0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235,
      0xC19BF174CF692694, 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
      0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, 0x983E5152EE66DFAB,
      0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, 0xC6E00BF33DA88FC2, 0xD5A79147930AA725,
      0x06CA6351E003826F, 0x142929670A0E6E70, 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED,
      0x53380D139D95B3DF, 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
      0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30, 0xD192E819D6EF5218,
      0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8, 0x19A4C116B8D2D0C8, 0x1E376C085141AB53,
      0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373,
      0x682E6FF3D6B2B8A3, 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
      0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, 0xCA273ECEEA26619C,
      0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, 0x06F067AA72176FBA, 0x0A637DC5A2C898A6,
      0x113F9804BEF90DAE, 0x1B710B35131C471B, 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC,
      0x431D67C49C100D4C, 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817};

   // Load initial values
   uint64x2_t STATE0 = vld1q_u64(&digest[0]);  // ab NOLINT(*-container-data-pointer)
   uint64x2_t STATE1 = vld1q_u64(&digest[2]);  // cd
   uint64x2_t STATE2 = vld1q_u64(&digest[4]);  // ef
   uint64x2_t STATE3 = vld1q_u64(&digest[6]);  // gh

   const uint64_t* input64 = reinterpret_cast<const uint64_t*>(input8.data());

   while(blocks > 0) {
      // Save current state so it can be added back after the 80 rounds
      const uint64x2_t AB_SAVE = STATE0;
      const uint64x2_t CD_SAVE = STATE1;
      const uint64x2_t EF_SAVE = STATE2;
      const uint64x2_t GH_SAVE = STATE3;

      // Load the sixteen 64-bit message words of this 128-byte block
      uint64x2_t MSG0 = vld1q_u64(input64 + 0);
      uint64x2_t MSG1 = vld1q_u64(input64 + 2);
      uint64x2_t MSG2 = vld1q_u64(input64 + 4);
      uint64x2_t MSG3 = vld1q_u64(input64 + 6);
      uint64x2_t MSG4 = vld1q_u64(input64 + 8);
      uint64x2_t MSG5 = vld1q_u64(input64 + 10);
      uint64x2_t MSG6 = vld1q_u64(input64 + 12);
      uint64x2_t MSG7 = vld1q_u64(input64 + 14);

      // SHA-512 message words are big-endian; byte-swap each 64-bit lane
      MSG0 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG0)));
      MSG1 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG1)));
      MSG2 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG2)));
      MSG3 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG3)));
      MSG4 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG4)));
      MSG5 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG5)));
      MSG6 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG6)));
      MSG7 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG7)));

      uint64x2_t MSG_K;    // message words + round constants for this group
      uint64x2_t TSTATE0;  // intermediate: (rotated MSG_K) + "gh" register
      uint64x2_t TSTATE1;  // intermediate: SHA512H partial round result

      // Each two-round group below follows the same shape, with the state
      // registers rotated one position per group (period 4); groups 0-31
      // also extend the message schedule two words at a time via
      // SHA512SU0/SHA512SU1.

      // Rounds 0-1
      MSG_K = vaddq_u64(MSG0, vld1q_u64(&K[2 * 0]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);
      MSG0 = vsha512su1q_u64(vsha512su0q_u64(MSG0, MSG1), MSG7, vextq_u64(MSG4, MSG5, 1));

      // Rounds 2-3
      MSG_K = vaddq_u64(MSG1, vld1q_u64(&K[2 * 1]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);
      MSG1 = vsha512su1q_u64(vsha512su0q_u64(MSG1, MSG2), MSG0, vextq_u64(MSG5, MSG6, 1));

      // Rounds 4-5
      MSG_K = vaddq_u64(MSG2, vld1q_u64(&K[2 * 2]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);
      MSG2 = vsha512su1q_u64(vsha512su0q_u64(MSG2, MSG3), MSG1, vextq_u64(MSG6, MSG7, 1));

      // Rounds 6-7
      MSG_K = vaddq_u64(MSG3, vld1q_u64(&K[2 * 3]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);
      MSG3 = vsha512su1q_u64(vsha512su0q_u64(MSG3, MSG4), MSG2, vextq_u64(MSG7, MSG0, 1));

      // Rounds 8-9
      MSG_K = vaddq_u64(MSG4, vld1q_u64(&K[2 * 4]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);
      MSG4 = vsha512su1q_u64(vsha512su0q_u64(MSG4, MSG5), MSG3, vextq_u64(MSG0, MSG1, 1));

      // Rounds 10-11
      MSG_K = vaddq_u64(MSG5, vld1q_u64(&K[2 * 5]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);
      MSG5 = vsha512su1q_u64(vsha512su0q_u64(MSG5, MSG6), MSG4, vextq_u64(MSG1, MSG2, 1));

      // Rounds 12-13
      MSG_K = vaddq_u64(MSG6, vld1q_u64(&K[2 * 6]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);
      MSG6 = vsha512su1q_u64(vsha512su0q_u64(MSG6, MSG7), MSG5, vextq_u64(MSG2, MSG3, 1));

      // Rounds 14-15
      MSG_K = vaddq_u64(MSG7, vld1q_u64(&K[2 * 7]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);
      MSG7 = vsha512su1q_u64(vsha512su0q_u64(MSG7, MSG0), MSG6, vextq_u64(MSG3, MSG4, 1));

      // Rounds 16-17
      MSG_K = vaddq_u64(MSG0, vld1q_u64(&K[2 * 8]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);
      MSG0 = vsha512su1q_u64(vsha512su0q_u64(MSG0, MSG1), MSG7, vextq_u64(MSG4, MSG5, 1));

      // Rounds 18-19
      MSG_K = vaddq_u64(MSG1, vld1q_u64(&K[2 * 9]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);
      MSG1 = vsha512su1q_u64(vsha512su0q_u64(MSG1, MSG2), MSG0, vextq_u64(MSG5, MSG6, 1));

      // Rounds 20-21
      MSG_K = vaddq_u64(MSG2, vld1q_u64(&K[2 * 10]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);
      MSG2 = vsha512su1q_u64(vsha512su0q_u64(MSG2, MSG3), MSG1, vextq_u64(MSG6, MSG7, 1));

      // Rounds 22-23
      MSG_K = vaddq_u64(MSG3, vld1q_u64(&K[2 * 11]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);
      MSG3 = vsha512su1q_u64(vsha512su0q_u64(MSG3, MSG4), MSG2, vextq_u64(MSG7, MSG0, 1));

      // Rounds 24-25
      MSG_K = vaddq_u64(MSG4, vld1q_u64(&K[2 * 12]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);
      MSG4 = vsha512su1q_u64(vsha512su0q_u64(MSG4, MSG5), MSG3, vextq_u64(MSG0, MSG1, 1));

      // Rounds 26-27
      MSG_K = vaddq_u64(MSG5, vld1q_u64(&K[2 * 13]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);
      MSG5 = vsha512su1q_u64(vsha512su0q_u64(MSG5, MSG6), MSG4, vextq_u64(MSG1, MSG2, 1));

      // Rounds 28-29
      MSG_K = vaddq_u64(MSG6, vld1q_u64(&K[2 * 14]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);
      MSG6 = vsha512su1q_u64(vsha512su0q_u64(MSG6, MSG7), MSG5, vextq_u64(MSG2, MSG3, 1));

      // Rounds 30-31
      MSG_K = vaddq_u64(MSG7, vld1q_u64(&K[2 * 15]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);
      MSG7 = vsha512su1q_u64(vsha512su0q_u64(MSG7, MSG0), MSG6, vextq_u64(MSG3, MSG4, 1));

      // Rounds 32-33
      MSG_K = vaddq_u64(MSG0, vld1q_u64(&K[2 * 16]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);
      MSG0 = vsha512su1q_u64(vsha512su0q_u64(MSG0, MSG1), MSG7, vextq_u64(MSG4, MSG5, 1));

      // Rounds 34-35
      MSG_K = vaddq_u64(MSG1, vld1q_u64(&K[2 * 17]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);
      MSG1 = vsha512su1q_u64(vsha512su0q_u64(MSG1, MSG2), MSG0, vextq_u64(MSG5, MSG6, 1));

      // Rounds 36-37
      MSG_K = vaddq_u64(MSG2, vld1q_u64(&K[2 * 18]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);
      MSG2 = vsha512su1q_u64(vsha512su0q_u64(MSG2, MSG3), MSG1, vextq_u64(MSG6, MSG7, 1));

      // Rounds 38-39
      MSG_K = vaddq_u64(MSG3, vld1q_u64(&K[2 * 19]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);
      MSG3 = vsha512su1q_u64(vsha512su0q_u64(MSG3, MSG4), MSG2, vextq_u64(MSG7, MSG0, 1));

      // Rounds 40-41
      MSG_K = vaddq_u64(MSG4, vld1q_u64(&K[2 * 20]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);
      MSG4 = vsha512su1q_u64(vsha512su0q_u64(MSG4, MSG5), MSG3, vextq_u64(MSG0, MSG1, 1));

      // Rounds 42-43
      MSG_K = vaddq_u64(MSG5, vld1q_u64(&K[2 * 21]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);
      MSG5 = vsha512su1q_u64(vsha512su0q_u64(MSG5, MSG6), MSG4, vextq_u64(MSG1, MSG2, 1));

      // Rounds 44-45
      MSG_K = vaddq_u64(MSG6, vld1q_u64(&K[2 * 22]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);
      MSG6 = vsha512su1q_u64(vsha512su0q_u64(MSG6, MSG7), MSG5, vextq_u64(MSG2, MSG3, 1));

      // Rounds 46-47
      MSG_K = vaddq_u64(MSG7, vld1q_u64(&K[2 * 23]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);
      MSG7 = vsha512su1q_u64(vsha512su0q_u64(MSG7, MSG0), MSG6, vextq_u64(MSG3, MSG4, 1));

      // Rounds 48-49
      MSG_K = vaddq_u64(MSG0, vld1q_u64(&K[2 * 24]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);
      MSG0 = vsha512su1q_u64(vsha512su0q_u64(MSG0, MSG1), MSG7, vextq_u64(MSG4, MSG5, 1));

      // Rounds 50-51
      MSG_K = vaddq_u64(MSG1, vld1q_u64(&K[2 * 25]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);
      MSG1 = vsha512su1q_u64(vsha512su0q_u64(MSG1, MSG2), MSG0, vextq_u64(MSG5, MSG6, 1));

      // Rounds 52-53
      MSG_K = vaddq_u64(MSG2, vld1q_u64(&K[2 * 26]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);
      MSG2 = vsha512su1q_u64(vsha512su0q_u64(MSG2, MSG3), MSG1, vextq_u64(MSG6, MSG7, 1));

      // Rounds 54-55
      MSG_K = vaddq_u64(MSG3, vld1q_u64(&K[2 * 27]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);
      MSG3 = vsha512su1q_u64(vsha512su0q_u64(MSG3, MSG4), MSG2, vextq_u64(MSG7, MSG0, 1));

      // Rounds 56-57
      MSG_K = vaddq_u64(MSG4, vld1q_u64(&K[2 * 28]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);
      MSG4 = vsha512su1q_u64(vsha512su0q_u64(MSG4, MSG5), MSG3, vextq_u64(MSG0, MSG1, 1));

      // Rounds 58-59
      MSG_K = vaddq_u64(MSG5, vld1q_u64(&K[2 * 29]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);
      MSG5 = vsha512su1q_u64(vsha512su0q_u64(MSG5, MSG6), MSG4, vextq_u64(MSG1, MSG2, 1));

      // Rounds 60-61
      MSG_K = vaddq_u64(MSG6, vld1q_u64(&K[2 * 30]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);
      MSG6 = vsha512su1q_u64(vsha512su0q_u64(MSG6, MSG7), MSG5, vextq_u64(MSG2, MSG3, 1));

      // Rounds 62-63
      MSG_K = vaddq_u64(MSG7, vld1q_u64(&K[2 * 31]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);
      MSG7 = vsha512su1q_u64(vsha512su0q_u64(MSG7, MSG0), MSG6, vextq_u64(MSG3, MSG4, 1));

      // Rounds 64-79 consume the final schedule words; no further
      // message expansion is needed from here on.

      // Rounds 64-65
      MSG_K = vaddq_u64(MSG0, vld1q_u64(&K[2 * 32]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);

      // Rounds 66-67
      MSG_K = vaddq_u64(MSG1, vld1q_u64(&K[2 * 33]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);

      // Rounds 68-69
      MSG_K = vaddq_u64(MSG2, vld1q_u64(&K[2 * 34]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);

      // Rounds 70-71
      MSG_K = vaddq_u64(MSG3, vld1q_u64(&K[2 * 35]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);

      // Rounds 72-73
      MSG_K = vaddq_u64(MSG4, vld1q_u64(&K[2 * 36]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);

      // Rounds 74-75
      MSG_K = vaddq_u64(MSG5, vld1q_u64(&K[2 * 37]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);

      // Rounds 76-77
      MSG_K = vaddq_u64(MSG6, vld1q_u64(&K[2 * 38]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);

      // Rounds 78-79
      MSG_K = vaddq_u64(MSG7, vld1q_u64(&K[2 * 39]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);

      // Add back to state (Davies-Meyer feed-forward)
      STATE0 = vaddq_u64(STATE0, AB_SAVE);
      STATE1 = vaddq_u64(STATE1, CD_SAVE);
      STATE2 = vaddq_u64(STATE2, EF_SAVE);
      STATE3 = vaddq_u64(STATE3, GH_SAVE);

      input64 += 16;  // advance one 128-byte block (16 uint64 words)
      blocks--;
   }

   // Save state
   vst1q_u64(&digest[0], STATE0);  // NOLINT(*-container-data-pointer)
   vst1q_u64(&digest[2], STATE1);
   vst1q_u64(&digest[4], STATE2);
   vst1q_u64(&digest[6], STATE3);
}

}  // namespace Botan