Botan 3.9.0
Crypto and TLS for C++
sha2_64_armv8.cpp
Go to the documentation of this file.
1/*
2* SHA-512 using CPU instructions in ARMv8
3*
4* (C) 2023 René Fischer
5*
6* Botan is released under the Simplified BSD License (see license.txt)
7*/
8
9#include <botan/internal/sha2_64.h>
10
11#include <botan/internal/isa_extn.h>
12#include <arm_neon.h>
13
14namespace Botan {
15
16/*
17* SHA-512 using CPU instructions in ARMv8
18*/
19void BOTAN_FN_ISA_SHA512 SHA_512::compress_digest_armv8(digest_type& digest,
20 std::span<const uint8_t> input8,
21 size_t blocks) {
22 alignas(128) static const uint64_t K[] = {
23 0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, 0x3956C25BF348B538,
24 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, 0xD807AA98A3030242, 0x12835B0145706FBE,
25 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235,
26 0xC19BF174CF692694, 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
27 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, 0x983E5152EE66DFAB,
28 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, 0xC6E00BF33DA88FC2, 0xD5A79147930AA725,
29 0x06CA6351E003826F, 0x142929670A0E6E70, 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED,
30 0x53380D139D95B3DF, 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
31 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30, 0xD192E819D6EF5218,
32 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8, 0x19A4C116B8D2D0C8, 0x1E376C085141AB53,
33 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373,
34 0x682E6FF3D6B2B8A3, 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
35 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, 0xCA273ECEEA26619C,
36 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, 0x06F067AA72176FBA, 0x0A637DC5A2C898A6,
37 0x113F9804BEF90DAE, 0x1B710B35131C471B, 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC,
38 0x431D67C49C100D4C, 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817};
39
40 // Load initial values
41 uint64x2_t STATE0 = vld1q_u64(&digest[0]); // ab
42 uint64x2_t STATE1 = vld1q_u64(&digest[2]); // cd
43 uint64x2_t STATE2 = vld1q_u64(&digest[4]); // ef
44 uint64x2_t STATE3 = vld1q_u64(&digest[6]); // gh
45
46 const uint64_t* input64 = reinterpret_cast<const uint64_t*>(input8.data());
47
48 while(blocks > 0) {
49 // Save current state
50 const uint64x2_t AB_SAVE = STATE0;
51 const uint64x2_t CD_SAVE = STATE1;
52 const uint64x2_t EF_SAVE = STATE2;
53 const uint64x2_t GH_SAVE = STATE3;
54
55 uint64x2_t MSG0 = vld1q_u64(input64 + 0);
56 uint64x2_t MSG1 = vld1q_u64(input64 + 2);
57 uint64x2_t MSG2 = vld1q_u64(input64 + 4);
58 uint64x2_t MSG3 = vld1q_u64(input64 + 6);
59 uint64x2_t MSG4 = vld1q_u64(input64 + 8);
60 uint64x2_t MSG5 = vld1q_u64(input64 + 10);
61 uint64x2_t MSG6 = vld1q_u64(input64 + 12);
62 uint64x2_t MSG7 = vld1q_u64(input64 + 14);
63
64 MSG0 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG0)));
65 MSG1 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG1)));
66 MSG2 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG2)));
67 MSG3 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG3)));
68 MSG4 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG4)));
69 MSG5 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG5)));
70 MSG6 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG6)));
71 MSG7 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG7)));
72
73 uint64x2_t MSG_K, TSTATE0, TSTATE1;
74
75 // Rounds 0-1
76 MSG_K = vaddq_u64(MSG0, vld1q_u64(&K[2 * 0]));
77 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
78 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
79 STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
80 STATE1 = vaddq_u64(STATE1, TSTATE1);
81 MSG0 = vsha512su1q_u64(vsha512su0q_u64(MSG0, MSG1), MSG7, vextq_u64(MSG4, MSG5, 1));
82
83 // Rounds 2-3
84 MSG_K = vaddq_u64(MSG1, vld1q_u64(&K[2 * 1]));
85 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
86 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
87 STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
88 STATE0 = vaddq_u64(STATE0, TSTATE1);
89 MSG1 = vsha512su1q_u64(vsha512su0q_u64(MSG1, MSG2), MSG0, vextq_u64(MSG5, MSG6, 1));
90
91 // Rounds 4-5
92 MSG_K = vaddq_u64(MSG2, vld1q_u64(&K[2 * 2]));
93 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
94 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
95 STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
96 STATE3 = vaddq_u64(STATE3, TSTATE1);
97 MSG2 = vsha512su1q_u64(vsha512su0q_u64(MSG2, MSG3), MSG1, vextq_u64(MSG6, MSG7, 1));
98
99 // Rounds 6-7
100 MSG_K = vaddq_u64(MSG3, vld1q_u64(&K[2 * 3]));
101 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
102 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
103 STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
104 STATE2 = vaddq_u64(STATE2, TSTATE1);
105 MSG3 = vsha512su1q_u64(vsha512su0q_u64(MSG3, MSG4), MSG2, vextq_u64(MSG7, MSG0, 1));
106
107 // Rounds 8-9
108 MSG_K = vaddq_u64(MSG4, vld1q_u64(&K[2 * 4]));
109 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
110 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
111 STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
112 STATE1 = vaddq_u64(STATE1, TSTATE1);
113 MSG4 = vsha512su1q_u64(vsha512su0q_u64(MSG4, MSG5), MSG3, vextq_u64(MSG0, MSG1, 1));
114
115 // Rounds 10-11
116 MSG_K = vaddq_u64(MSG5, vld1q_u64(&K[2 * 5]));
117 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
118 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
119 STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
120 STATE0 = vaddq_u64(STATE0, TSTATE1);
121 MSG5 = vsha512su1q_u64(vsha512su0q_u64(MSG5, MSG6), MSG4, vextq_u64(MSG1, MSG2, 1));
122
123 // Rounds 12-13
124 MSG_K = vaddq_u64(MSG6, vld1q_u64(&K[2 * 6]));
125 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
126 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
127 STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
128 STATE3 = vaddq_u64(STATE3, TSTATE1);
129 MSG6 = vsha512su1q_u64(vsha512su0q_u64(MSG6, MSG7), MSG5, vextq_u64(MSG2, MSG3, 1));
130
131 // Rounds 14-15
132 MSG_K = vaddq_u64(MSG7, vld1q_u64(&K[2 * 7]));
133 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
134 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
135 STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
136 STATE2 = vaddq_u64(STATE2, TSTATE1);
137 MSG7 = vsha512su1q_u64(vsha512su0q_u64(MSG7, MSG0), MSG6, vextq_u64(MSG3, MSG4, 1));
138
139 // Rounds 16-17
140 MSG_K = vaddq_u64(MSG0, vld1q_u64(&K[2 * 8]));
141 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
142 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
143 STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
144 STATE1 = vaddq_u64(STATE1, TSTATE1);
145 MSG0 = vsha512su1q_u64(vsha512su0q_u64(MSG0, MSG1), MSG7, vextq_u64(MSG4, MSG5, 1));
146
147 // Rounds 18-19
148 MSG_K = vaddq_u64(MSG1, vld1q_u64(&K[2 * 9]));
149 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
150 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
151 STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
152 STATE0 = vaddq_u64(STATE0, TSTATE1);
153 MSG1 = vsha512su1q_u64(vsha512su0q_u64(MSG1, MSG2), MSG0, vextq_u64(MSG5, MSG6, 1));
154
155 // Rounds 20-21
156 MSG_K = vaddq_u64(MSG2, vld1q_u64(&K[2 * 10]));
157 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
158 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
159 STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
160 STATE3 = vaddq_u64(STATE3, TSTATE1);
161 MSG2 = vsha512su1q_u64(vsha512su0q_u64(MSG2, MSG3), MSG1, vextq_u64(MSG6, MSG7, 1));
162
163 // Rounds 22-23
164 MSG_K = vaddq_u64(MSG3, vld1q_u64(&K[2 * 11]));
165 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
166 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
167 STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
168 STATE2 = vaddq_u64(STATE2, TSTATE1);
169 MSG3 = vsha512su1q_u64(vsha512su0q_u64(MSG3, MSG4), MSG2, vextq_u64(MSG7, MSG0, 1));
170
171 // Rounds 24-25
172 MSG_K = vaddq_u64(MSG4, vld1q_u64(&K[2 * 12]));
173 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
174 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
175 STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
176 STATE1 = vaddq_u64(STATE1, TSTATE1);
177 MSG4 = vsha512su1q_u64(vsha512su0q_u64(MSG4, MSG5), MSG3, vextq_u64(MSG0, MSG1, 1));
178
179 // Rounds 26-27
180 MSG_K = vaddq_u64(MSG5, vld1q_u64(&K[2 * 13]));
181 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
182 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
183 STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
184 STATE0 = vaddq_u64(STATE0, TSTATE1);
185 MSG5 = vsha512su1q_u64(vsha512su0q_u64(MSG5, MSG6), MSG4, vextq_u64(MSG1, MSG2, 1));
186
187 // Rounds 28-29
188 MSG_K = vaddq_u64(MSG6, vld1q_u64(&K[2 * 14]));
189 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
190 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
191 STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
192 STATE3 = vaddq_u64(STATE3, TSTATE1);
193 MSG6 = vsha512su1q_u64(vsha512su0q_u64(MSG6, MSG7), MSG5, vextq_u64(MSG2, MSG3, 1));
194
195 // Rounds 30-31
196 MSG_K = vaddq_u64(MSG7, vld1q_u64(&K[2 * 15]));
197 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
198 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
199 STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
200 STATE2 = vaddq_u64(STATE2, TSTATE1);
201 MSG7 = vsha512su1q_u64(vsha512su0q_u64(MSG7, MSG0), MSG6, vextq_u64(MSG3, MSG4, 1));
202
203 // Rounds 32-33
204 MSG_K = vaddq_u64(MSG0, vld1q_u64(&K[2 * 16]));
205 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
206 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
207 STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
208 STATE1 = vaddq_u64(STATE1, TSTATE1);
209 MSG0 = vsha512su1q_u64(vsha512su0q_u64(MSG0, MSG1), MSG7, vextq_u64(MSG4, MSG5, 1));
210
211 // Rounds 34-35
212 MSG_K = vaddq_u64(MSG1, vld1q_u64(&K[2 * 17]));
213 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
214 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
215 STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
216 STATE0 = vaddq_u64(STATE0, TSTATE1);
217 MSG1 = vsha512su1q_u64(vsha512su0q_u64(MSG1, MSG2), MSG0, vextq_u64(MSG5, MSG6, 1));
218
219 // Rounds 36-37
220 MSG_K = vaddq_u64(MSG2, vld1q_u64(&K[2 * 18]));
221 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
222 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
223 STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
224 STATE3 = vaddq_u64(STATE3, TSTATE1);
225 MSG2 = vsha512su1q_u64(vsha512su0q_u64(MSG2, MSG3), MSG1, vextq_u64(MSG6, MSG7, 1));
226
227 // Rounds 38-39
228 MSG_K = vaddq_u64(MSG3, vld1q_u64(&K[2 * 19]));
229 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
230 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
231 STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
232 STATE2 = vaddq_u64(STATE2, TSTATE1);
233 MSG3 = vsha512su1q_u64(vsha512su0q_u64(MSG3, MSG4), MSG2, vextq_u64(MSG7, MSG0, 1));
234
235 // Rounds 40-41
236 MSG_K = vaddq_u64(MSG4, vld1q_u64(&K[2 * 20]));
237 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
238 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
239 STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
240 STATE1 = vaddq_u64(STATE1, TSTATE1);
241 MSG4 = vsha512su1q_u64(vsha512su0q_u64(MSG4, MSG5), MSG3, vextq_u64(MSG0, MSG1, 1));
242
243 // Rounds 42-43
244 MSG_K = vaddq_u64(MSG5, vld1q_u64(&K[2 * 21]));
245 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
246 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
247 STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
248 STATE0 = vaddq_u64(STATE0, TSTATE1);
249 MSG5 = vsha512su1q_u64(vsha512su0q_u64(MSG5, MSG6), MSG4, vextq_u64(MSG1, MSG2, 1));
250
251 // Rounds 44-45
252 MSG_K = vaddq_u64(MSG6, vld1q_u64(&K[2 * 22]));
253 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
254 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
255 STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
256 STATE3 = vaddq_u64(STATE3, TSTATE1);
257 MSG6 = vsha512su1q_u64(vsha512su0q_u64(MSG6, MSG7), MSG5, vextq_u64(MSG2, MSG3, 1));
258
259 // Rounds 46-47
260 MSG_K = vaddq_u64(MSG7, vld1q_u64(&K[2 * 23]));
261 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
262 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
263 STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
264 STATE2 = vaddq_u64(STATE2, TSTATE1);
265 MSG7 = vsha512su1q_u64(vsha512su0q_u64(MSG7, MSG0), MSG6, vextq_u64(MSG3, MSG4, 1));
266
267 // Rounds 48-49
268 MSG_K = vaddq_u64(MSG0, vld1q_u64(&K[2 * 24]));
269 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
270 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
271 STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
272 STATE1 = vaddq_u64(STATE1, TSTATE1);
273 MSG0 = vsha512su1q_u64(vsha512su0q_u64(MSG0, MSG1), MSG7, vextq_u64(MSG4, MSG5, 1));
274
275 // Rounds 50-51
276 MSG_K = vaddq_u64(MSG1, vld1q_u64(&K[2 * 25]));
277 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
278 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
279 STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
280 STATE0 = vaddq_u64(STATE0, TSTATE1);
281 MSG1 = vsha512su1q_u64(vsha512su0q_u64(MSG1, MSG2), MSG0, vextq_u64(MSG5, MSG6, 1));
282
283 // Rounds 52-53
284 MSG_K = vaddq_u64(MSG2, vld1q_u64(&K[2 * 26]));
285 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
286 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
287 STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
288 STATE3 = vaddq_u64(STATE3, TSTATE1);
289 MSG2 = vsha512su1q_u64(vsha512su0q_u64(MSG2, MSG3), MSG1, vextq_u64(MSG6, MSG7, 1));
290
291 // Rounds 54-55
292 MSG_K = vaddq_u64(MSG3, vld1q_u64(&K[2 * 27]));
293 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
294 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
295 STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
296 STATE2 = vaddq_u64(STATE2, TSTATE1);
297 MSG3 = vsha512su1q_u64(vsha512su0q_u64(MSG3, MSG4), MSG2, vextq_u64(MSG7, MSG0, 1));
298
299 // Rounds 56-57
300 MSG_K = vaddq_u64(MSG4, vld1q_u64(&K[2 * 28]));
301 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
302 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
303 STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
304 STATE1 = vaddq_u64(STATE1, TSTATE1);
305 MSG4 = vsha512su1q_u64(vsha512su0q_u64(MSG4, MSG5), MSG3, vextq_u64(MSG0, MSG1, 1));
306
307 // Rounds 58-59
308 MSG_K = vaddq_u64(MSG5, vld1q_u64(&K[2 * 29]));
309 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
310 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
311 STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
312 STATE0 = vaddq_u64(STATE0, TSTATE1);
313 MSG5 = vsha512su1q_u64(vsha512su0q_u64(MSG5, MSG6), MSG4, vextq_u64(MSG1, MSG2, 1));
314
315 // Rounds 60-61
316 MSG_K = vaddq_u64(MSG6, vld1q_u64(&K[2 * 30]));
317 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
318 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
319 STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
320 STATE3 = vaddq_u64(STATE3, TSTATE1);
321 MSG6 = vsha512su1q_u64(vsha512su0q_u64(MSG6, MSG7), MSG5, vextq_u64(MSG2, MSG3, 1));
322
323 // Rounds 62-63
324 MSG_K = vaddq_u64(MSG7, vld1q_u64(&K[2 * 31]));
325 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
326 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
327 STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
328 STATE2 = vaddq_u64(STATE2, TSTATE1);
329 MSG7 = vsha512su1q_u64(vsha512su0q_u64(MSG7, MSG0), MSG6, vextq_u64(MSG3, MSG4, 1));
330
331 // Rounds 64-65
332 MSG_K = vaddq_u64(MSG0, vld1q_u64(&K[2 * 32]));
333 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
334 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
335 STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
336 STATE1 = vaddq_u64(STATE1, TSTATE1);
337
338 // Rounds 66-67
339 MSG_K = vaddq_u64(MSG1, vld1q_u64(&K[2 * 33]));
340 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
341 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
342 STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
343 STATE0 = vaddq_u64(STATE0, TSTATE1);
344
345 // Rounds 68-69
346 MSG_K = vaddq_u64(MSG2, vld1q_u64(&K[2 * 34]));
347 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
348 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
349 STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
350 STATE3 = vaddq_u64(STATE3, TSTATE1);
351
352 // Rounds 70-71
353 MSG_K = vaddq_u64(MSG3, vld1q_u64(&K[2 * 35]));
354 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
355 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
356 STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
357 STATE2 = vaddq_u64(STATE2, TSTATE1);
358
359 // Rounds 72-73
360 MSG_K = vaddq_u64(MSG4, vld1q_u64(&K[2 * 36]));
361 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
362 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
363 STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
364 STATE1 = vaddq_u64(STATE1, TSTATE1);
365
366 // Rounds 74-75
367 MSG_K = vaddq_u64(MSG5, vld1q_u64(&K[2 * 37]));
368 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
369 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
370 STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
371 STATE0 = vaddq_u64(STATE0, TSTATE1);
372
373 // Rounds 76-77
374 MSG_K = vaddq_u64(MSG6, vld1q_u64(&K[2 * 38]));
375 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
376 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
377 STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
378 STATE3 = vaddq_u64(STATE3, TSTATE1);
379
380 // Rounds 78-79
381 MSG_K = vaddq_u64(MSG7, vld1q_u64(&K[2 * 39]));
382 TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
383 TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
384 STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
385 STATE2 = vaddq_u64(STATE2, TSTATE1);
386
387 // Add back to state
388 STATE0 = vaddq_u64(STATE0, AB_SAVE);
389 STATE1 = vaddq_u64(STATE1, CD_SAVE);
390 STATE2 = vaddq_u64(STATE2, EF_SAVE);
391 STATE3 = vaddq_u64(STATE3, GH_SAVE);
392
393 input64 += 64 / 4;
394 blocks--;
395 }
396
397 // Save state
398 vst1q_u64(&digest[0], STATE0);
399 vst1q_u64(&digest[2], STATE1);
400 vst1q_u64(&digest[4], STATE2);
401 vst1q_u64(&digest[6], STATE3);
402}
403
404} // namespace Botan