Botan 3.4.0
Crypto and TLS for C++
sha2_64_armv8.cpp
/*
* SHA-512 using CPU instructions in ARMv8
*
* (C) 2023 René Fischer
*
* Botan is released under the Simplified BSD License (see license.txt)
*/

#include <botan/internal/sha2_64.h>
#include <arm_neon.h>

namespace Botan {

/*
* SHA-512 using CPU instructions in ARMv8
*/
BOTAN_FUNC_ISA("arch=armv8.2-a+sha3")
void SHA_512::compress_digest_armv8(digest_type& digest, std::span<const uint8_t> input8, size_t blocks) {
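   // K holds the 80 SHA-512 round constants (FIPS 180-4): the first 64 bits
   // of the fractional parts of the cube roots of the first 80 primes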
   alignas(128) static const uint64_t K[] = {
      0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, 0x3956C25BF348B538,
      0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, 0xD807AA98A3030242, 0x12835B0145706FBE,
      0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235,
      0xC19BF174CF692694, 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
      0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, 0x983E5152EE66DFAB,
      0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, 0xC6E00BF33DA88FC2, 0xD5A79147930AA725,
      0x06CA6351E003826F, 0x142929670A0E6E70, 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED,
      0x53380D139D95B3DF, 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
      0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30, 0xD192E819D6EF5218,
      0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8, 0x19A4C116B8D2D0C8, 0x1E376C085141AB53,
      0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373,
      0x682E6FF3D6B2B8A3, 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
      0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, 0xCA273ECEEA26619C,
      0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, 0x06F067AA72176FBA, 0x0A637DC5A2C898A6,
      0x113F9804BEF90DAE, 0x1B710B35131C471B, 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC,
      0x431D67C49C100D4C, 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817};

   // Load the current chaining values a..h, two 64-bit words per vector
   uint64x2_t STATE0 = vld1q_u64(&digest[0]); // ab
   uint64x2_t STATE1 = vld1q_u64(&digest[2]); // cd
   uint64x2_t STATE2 = vld1q_u64(&digest[4]); // ef
   uint64x2_t STATE3 = vld1q_u64(&digest[6]); // gh

   const uint64_t* input64 = reinterpret_cast<const uint64_t*>(input8.data());

   while(blocks > 0) {
      // Save the chaining values so they can be added back after the 80 rounds
      const uint64x2_t AB_SAVE = STATE0;
      const uint64x2_t CD_SAVE = STATE1;
      const uint64x2_t EF_SAVE = STATE2;
      const uint64x2_t GH_SAVE = STATE3;

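      // Load the 128-byte message block as 16 64-bit words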
      uint64x2_t MSG0 = vld1q_u64(input64 + 0);
      uint64x2_t MSG1 = vld1q_u64(input64 + 2);
      uint64x2_t MSG2 = vld1q_u64(input64 + 4);
      uint64x2_t MSG3 = vld1q_u64(input64 + 6);
      uint64x2_t MSG4 = vld1q_u64(input64 + 8);
      uint64x2_t MSG5 = vld1q_u64(input64 + 10);
      uint64x2_t MSG6 = vld1q_u64(input64 + 12);
      uint64x2_t MSG7 = vld1q_u64(input64 + 14);

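      // SHA-512 message words are big-endian; byte-swap each 64-bit lane
      // for little-endian AArch64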
      MSG0 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG0)));
      MSG1 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG1)));
      MSG2 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG2)));
      MSG3 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG3)));
      MSG4 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG4)));
      MSG5 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG5)));
      MSG6 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG6)));
      MSG7 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(MSG7)));

      uint64x2_t MSG_K, TSTATE0, TSTATE1;

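      // Each block below performs two SHA-512 rounds: vsha512hq_u64 and
      // vsha512h2q_u64 compute the two halves of the round update, while
      // vsha512su0q_u64/vsha512su1q_u64 extend the message schedule two
      // words ahead. The STATE registers rotate roles from one round pair
      // to the next, with a period of four blocks.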
      // Rounds 0-1
      MSG_K = vaddq_u64(MSG0, vld1q_u64(&K[2 * 0]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);
      MSG0 = vsha512su1q_u64(vsha512su0q_u64(MSG0, MSG1), MSG7, vextq_u64(MSG4, MSG5, 1));

      // Rounds 2-3
      MSG_K = vaddq_u64(MSG1, vld1q_u64(&K[2 * 1]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);
      MSG1 = vsha512su1q_u64(vsha512su0q_u64(MSG1, MSG2), MSG0, vextq_u64(MSG5, MSG6, 1));

      // Rounds 4-5
      MSG_K = vaddq_u64(MSG2, vld1q_u64(&K[2 * 2]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);
      MSG2 = vsha512su1q_u64(vsha512su0q_u64(MSG2, MSG3), MSG1, vextq_u64(MSG6, MSG7, 1));

      // Rounds 6-7
      MSG_K = vaddq_u64(MSG3, vld1q_u64(&K[2 * 3]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);
      MSG3 = vsha512su1q_u64(vsha512su0q_u64(MSG3, MSG4), MSG2, vextq_u64(MSG7, MSG0, 1));

      // Rounds 8-9
      MSG_K = vaddq_u64(MSG4, vld1q_u64(&K[2 * 4]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);
      MSG4 = vsha512su1q_u64(vsha512su0q_u64(MSG4, MSG5), MSG3, vextq_u64(MSG0, MSG1, 1));

      // Rounds 10-11
      MSG_K = vaddq_u64(MSG5, vld1q_u64(&K[2 * 5]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);
      MSG5 = vsha512su1q_u64(vsha512su0q_u64(MSG5, MSG6), MSG4, vextq_u64(MSG1, MSG2, 1));

      // Rounds 12-13
      MSG_K = vaddq_u64(MSG6, vld1q_u64(&K[2 * 6]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);
      MSG6 = vsha512su1q_u64(vsha512su0q_u64(MSG6, MSG7), MSG5, vextq_u64(MSG2, MSG3, 1));

      // Rounds 14-15
      MSG_K = vaddq_u64(MSG7, vld1q_u64(&K[2 * 7]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);
      MSG7 = vsha512su1q_u64(vsha512su0q_u64(MSG7, MSG0), MSG6, vextq_u64(MSG3, MSG4, 1));

      // Rounds 16-17
      MSG_K = vaddq_u64(MSG0, vld1q_u64(&K[2 * 8]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);
      MSG0 = vsha512su1q_u64(vsha512su0q_u64(MSG0, MSG1), MSG7, vextq_u64(MSG4, MSG5, 1));

      // Rounds 18-19
      MSG_K = vaddq_u64(MSG1, vld1q_u64(&K[2 * 9]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);
      MSG1 = vsha512su1q_u64(vsha512su0q_u64(MSG1, MSG2), MSG0, vextq_u64(MSG5, MSG6, 1));

      // Rounds 20-21
      MSG_K = vaddq_u64(MSG2, vld1q_u64(&K[2 * 10]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);
      MSG2 = vsha512su1q_u64(vsha512su0q_u64(MSG2, MSG3), MSG1, vextq_u64(MSG6, MSG7, 1));

      // Rounds 22-23
      MSG_K = vaddq_u64(MSG3, vld1q_u64(&K[2 * 11]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);
      MSG3 = vsha512su1q_u64(vsha512su0q_u64(MSG3, MSG4), MSG2, vextq_u64(MSG7, MSG0, 1));

      // Rounds 24-25
      MSG_K = vaddq_u64(MSG4, vld1q_u64(&K[2 * 12]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);
      MSG4 = vsha512su1q_u64(vsha512su0q_u64(MSG4, MSG5), MSG3, vextq_u64(MSG0, MSG1, 1));

      // Rounds 26-27
      MSG_K = vaddq_u64(MSG5, vld1q_u64(&K[2 * 13]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);
      MSG5 = vsha512su1q_u64(vsha512su0q_u64(MSG5, MSG6), MSG4, vextq_u64(MSG1, MSG2, 1));

      // Rounds 28-29
      MSG_K = vaddq_u64(MSG6, vld1q_u64(&K[2 * 14]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);
      MSG6 = vsha512su1q_u64(vsha512su0q_u64(MSG6, MSG7), MSG5, vextq_u64(MSG2, MSG3, 1));

      // Rounds 30-31
      MSG_K = vaddq_u64(MSG7, vld1q_u64(&K[2 * 15]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);
      MSG7 = vsha512su1q_u64(vsha512su0q_u64(MSG7, MSG0), MSG6, vextq_u64(MSG3, MSG4, 1));

      // Rounds 32-33
      MSG_K = vaddq_u64(MSG0, vld1q_u64(&K[2 * 16]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);
      MSG0 = vsha512su1q_u64(vsha512su0q_u64(MSG0, MSG1), MSG7, vextq_u64(MSG4, MSG5, 1));

      // Rounds 34-35
      MSG_K = vaddq_u64(MSG1, vld1q_u64(&K[2 * 17]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);
      MSG1 = vsha512su1q_u64(vsha512su0q_u64(MSG1, MSG2), MSG0, vextq_u64(MSG5, MSG6, 1));

      // Rounds 36-37
      MSG_K = vaddq_u64(MSG2, vld1q_u64(&K[2 * 18]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);
      MSG2 = vsha512su1q_u64(vsha512su0q_u64(MSG2, MSG3), MSG1, vextq_u64(MSG6, MSG7, 1));

      // Rounds 38-39
      MSG_K = vaddq_u64(MSG3, vld1q_u64(&K[2 * 19]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);
      MSG3 = vsha512su1q_u64(vsha512su0q_u64(MSG3, MSG4), MSG2, vextq_u64(MSG7, MSG0, 1));

      // Rounds 40-41
      MSG_K = vaddq_u64(MSG4, vld1q_u64(&K[2 * 20]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);
      MSG4 = vsha512su1q_u64(vsha512su0q_u64(MSG4, MSG5), MSG3, vextq_u64(MSG0, MSG1, 1));

      // Rounds 42-43
      MSG_K = vaddq_u64(MSG5, vld1q_u64(&K[2 * 21]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);
      MSG5 = vsha512su1q_u64(vsha512su0q_u64(MSG5, MSG6), MSG4, vextq_u64(MSG1, MSG2, 1));

      // Rounds 44-45
      MSG_K = vaddq_u64(MSG6, vld1q_u64(&K[2 * 22]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);
      MSG6 = vsha512su1q_u64(vsha512su0q_u64(MSG6, MSG7), MSG5, vextq_u64(MSG2, MSG3, 1));

      // Rounds 46-47
      MSG_K = vaddq_u64(MSG7, vld1q_u64(&K[2 * 23]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);
      MSG7 = vsha512su1q_u64(vsha512su0q_u64(MSG7, MSG0), MSG6, vextq_u64(MSG3, MSG4, 1));

      // Rounds 48-49
      MSG_K = vaddq_u64(MSG0, vld1q_u64(&K[2 * 24]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);
      MSG0 = vsha512su1q_u64(vsha512su0q_u64(MSG0, MSG1), MSG7, vextq_u64(MSG4, MSG5, 1));

      // Rounds 50-51
      MSG_K = vaddq_u64(MSG1, vld1q_u64(&K[2 * 25]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);
      MSG1 = vsha512su1q_u64(vsha512su0q_u64(MSG1, MSG2), MSG0, vextq_u64(MSG5, MSG6, 1));

      // Rounds 52-53
      MSG_K = vaddq_u64(MSG2, vld1q_u64(&K[2 * 26]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);
      MSG2 = vsha512su1q_u64(vsha512su0q_u64(MSG2, MSG3), MSG1, vextq_u64(MSG6, MSG7, 1));

      // Rounds 54-55
      MSG_K = vaddq_u64(MSG3, vld1q_u64(&K[2 * 27]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);
      MSG3 = vsha512su1q_u64(vsha512su0q_u64(MSG3, MSG4), MSG2, vextq_u64(MSG7, MSG0, 1));

      // Rounds 56-57
      MSG_K = vaddq_u64(MSG4, vld1q_u64(&K[2 * 28]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);
      MSG4 = vsha512su1q_u64(vsha512su0q_u64(MSG4, MSG5), MSG3, vextq_u64(MSG0, MSG1, 1));

      // Rounds 58-59
      MSG_K = vaddq_u64(MSG5, vld1q_u64(&K[2 * 29]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);
      MSG5 = vsha512su1q_u64(vsha512su0q_u64(MSG5, MSG6), MSG4, vextq_u64(MSG1, MSG2, 1));

      // Rounds 60-61
      MSG_K = vaddq_u64(MSG6, vld1q_u64(&K[2 * 30]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);
      MSG6 = vsha512su1q_u64(vsha512su0q_u64(MSG6, MSG7), MSG5, vextq_u64(MSG2, MSG3, 1));

      // Rounds 62-63
      MSG_K = vaddq_u64(MSG7, vld1q_u64(&K[2 * 31]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);
      MSG7 = vsha512su1q_u64(vsha512su0q_u64(MSG7, MSG0), MSG6, vextq_u64(MSG3, MSG4, 1));

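      // The final 16 rounds (64-79) need no further message schedule
      // expansion, so the su0/su1 updates are omitted from here on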
      // Rounds 64-65
      MSG_K = vaddq_u64(MSG0, vld1q_u64(&K[2 * 32]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);

      // Rounds 66-67
      MSG_K = vaddq_u64(MSG1, vld1q_u64(&K[2 * 33]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);

      // Rounds 68-69
      MSG_K = vaddq_u64(MSG2, vld1q_u64(&K[2 * 34]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);

      // Rounds 70-71
      MSG_K = vaddq_u64(MSG3, vld1q_u64(&K[2 * 35]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);

      // Rounds 72-73
      MSG_K = vaddq_u64(MSG4, vld1q_u64(&K[2 * 36]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE3);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE2, STATE3, 1), vextq_u64(STATE1, STATE2, 1));
      STATE3 = vsha512h2q_u64(TSTATE1, STATE1, STATE0);
      STATE1 = vaddq_u64(STATE1, TSTATE1);

      // Rounds 74-75
      MSG_K = vaddq_u64(MSG5, vld1q_u64(&K[2 * 37]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE2);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE1, STATE2, 1), vextq_u64(STATE0, STATE1, 1));
      STATE2 = vsha512h2q_u64(TSTATE1, STATE0, STATE3);
      STATE0 = vaddq_u64(STATE0, TSTATE1);

      // Rounds 76-77
      MSG_K = vaddq_u64(MSG6, vld1q_u64(&K[2 * 38]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE1);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE0, STATE1, 1), vextq_u64(STATE3, STATE0, 1));
      STATE1 = vsha512h2q_u64(TSTATE1, STATE3, STATE2);
      STATE3 = vaddq_u64(STATE3, TSTATE1);

      // Rounds 78-79
      MSG_K = vaddq_u64(MSG7, vld1q_u64(&K[2 * 39]));
      TSTATE0 = vaddq_u64(vextq_u64(MSG_K, MSG_K, 1), STATE0);
      TSTATE1 = vsha512hq_u64(TSTATE0, vextq_u64(STATE3, STATE0, 1), vextq_u64(STATE2, STATE3, 1));
      STATE0 = vsha512h2q_u64(TSTATE1, STATE2, STATE1);
      STATE2 = vaddq_u64(STATE2, TSTATE1);

      // Add this block's result back into the saved chaining values
      STATE0 = vaddq_u64(STATE0, AB_SAVE);
      STATE1 = vaddq_u64(STATE1, CD_SAVE);
      STATE2 = vaddq_u64(STATE2, EF_SAVE);
      STATE3 = vaddq_u64(STATE3, GH_SAVE);

      input64 += 128 / 8; // advance one 128-byte block (16 words)
      blocks--;
   }

   // Save state
   vst1q_u64(&digest[0], STATE0);
   vst1q_u64(&digest[2], STATE1);
   vst1q_u64(&digest[4], STATE2);
   vst1q_u64(&digest[6], STATE3);
}

} // namespace Botan
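
This file provides only the ARMv8 backend of SHA-512; applications reach it through Botan's generic hash interface, which selects an accelerated implementation at runtime based on CPU feature detection. A minimal sketch of that public API (standard Botan usage, not part of this file):

#include <botan/hash.h>
#include <botan/hex.h>
#include <iostream>

int main() {
   // create_or_throw returns a SHA-512 implementation; on AArch64 CPUs with
   // the SHA-512 extension, Botan can dispatch to compress_digest_armv8
   auto sha512 = Botan::HashFunction::create_or_throw("SHA-512");
   sha512->update("abc");
   std::cout << Botan::hex_encode(sha512->final()) << "\n";
   return 0;
}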