Botan 3.4.0
Crypto and TLS for C++
sha1_sse2.cpp
Go to the documentation of this file.
1/*
2* SHA-1 using SSE2
3* Based on public domain code by Dean Gaudet
4* (http://arctic.org/~dean/crypto/sha1.html)
5* (C) 2009-2011,2023 Jack Lloyd
6*
7* Botan is released under the Simplified BSD License (see license.txt)
8*/
9
10#include <botan/internal/sha1.h>
11
12#include <botan/internal/bit_ops.h>
13#include <botan/internal/rotate.h>
14#include <botan/internal/simd_32.h>
15#include <botan/internal/stl_util.h>
16#include <emmintrin.h>
17
18namespace Botan {
19
20namespace SHA1_SSE2_F {
21
22namespace {
23
24/*
25For each multiple of 4, t, we want to calculate this:
26
27W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
28W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
29W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
30W[t+3] = rol(W[t] ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
31
32we'll actually calculate this:
33
34W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
35W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
36W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
37W[t+3] = rol( 0 ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
38W[t+3] ^= rol(W[t+0], 1);
39
40the parameters are:
41
42W0 = &W[t-16];
43W1 = &W[t-12];
44W2 = &W[t- 8];
45W3 = &W[t- 4];
46
47and on output:
48prepared = W0 + K
49W0 = W[t]..W[t+3]
50*/
52 SIMD_4x32 T0 = XW0;
53 /* load W[t-4] 16-byte aligned, and shift */
54 SIMD_4x32 T2 = XW3.shift_elems_right<1>();
55 /* get high 64-bits of XW0 into low 64-bits */
56 SIMD_4x32 T1 = SIMD_4x32(_mm_shuffle_epi32(XW0.raw(), _MM_SHUFFLE(1, 0, 3, 2)));
57 /* load high 64-bits of T1 */
58 T1 = SIMD_4x32(_mm_unpacklo_epi64(T1.raw(), XW1.raw()));
59
60 T0 ^= T1;
61 T2 ^= XW2;
62 T0 ^= T2;
63 /* unrotated W[t]..W[t+2] in T0 ... still need W[t+3] */
64
65 T2 = T0.shift_elems_left<3>();
66 T0 = T0.rotl<1>();
67 T2 = T2.rotl<2>();
68
69 T0 ^= T2; /* T0 now has W[t+3] */
70
71 XW0 = T0;
72 return T0 + K;
73}
74
75/*
76* SHA-1 F1 Function
77*/
78inline void F1(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg) {
79 E += choose(B, C, D) + msg + rotl<5>(A);
80 B = rotl<30>(B);
81}
82
83/*
84* SHA-1 F2 Function
85*/
86inline void F2(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg) {
87 E += (B ^ C ^ D) + msg + rotl<5>(A);
88 B = rotl<30>(B);
89}
90
91/*
92* SHA-1 F3 Function
93*/
94inline void F3(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg) {
95 E += majority(B, C, D) + msg + rotl<5>(A);
96 B = rotl<30>(B);
97}
98
99/*
100* SHA-1 F4 Function
101*/
102inline void F4(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg) {
103 E += (B ^ C ^ D) + msg + rotl<5>(A);
104 B = rotl<30>(B);
105}
106
107} // namespace
108
109} // namespace SHA1_SSE2_F
110
111/*
112* SHA-1 Compression Function using SSE for message expansion
113*/
114//static
115BOTAN_FUNC_ISA("sse2") void SHA_1::sse2_compress_n(digest_type& digest, std::span<const uint8_t> input, size_t blocks) {
116 using namespace SHA1_SSE2_F;
117
118 const SIMD_4x32 K00_19 = SIMD_4x32::splat(0x5A827999);
119 const SIMD_4x32 K20_39 = SIMD_4x32::splat(0x6ED9EBA1);
120 const SIMD_4x32 K40_59 = SIMD_4x32::splat(0x8F1BBCDC);
121 const SIMD_4x32 K60_79 = SIMD_4x32::splat(0xCA62C1D6);
122
123 uint32_t A = digest[0], B = digest[1], C = digest[2], D = digest[3], E = digest[4];
124
125 BufferSlicer in(input);
126
127 for(size_t i = 0; i != blocks; ++i) {
128 uint32_t PT[4];
129
130 const auto block = in.take(block_bytes);
131
132 SIMD_4x32 W0 = SIMD_4x32::load_be(&block[0]);
133 SIMD_4x32 W1 = SIMD_4x32::load_be(&block[16]);
134 SIMD_4x32 W2 = SIMD_4x32::load_be(&block[32]);
135 SIMD_4x32 W3 = SIMD_4x32::load_be(&block[48]);
136
137 SIMD_4x32 P0 = W0 + K00_19;
138 SIMD_4x32 P1 = W1 + K00_19;
139 SIMD_4x32 P2 = W2 + K00_19;
140 SIMD_4x32 P3 = W3 + K00_19;
141
142 SIMD_4x32(P0).store_le(PT);
143 F1(A, B, C, D, E, PT[0]);
144 F1(E, A, B, C, D, PT[1]);
145 F1(D, E, A, B, C, PT[2]);
146 F1(C, D, E, A, B, PT[3]);
147 P0 = prep(W0, W1, W2, W3, K00_19);
148
149 SIMD_4x32(P1).store_le(PT);
150 F1(B, C, D, E, A, PT[0]);
151 F1(A, B, C, D, E, PT[1]);
152 F1(E, A, B, C, D, PT[2]);
153 F1(D, E, A, B, C, PT[3]);
154 P1 = prep(W1, W2, W3, W0, K20_39);
155
156 SIMD_4x32(P2).store_le(PT);
157 F1(C, D, E, A, B, PT[0]);
158 F1(B, C, D, E, A, PT[1]);
159 F1(A, B, C, D, E, PT[2]);
160 F1(E, A, B, C, D, PT[3]);
161 P2 = prep(W2, W3, W0, W1, K20_39);
162
163 SIMD_4x32(P3).store_le(PT);
164 F1(D, E, A, B, C, PT[0]);
165 F1(C, D, E, A, B, PT[1]);
166 F1(B, C, D, E, A, PT[2]);
167 F1(A, B, C, D, E, PT[3]);
168 P3 = prep(W3, W0, W1, W2, K20_39);
169
170 SIMD_4x32(P0).store_le(PT);
171 F1(E, A, B, C, D, PT[0]);
172 F1(D, E, A, B, C, PT[1]);
173 F1(C, D, E, A, B, PT[2]);
174 F1(B, C, D, E, A, PT[3]);
175 P0 = prep(W0, W1, W2, W3, K20_39);
176
177 SIMD_4x32(P1).store_le(PT);
178 F2(A, B, C, D, E, PT[0]);
179 F2(E, A, B, C, D, PT[1]);
180 F2(D, E, A, B, C, PT[2]);
181 F2(C, D, E, A, B, PT[3]);
182 P1 = prep(W1, W2, W3, W0, K20_39);
183
184 SIMD_4x32(P2).store_le(PT);
185 F2(B, C, D, E, A, PT[0]);
186 F2(A, B, C, D, E, PT[1]);
187 F2(E, A, B, C, D, PT[2]);
188 F2(D, E, A, B, C, PT[3]);
189 P2 = prep(W2, W3, W0, W1, K40_59);
190
191 SIMD_4x32(P3).store_le(PT);
192 F2(C, D, E, A, B, PT[0]);
193 F2(B, C, D, E, A, PT[1]);
194 F2(A, B, C, D, E, PT[2]);
195 F2(E, A, B, C, D, PT[3]);
196 P3 = prep(W3, W0, W1, W2, K40_59);
197
198 SIMD_4x32(P0).store_le(PT);
199 F2(D, E, A, B, C, PT[0]);
200 F2(C, D, E, A, B, PT[1]);
201 F2(B, C, D, E, A, PT[2]);
202 F2(A, B, C, D, E, PT[3]);
203 P0 = prep(W0, W1, W2, W3, K40_59);
204
205 SIMD_4x32(P1).store_le(PT);
206 F2(E, A, B, C, D, PT[0]);
207 F2(D, E, A, B, C, PT[1]);
208 F2(C, D, E, A, B, PT[2]);
209 F2(B, C, D, E, A, PT[3]);
210 P1 = prep(W1, W2, W3, W0, K40_59);
211
212 SIMD_4x32(P2).store_le(PT);
213 F3(A, B, C, D, E, PT[0]);
214 F3(E, A, B, C, D, PT[1]);
215 F3(D, E, A, B, C, PT[2]);
216 F3(C, D, E, A, B, PT[3]);
217 P2 = prep(W2, W3, W0, W1, K40_59);
218
219 SIMD_4x32(P3).store_le(PT);
220 F3(B, C, D, E, A, PT[0]);
221 F3(A, B, C, D, E, PT[1]);
222 F3(E, A, B, C, D, PT[2]);
223 F3(D, E, A, B, C, PT[3]);
224 P3 = prep(W3, W0, W1, W2, K60_79);
225
226 SIMD_4x32(P0).store_le(PT);
227 F3(C, D, E, A, B, PT[0]);
228 F3(B, C, D, E, A, PT[1]);
229 F3(A, B, C, D, E, PT[2]);
230 F3(E, A, B, C, D, PT[3]);
231 P0 = prep(W0, W1, W2, W3, K60_79);
232
233 SIMD_4x32(P1).store_le(PT);
234 F3(D, E, A, B, C, PT[0]);
235 F3(C, D, E, A, B, PT[1]);
236 F3(B, C, D, E, A, PT[2]);
237 F3(A, B, C, D, E, PT[3]);
238 P1 = prep(W1, W2, W3, W0, K60_79);
239
240 SIMD_4x32(P2).store_le(PT);
241 F3(E, A, B, C, D, PT[0]);
242 F3(D, E, A, B, C, PT[1]);
243 F3(C, D, E, A, B, PT[2]);
244 F3(B, C, D, E, A, PT[3]);
245 P2 = prep(W2, W3, W0, W1, K60_79);
246
247 SIMD_4x32(P3).store_le(PT);
248 F4(A, B, C, D, E, PT[0]);
249 F4(E, A, B, C, D, PT[1]);
250 F4(D, E, A, B, C, PT[2]);
251 F4(C, D, E, A, B, PT[3]);
252 P3 = prep(W3, W0, W1, W2, K60_79);
253
254 SIMD_4x32(P0).store_le(PT);
255 F4(B, C, D, E, A, PT[0]);
256 F4(A, B, C, D, E, PT[1]);
257 F4(E, A, B, C, D, PT[2]);
258 F4(D, E, A, B, C, PT[3]);
259
260 SIMD_4x32(P1).store_le(PT);
261 F4(C, D, E, A, B, PT[0]);
262 F4(B, C, D, E, A, PT[1]);
263 F4(A, B, C, D, E, PT[2]);
264 F4(E, A, B, C, D, PT[3]);
265
266 SIMD_4x32(P2).store_le(PT);
267 F4(D, E, A, B, C, PT[0]);
268 F4(C, D, E, A, B, PT[1]);
269 F4(B, C, D, E, A, PT[2]);
270 F4(A, B, C, D, E, PT[3]);
271
272 SIMD_4x32(P3).store_le(PT);
273 F4(E, A, B, C, D, PT[0]);
274 F4(D, E, A, B, C, PT[1]);
275 F4(C, D, E, A, B, PT[2]);
276 F4(B, C, D, E, A, PT[3]);
277
278 A = (digest[0] += A);
279 B = (digest[1] += B);
280 C = (digest[2] += C);
281 D = (digest[3] += D);
282 E = (digest[4] += E);
283 }
284}
285
286} // namespace Botan
static SIMD_4x32 load_be(const void *in) noexcept
Definition simd_32.h:175
native_simd_type raw() const noexcept
Definition simd_32.h:606
void store_le(uint32_t out[4]) const noexcept
Definition simd_32.h:190
SIMD_4x32 shift_elems_left() const noexcept
Definition simd_32.h:502
SIMD_4x32 shift_elems_right() const noexcept
Definition simd_32.h:523
SIMD_4x32 rotl() const noexcept
Definition simd_32.h:282
static SIMD_4x32 splat(uint32_t B) noexcept
Definition simd_32.h:132
#define BOTAN_FUNC_ISA(isa)
Definition compiler.h:92
#define BOTAN_FORCE_INLINE
Definition compiler.h:165
constexpr T choose(T mask, T a, T b)
Definition bit_ops.h:180
constexpr T majority(T a, T b, T c)
Definition bit_ops.h:186