Botan 3.11.1
Crypto and TLS for C++
whirlpool_avx2.cpp
Go to the documentation of this file.
1/*
2* (C) 2026 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/whirlpool.h>
8
9#include <botan/internal/isa_extn.h>
10#include <immintrin.h>
11
12namespace Botan {
13
14namespace WhirlpoolAVX2 {
15
16namespace {
17
18// NOLINTBEGIN(portability-simd-intrinsics)
19
20class WhirlpoolState {
21 public:
22 BOTAN_FN_ISA_AVX2
23 WhirlpoolState() : m_lo(_mm256_setzero_si256()), m_hi(_mm256_setzero_si256()) {}
24
25 BOTAN_FN_ISA_AVX2
26 WhirlpoolState(__m256i lo, __m256i hi) : m_lo(lo), m_hi(hi) {}
27
28 WhirlpoolState(const WhirlpoolState& other) = default;
29 WhirlpoolState(WhirlpoolState&& other) = default;
30 WhirlpoolState& operator=(const WhirlpoolState& other) = default;
31 WhirlpoolState& operator=(WhirlpoolState&& other) = default;
32 ~WhirlpoolState() = default;
33
34 BOTAN_FN_ISA_AVX2
35 static WhirlpoolState load_bytes(const uint8_t src[64]) {
36 return WhirlpoolState(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)),
37 _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 32)));
38 }
39
40 BOTAN_FN_ISA_AVX2
41 static WhirlpoolState load_be(const uint64_t src[8]) {
42 return WhirlpoolState(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)),
43 _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 4)))
44 .bswap();
45 }
46
47 BOTAN_FN_ISA_AVX2
48 void store_be(uint64_t dst[8]) const {
49 auto s = bswap();
50 _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), s.m_lo);
51 _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + 4), s.m_hi);
52 }
53
54 BOTAN_FN_ISA_AVX2
55 inline friend WhirlpoolState operator^(WhirlpoolState a, WhirlpoolState b) {
56 return WhirlpoolState(_mm256_xor_si256(a.m_lo, b.m_lo), _mm256_xor_si256(a.m_hi, b.m_hi));
57 }
58
59 BOTAN_FN_ISA_AVX2
60 inline friend WhirlpoolState operator^(WhirlpoolState a, uint64_t rc) {
61 return WhirlpoolState(_mm256_xor_si256(a.m_lo, _mm256_set_epi64x(0, 0, 0, rc)), a.m_hi);
62 }
63
64 BOTAN_FN_ISA_AVX2
65 inline WhirlpoolState& operator^=(WhirlpoolState other) {
66 m_lo = _mm256_xor_si256(m_lo, other.m_lo);
67 m_hi = _mm256_xor_si256(m_hi, other.m_hi);
68 return *this;
69 }
70
71 BOTAN_FN_ISA_AVX2
72 inline WhirlpoolState sub_bytes() const { return WhirlpoolState(sub_bytes(m_lo), sub_bytes(m_hi)); }
73
74 BOTAN_FN_ISA_AVX2
75 inline WhirlpoolState shift_columns() const {
76 /*
77 * This is a lot more complicated than the AVX-512 version since first we have
78 * the state split between two registers and also AVX2 permutes are much weaker
79 * than AVX512's due to mostly only working on 128 bit lanes
80 */
81
82 constexpr char non = -1;
83
84 const auto sc0 = _mm_setr_epi8(0x0, non, non, non, non, non, non, 0xF, 0x8, 0x1, non, non, non, non, non, non);
85 const auto sc1 = _mm_setr_epi8(non, 0x9, 0x2, non, non, non, non, non, non, non, 0xA, 0x3, non, non, non, non);
86 const auto sc2 = _mm_setr_epi8(non, non, non, 0xB, 0x4, non, non, non, non, non, non, non, 0xC, 0x5, non, non);
87 const auto sc3 = _mm_setr_epi8(non, non, non, non, non, 0xD, 0x6, non, non, non, non, non, non, non, 0xE, 0x7);
88
89 const auto idx_same_lane = _mm256_broadcastsi128_si256(sc0);
90 const auto idx_other_half = _mm256_broadcastsi128_si256(sc2);
91 const auto idx_other_lane = _mm256_set_m128i(sc1, sc3);
92 const auto idx_other_both = _mm256_set_m128i(sc3, sc1);
93
94 // Swap the two lanes within the registers so we can get at the values we need via in-lane shuffles
95 const auto r_lo = _mm256_permute2x128_si256(m_lo, m_lo, 0x01);
96 const auto r_hi = _mm256_permute2x128_si256(m_hi, m_hi, 0x01);
97
98 /*
99 * Compute the shift column output by shuffling all 4 input lanes (lo[0], lo[1], hi[0], hi[1])
100 * to select out the values we want from each source lane, placing them in the
101 * index we want, and OR each into the result.
102 */
103 __m256i new_lo = _mm256_shuffle_epi8(m_lo, idx_same_lane);
104 new_lo = _mm256_or_si256(new_lo, _mm256_shuffle_epi8(r_lo, idx_other_lane));
105 new_lo = _mm256_or_si256(new_lo, _mm256_shuffle_epi8(m_hi, idx_other_half));
106 new_lo = _mm256_or_si256(new_lo, _mm256_shuffle_epi8(r_hi, idx_other_both));
107
108 // Same as above just with hi/lo swapped
109 __m256i new_hi = _mm256_shuffle_epi8(m_hi, idx_same_lane);
110 new_hi = _mm256_or_si256(new_hi, _mm256_shuffle_epi8(r_hi, idx_other_lane));
111 new_hi = _mm256_or_si256(new_hi, _mm256_shuffle_epi8(m_lo, idx_other_half));
112 new_hi = _mm256_or_si256(new_hi, _mm256_shuffle_epi8(r_lo, idx_other_both));
113
114 return WhirlpoolState(new_lo, new_hi);
115 }
116
117 BOTAN_FN_ISA_AVX2
118 BOTAN_FORCE_INLINE WhirlpoolState mix_rows() const { return WhirlpoolState(mix_rows(m_lo), mix_rows(m_hi)); }
119
120 BOTAN_FN_ISA_AVX2
121 BOTAN_FORCE_INLINE WhirlpoolState round() const { return sub_bytes().shift_columns().mix_rows(); }
122
123 private:
124 BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX2 static __m256i sub_bytes(__m256i v) {
125 const auto Ebox =
126 _mm256_broadcastsi128_si256(_mm_setr_epi8(1, 11, 9, 12, 13, 6, 15, 3, 14, 8, 7, 4, 10, 2, 5, 0));
127 const auto Eibox =
128 _mm256_broadcastsi128_si256(_mm_setr_epi8(15, 0, 13, 7, 11, 14, 5, 10, 9, 2, 12, 1, 3, 4, 8, 6));
129 const auto Rbox =
130 _mm256_broadcastsi128_si256(_mm_setr_epi8(7, 12, 11, 13, 14, 4, 9, 15, 6, 3, 8, 10, 2, 5, 1, 0));
131
132 const auto lo_mask = _mm256_set1_epi8(0x0F);
133
134 const auto lo_nib = _mm256_and_si256(v, lo_mask);
135 const auto hi_nib = _mm256_and_si256(_mm256_srli_epi16(v, 4), lo_mask);
136
137 const auto L = _mm256_shuffle_epi8(Ebox, hi_nib);
138 const auto R = _mm256_shuffle_epi8(Eibox, lo_nib);
139 const auto T = _mm256_shuffle_epi8(Rbox, _mm256_xor_si256(L, R));
140
141 const auto out_hi = _mm256_shuffle_epi8(Ebox, _mm256_xor_si256(L, T));
142 const auto out_lo = _mm256_shuffle_epi8(Eibox, _mm256_xor_si256(R, T));
143
144 return _mm256_or_si256(_mm256_slli_epi16(out_hi, 4), out_lo);
145 }
146
147 BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX2 static __m256i mix_rows(__m256i v) {
148 // Shuffles for 64-bit rotations
149 const auto rot1 =
150 _mm256_broadcastsi128_si256(_mm_setr_epi8(7, 0, 1, 2, 3, 4, 5, 6, 15, 8, 9, 10, 11, 12, 13, 14));
151 const auto rot2 =
152 _mm256_broadcastsi128_si256(_mm_setr_epi8(6, 7, 0, 1, 2, 3, 4, 5, 14, 15, 8, 9, 10, 11, 12, 13));
153 const auto rot3 =
154 _mm256_broadcastsi128_si256(_mm_setr_epi8(5, 6, 7, 0, 1, 2, 3, 4, 13, 14, 15, 8, 9, 10, 11, 12));
155 const auto rot4 =
156 _mm256_broadcastsi128_si256(_mm_setr_epi8(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11));
157 const auto rot5 =
158 _mm256_broadcastsi128_si256(_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10));
159 const auto rot6 =
160 _mm256_broadcastsi128_si256(_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9));
161 const auto rot7 =
162 _mm256_broadcastsi128_si256(_mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 0, 9, 10, 11, 12, 13, 14, 15, 8));
163
164 const auto x2 = xtime(v);
165 const auto x4 = xtime(x2);
166 const auto x8 = xtime(x4);
167 const auto x5 = _mm256_xor_si256(x4, v);
168 const auto x9 = _mm256_xor_si256(x8, v);
169
170 const auto t01 = _mm256_xor_si256(v, _mm256_shuffle_epi8(v, rot1));
171 const auto t23 = _mm256_xor_si256(_mm256_shuffle_epi8(x4, rot2), _mm256_shuffle_epi8(v, rot3));
172 const auto t45 = _mm256_xor_si256(_mm256_shuffle_epi8(x8, rot4), _mm256_shuffle_epi8(x5, rot5));
173 const auto t67 = _mm256_xor_si256(_mm256_shuffle_epi8(x2, rot6), _mm256_shuffle_epi8(x9, rot7));
174
175 return _mm256_xor_si256(_mm256_xor_si256(t01, t23), _mm256_xor_si256(t45, t67));
176 }
177
178 BOTAN_FN_ISA_AVX2
179 WhirlpoolState bswap() const {
180 // 64-bit byteswap
181 const auto tbl =
182 _mm256_broadcastsi128_si256(_mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8));
183
184 return WhirlpoolState(_mm256_shuffle_epi8(m_lo, tbl), _mm256_shuffle_epi8(m_hi, tbl));
185 }
186
187 BOTAN_FN_ISA_AVX2
188 static __m256i xtime(__m256i a) {
189 const auto poly = _mm256_set1_epi8(0x1D);
190 const auto shifted = _mm256_add_epi8(a, a); // shifted = a << 1
191 // blendv uses the top bit of the mask argument (a) to select between the inputs
192 return _mm256_blendv_epi8(shifted, _mm256_xor_si256(shifted, poly), a);
193 }
194
195 __m256i m_lo;
196 __m256i m_hi;
197};
198
199// NOLINTEND(portability-simd-intrinsics)
200
201} // namespace
202
203} // namespace WhirlpoolAVX2
204
205BOTAN_FN_ISA_AVX2
206void Whirlpool::compress_n_avx2(digest_type& digest, std::span<const uint8_t> input, size_t blocks) {
207 using WhirlpoolAVX2::WhirlpoolState;
208
209 auto H = WhirlpoolState::load_be(digest.data());
210
211 for(size_t i = 0; i != blocks; ++i) {
212 const auto M = WhirlpoolState::load_bytes(input.data() + i * 64);
213
214 auto K = H;
215 H ^= M;
216 auto B = H; // B = M ^ K
217
218 K = K.round() ^ 0x4F01B887E8C62318;
219 B = B.round() ^ K;
220
221 K = K.round() ^ 0x52916F79F5D2A636;
222 B = B.round() ^ K;
223
224 K = K.round() ^ 0x357B0CA38E9BBC60;
225 B = B.round() ^ K;
226
227 K = K.round() ^ 0x57FE4B2EC2D7E01D;
228 B = B.round() ^ K;
229
230 K = K.round() ^ 0xDA4AF09FE5377715;
231 B = B.round() ^ K;
232
233 K = K.round() ^ 0x856BA0B10A29C958;
234 B = B.round() ^ K;
235
236 K = K.round() ^ 0x67053ECBF4105DBD;
237 B = B.round() ^ K;
238
239 K = K.round() ^ 0xD8957DA78B4127E4;
240 B = B.round() ^ K;
241
242 K = K.round() ^ 0x9E4717DD667CEEFB;
243 B = B.round() ^ K;
244
245 K = K.round() ^ 0x33835AAD07BF2DCA;
246 B = B.round() ^ K;
247
248 H ^= B;
249 }
250
251 H.store_be(digest.data());
252}
253
254} // namespace Botan
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
OctetString operator^(const OctetString &k1, const OctetString &k2)
Definition symkey.cpp:109
std::vector< uint8_t, Alloc > & operator^=(std::vector< uint8_t, Alloc > &out, const std::vector< uint8_t, Alloc2 > &in)
Definition mem_ops.h:445
constexpr auto store_be(ParamTs &&... params)
Definition loadstor.h:745
constexpr auto load_be(ParamTs &&... params)
Definition loadstor.h:504