Botan 3.11.0
Crypto and TLS for C++
poly1305_avx2.cpp
Go to the documentation of this file.
1/*
2* (C) 2026 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/poly1305.h>
8
9#include <botan/internal/isa_extn.h>
10#include <immintrin.h>
11
12namespace Botan {
13
14// NOLINTBEGIN(portability-simd-intrinsics)
15
16namespace {
17
// Low 26 bits set: limb mask for the radix-2^26 representation used below
constexpr uint32_t MASK26 = 0x3FFFFFF;
19
20/*
21* 4x26 values packed in a 256-bit register
22*
23* The 26 bit is somewhat a lie; we actually use the full 64 bit width
24* but assume that after a 32x32->64 multiply there is still enough
25* space to store sums into 64 bits. We could pack slightly more bits,
26* but 26x5 = 130 is enough.
27*/
28class SIMD_4x26 final {
29 public:
30 BOTAN_FN_ISA_AVX2 SIMD_4x26() : m_v(_mm256_setzero_si256()) {}
31
32 // Construct from raw __m256i (for vectorized loading)
33 static BOTAN_FN_ISA_AVX2 SIMD_4x26 from_raw(__m256i v) { return SIMD_4x26(v); }
34
35 // Pack 4 values into lanes (high to low: v3, v2, v1, v0)
36 static BOTAN_FN_ISA_AVX2 SIMD_4x26 set(uint32_t v3, uint32_t v2, uint32_t v1, uint32_t v0) {
37 return SIMD_4x26(_mm256_set_epi32(0, v3, 0, v2, 0, v1, 0, v0));
38 }
39
40 // Multiply by 5: 5*x = (x << 2) + x
41 BOTAN_FN_ISA_AVX2 SIMD_4x26 mul_5() const { return SIMD_4x26(_mm256_add_epi32(_mm256_slli_epi32(m_v, 2), m_v)); }
42
43 friend SIMD_4x26 BOTAN_FN_ISA_AVX2 operator+(const SIMD_4x26& x, const SIMD_4x26& y) {
44 return SIMD_4x26(_mm256_add_epi64(x.raw(), y.raw()));
45 }
46
47 friend SIMD_4x26 BOTAN_FN_ISA_AVX2 operator*(const SIMD_4x26& x, const SIMD_4x26& y) {
48 return SIMD_4x26(_mm256_mul_epi32(x.raw(), y.raw()));
49 }
50
51 // Horizontal sum of 4x64-bit values
52 BOTAN_FN_ISA_AVX2 uint64_t horizontal_add64() const {
53 uint64_t tmp[4];
54 _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp), m_v);
55 return tmp[0] + tmp[1] + tmp[2] + tmp[3];
56 }
57
58 __m256i BOTAN_FN_ISA_AVX2 raw() const { return m_v; }
59
60 private:
61 explicit BOTAN_FN_ISA_AVX2 SIMD_4x26(__m256i v) : m_v(v) {}
62
63 __m256i m_v;
64};
65
/*
* Vectorized load of 4 message blocks into radix 2^26 representation
*
* Loads 64 bytes (4 blocks), deinterleaves t0/t1 halves, and converts
* to radix 2^26 using vector shift/mask operations.
*
* Lane ordering: block 0 in lane 3, block 3 in lane 0 (reversed for multiply)
*
* @param msg_0..msg_4 output limb registers; limb i of block b lands in
*        lane (3 - b) of msg_i
* @param m pointer to at least 64 bytes of message
* @param h current accumulator in radix 2^26; added into block 0's lanes
*        so the caller can compute (h + m[0])*r^4 + ... with plain multiplies
*/
BOTAN_FN_ISA_AVX2 void load_4_blocks_26(SIMD_4x26& msg_0,
                                        SIMD_4x26& msg_1,
                                        SIMD_4x26& msg_2,
                                        SIMD_4x26& msg_3,
                                        SIMD_4x26& msg_4,
                                        const uint8_t* m,
                                        std::array<uint32_t, 5> h) {
   // Load 64 bytes (4 blocks of 16 bytes each)
   const __m256i d0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(m));
   const __m256i d1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(m + 32));

   // Deinterleave: extract low 64-bit (t0) and high 64-bit (t1) from each block
   // unpacklo/hi work within 128-bit lanes: pairs adjacent blocks
   const __m256i t0_mixed = _mm256_unpacklo_epi64(d0, d1);  // [blk3_lo, blk1_lo, blk2_lo, blk0_lo]
   const __m256i t1_mixed = _mm256_unpackhi_epi64(d0, d1);  // [blk3_hi, blk1_hi, blk2_hi, blk0_hi]

   // Permute 64-bit lanes with selector [3,1,2,0] (dst lane 0 <- src lane 3,
   // dst 1 <- src 1, dst 2 <- src 2, dst 3 <- src 0), yielding the final
   // order [blk0, blk1, blk2, blk3] from high lane to low lane
   const __m256i t0 = _mm256_permute4x64_epi64(t0_mixed, 0b00100111);
   const __m256i t1 = _mm256_permute4x64_epi64(t1_mixed, 0b00100111);

   // Constants for radix conversion. The 2^128 "this block is full" marker
   // (bit 128 of the padded block) falls at bit 128 - 104 = 24 of limb4.
   const __m256i mask26 = _mm256_set1_epi64x(MASK26);
   const __m256i hibit_vec = _mm256_set1_epi64x(1 << 24);

   // Convert to radix 2^26:
   // limb0 = t0[25:0]
   // limb1 = t0[51:26]
   // limb2 = t0[63:52] | t1[13:0] << 12   (bits 52-77)
   // limb3 = t1[39:14]                    (bits 78-103)
   // limb4 = t1[63:40] | hibit            (bits 104-127 + 2^128 marker)
   __m256i limb0 = _mm256_and_si256(t0, mask26);
   __m256i limb1 = _mm256_and_si256(_mm256_srli_epi64(t0, 26), mask26);
   __m256i limb2 = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi64(t0, 52), _mm256_slli_epi64(t1, 12)), mask26);
   __m256i limb3 = _mm256_and_si256(_mm256_srli_epi64(t1, 14), mask26);
   __m256i limb4 = _mm256_or_si256(_mm256_srli_epi64(t1, 40), hibit_vec);

   // Add h to lane 3 (block 0): h + m[0] before multiply by r^4.
   // _mm256_set_epi64x places h[i] in the highest lane, zeros elsewhere.
   limb0 = _mm256_add_epi64(limb0, _mm256_set_epi64x(h[0], 0, 0, 0));
   limb1 = _mm256_add_epi64(limb1, _mm256_set_epi64x(h[1], 0, 0, 0));
   limb2 = _mm256_add_epi64(limb2, _mm256_set_epi64x(h[2], 0, 0, 0));
   limb3 = _mm256_add_epi64(limb3, _mm256_set_epi64x(h[3], 0, 0, 0));
   limb4 = _mm256_add_epi64(limb4, _mm256_set_epi64x(h[4], 0, 0, 0));

   msg_0 = SIMD_4x26::from_raw(limb0);
   msg_1 = SIMD_4x26::from_raw(limb1);
   msg_2 = SIMD_4x26::from_raw(limb2);
   msg_3 = SIMD_4x26::from_raw(limb3);
   msg_4 = SIMD_4x26::from_raw(limb4);
}
122
123// NOLINTEND(portability-simd-intrinsics)
124
125// Convert radix-2^26 limbs back to radix-2^44
126BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX2 void convert_26_to_44(uint64_t& r0,
127 uint64_t& r1,
128 uint64_t& r2,
129 const std::array<uint32_t, 5> in) {
130 constexpr uint64_t M44 = 0xFFFFFFFFFFF;
131 constexpr uint64_t M42 = 0x3FFFFFFFFFF;
132
133 // Expand to 64 bits
134 const uint64_t i0 = in[0];
135 const uint64_t i1 = in[1];
136 const uint64_t i2 = in[2];
137 const uint64_t i3 = in[3];
138 const uint64_t i4 = in[4];
139
140 r0 = (i0 | (i1 << 26)) & M44;
141 r1 = ((i1 >> 18) | (i2 << 8) | (i3 << 34)) & M44;
142 r2 = ((i3 >> 10) | (i4 << 16)) & M42;
143}
144
145// Convert radix-2^44 limbs to radix-2^26
146BOTAN_FORCE_INLINE std::array<uint32_t, 5> convert_44_to_26(uint64_t r0, uint64_t r1, uint64_t r2) {
147 std::array<uint32_t, 5> out{};
148 out[0] = static_cast<uint32_t>(r0) & MASK26; // bits 0-25
149 out[1] = static_cast<uint32_t>((r0 >> 26) | (r1 << 18)) & MASK26; // bits 26-51
150 out[2] = static_cast<uint32_t>(r1 >> 8) & MASK26; // bits 52-77
151 out[3] = static_cast<uint32_t>((r1 >> 34) | (r2 << 10)) & MASK26; // bits 78-103
152 out[4] = static_cast<uint32_t>(r2 >> 16) & MASK26; // bits 104-129
153 return out;
154}
155
156inline void BOTAN_FN_ISA_AVX2
157load_r(SIMD_4x26& r0, SIMD_4x26& r1, SIMD_4x26& r2, SIMD_4x26& r3, SIMD_4x26& r4, const secure_vector<uint64_t>& X) {
158 // TODO do this in vector registers instead
159 const auto t = convert_44_to_26(X[5], X[6], X[7]);
160 const auto t2 = convert_44_to_26(X[8], X[9], X[10]);
161 const auto t3 = convert_44_to_26(X[11], X[12], X[13]);
162 const auto t4 = convert_44_to_26(X[14], X[15], X[16]);
163
164 r0 = SIMD_4x26::set(t4[0], t3[0], t2[0], t[0]);
165 r1 = SIMD_4x26::set(t4[1], t3[1], t2[1], t[1]);
166 r2 = SIMD_4x26::set(t4[2], t3[2], t2[2], t[2]);
167 r3 = SIMD_4x26::set(t4[3], t3[3], t2[3], t[3]);
168 r4 = SIMD_4x26::set(t4[4], t3[4], t2[4], t[4]);
169}
170
171} // namespace
172
/*
* Process 4 blocks at a time using AVX2
* h = (h + m[0]) * r^4 + m[1] * r^3 + m[2] * r^2 + m[3] * r
*
* X[2..4] holds the accumulator h in radix 2^44; X[5..16] holds the
* precomputed key powers consumed by load_r. Returns the number of
* blocks consumed (a multiple of 4); any remainder is left for the
* scalar code path.
*/
size_t BOTAN_FN_ISA_AVX2 Poly1305::poly1305_avx2_blocks(secure_vector<uint64_t>& X, const uint8_t m[], size_t blocks) {
   // Below 4 blocks there is nothing for the 4-way kernel to do
   if(blocks < 4) {
      return 0;
   }

   const size_t incoming_blocks = blocks;

   // Convert the accumulator to radix 2^26 for the SIMD loop
   auto h = convert_44_to_26(X[2], X[3], X[4]);

   // Key powers: limb i of each power sits in one lane of r_i (see load_r)
   SIMD_4x26 r0;
   SIMD_4x26 r1;
   SIMD_4x26 r2;
   SIMD_4x26 r3;
   SIMD_4x26 r4;
   load_r(r0, r1, r2, r3, r4, X);

   // Premultiplied-by-5 copies: since p = 2^130 - 5, any partial product
   // that would land at or above 2^130 is folded back in as 5x
   const auto r1_5 = r1.mul_5();
   const auto r2_5 = r2.mul_5();
   const auto r3_5 = r3.mul_5();
   const auto r4_5 = r4.mul_5();

   while(blocks >= 4) {
      // Load 4 message blocks, convert to radix 2^26, and add h to block 0
      SIMD_4x26 m0;
      SIMD_4x26 m1;
      SIMD_4x26 m2;
      SIMD_4x26 m3;
      SIMD_4x26 m4;
      load_4_blocks_26(m0, m1, m2, m3, m4, m, h);

      // Schoolbook 5x5 limb product with the 2^130 = 5 folding baked into
      // the r*_5 operands. Each lane accumulates one block's contribution
      // (one block times one key power); the horizontal adds below sum
      // the four lanes into a single set of limbs.
      const auto d0 = m0 * r0 + m1 * r4_5 + m2 * r3_5 + m3 * r2_5 + m4 * r1_5;
      const auto d1 = m0 * r1 + m1 * r0 + m2 * r4_5 + m3 * r3_5 + m4 * r2_5;
      const auto d2 = m0 * r2 + m1 * r1 + m2 * r0 + m3 * r4_5 + m4 * r3_5;
      const auto d3 = m0 * r3 + m1 * r2 + m2 * r1 + m3 * r0 + m4 * r4_5;
      const auto d4 = m0 * r4 + m1 * r3 + m2 * r2 + m3 * r1 + m4 * r0;

      const uint64_t h0_64 = d0.horizontal_add64();
      uint64_t h1_64 = d1.horizontal_add64();
      uint64_t h2_64 = d2.horizontal_add64();
      uint64_t h3_64 = d3.horizontal_add64();
      uint64_t h4_64 = d4.horizontal_add64();

      // First carry pass: reduce each limb below 2^26, pushing the excess
      // up into the next limb
      h1_64 += h0_64 >> 26;
      h[0] = static_cast<uint32_t>(h0_64) & MASK26;
      h2_64 += h1_64 >> 26;
      h[1] = static_cast<uint32_t>(h1_64) & MASK26;
      h3_64 += h2_64 >> 26;
      h[2] = static_cast<uint32_t>(h2_64) & MASK26;
      h4_64 += h3_64 >> 26;
      h[3] = static_cast<uint32_t>(h3_64) & MASK26;

      // Carry out of the top limb counts multiples of 2^130; fold it back
      // into limb 0 as 5*c (2^130 = 5 mod p)
      const uint64_t c = h4_64 >> 26;
      h[4] = static_cast<uint32_t>(h4_64) & MASK26;

      // Second carry pass after the fold-in
      uint64_t carry = c * 5;
      carry += h[0];
      h[0] = static_cast<uint32_t>(carry) & MASK26;
      carry >>= 26;
      carry += h[1];
      h[1] = static_cast<uint32_t>(carry) & MASK26;
      carry >>= 26;
      carry += h[2];
      h[2] = static_cast<uint32_t>(carry) & MASK26;
      carry >>= 26;
      carry += h[3];
      h[3] = static_cast<uint32_t>(carry) & MASK26;
      carry >>= 26;
      // NOTE(review): the unmasked add can leave h[4] at 2^26 in the
      // extreme case (h[4] == MASK26 plus a carry), while convert_26_to_44
      // masks the top limb to 42 bits — verify the carry bound excludes this
      h[4] += static_cast<uint32_t>(carry);

      m += 64;
      blocks -= 4;
   }

   // Store the (not fully reduced) accumulator back in radix 2^44
   convert_26_to_44(X[2], X[3], X[4], h);

   return (incoming_blocks - blocks);
}
254
255} // namespace Botan
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
BigInt operator*(const BigInt &x, const BigInt &y)
Definition big_ops3.cpp:57
OctetString operator+(const OctetString &k1, const OctetString &k2)
Definition symkey.cpp:99
void carry(int64_t &h0, int64_t &h1)
std::vector< T, secure_allocator< T > > secure_vector
Definition secmem.h:68