Botan 3.11.0
Crypto and TLS for C++
poly1305_avx512.cpp
Go to the documentation of this file.
1/*
2* (C) 2026 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/poly1305.h>
8
9#include <botan/internal/isa_extn.h>
10#include <immintrin.h>
11
12namespace Botan {
13
14namespace {
15
16// NOLINTBEGIN(portability-simd-intrinsics)
17
18class SIMD_8x44 final {
19 public:
20 BOTAN_FN_ISA_AVX512 SIMD_8x44() : m_v(_mm512_setzero_si512()) {}
21
22 static BOTAN_FN_ISA_AVX512 SIMD_8x44 splat(uint64_t x) { return SIMD_8x44(_mm512_set1_epi64(x)); }
23
24 static BOTAN_FN_ISA_AVX512 SIMD_8x44 load(const void* p) {
25 return SIMD_8x44(_mm512_loadu_si512(reinterpret_cast<const __m512i*>(p)));
26 }
27
28 BOTAN_FN_ISA_AVX512 SIMD_8x44(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) :
29 SIMD_8x44(_mm512_set_epi64(e7, e6, e5, e4, e3, e2, e1, e0)) {}
30
31 // Permute across two vectors using index vector
32 static BOTAN_FN_ISA_AVX512 SIMD_8x44 permute2(const SIMD_8x44& idx, const SIMD_8x44& a, const SIMD_8x44& b) {
33 return SIMD_8x44(_mm512_permutex2var_epi64(a.m_v, idx.m_v, b.m_v));
34 }
35
36 static BOTAN_FN_ISA_AVX512 SIMD_8x44 permute3(
37 const SIMD_8x44& idx0, const SIMD_8x44& idx1, const SIMD_8x44& a, const SIMD_8x44& b, const SIMD_8x44& c) {
38 return SIMD_8x44::permute2(idx1, SIMD_8x44::permute2(idx0, a, b), c);
39 }
40
41 // VBMI2 double shift right: concatenate (b:a) and shift right by count
42 template <int COUNT>
43 static BOTAN_FN_ISA_AVX512 SIMD_8x44 shrdi(const SIMD_8x44& a, const SIMD_8x44& b) {
44 return SIMD_8x44(_mm512_shrdi_epi64(a.m_v, b.m_v, COUNT));
45 }
46
47 BOTAN_FN_ISA_AVX512 SIMD_8x44 add_lane_zero(uint64_t b) {
48 return SIMD_8x44(_mm512_mask_add_epi64(m_v, 0x01, m_v, _mm512_set1_epi64(b)));
49 }
50
51 // IFMA: accumulator += (a * b) low 52 bits
52 BOTAN_FN_ISA_AVX512 SIMD_8x44& ifma_lo(const SIMD_8x44& a, const SIMD_8x44& b) {
53 m_v = _mm512_madd52lo_epu64(m_v, a.m_v, b.m_v);
54 return *this;
55 }
56
57 // IFMA: accumulator += (a * b) high 52 bits
58 BOTAN_FN_ISA_AVX512 SIMD_8x44& ifma_hi(const SIMD_8x44& a, const SIMD_8x44& b) {
59 m_v = _mm512_madd52hi_epu64(m_v, a.m_v, b.m_v);
60 return *this;
61 }
62
63 // Multiply by 20: 20*x = (x << 4) + (x << 2)
64 BOTAN_FN_ISA_AVX512 SIMD_8x44 mul_20() const {
65 return SIMD_8x44(_mm512_add_epi64(_mm512_slli_epi64(m_v, 4), _mm512_slli_epi64(m_v, 2)));
66 }
67
68 template <size_t S>
69 BOTAN_FN_ISA_AVX512 SIMD_8x44 shr() const {
70 return SIMD_8x44(_mm512_srli_epi64(m_v, S));
71 }
72
73 BOTAN_FN_ISA_AVX512 uint64_t horizontal_add() const { return _mm512_reduce_add_epi64(m_v); }
74
75 BOTAN_FN_ISA_AVX512 SIMD_8x44 operator&(const SIMD_8x44& other) const {
76 return SIMD_8x44(_mm512_and_si512(m_v, other.m_v));
77 }
78
79 BOTAN_FN_ISA_AVX512 SIMD_8x44 operator|(const SIMD_8x44& other) const {
80 return SIMD_8x44(_mm512_or_si512(m_v, other.m_v));
81 }
82
83 static BOTAN_FN_ISA_AVX512 void interleave_3x8(SIMD_8x44& r0, SIMD_8x44& r1, SIMD_8x44& r2) {
84 const auto idx1_z0 = SIMD_8x44(0, 3, 6, 9, 12, 15, -1, -1);
85 const auto idx2_z0 = SIMD_8x44(7, 6, 5, 4, 3, 2, 10, 13);
86 const auto idx1_z1 = SIMD_8x44(1, 4, 7, 10, 13, -1, -1, -1);
87 const auto idx2_z1 = SIMD_8x44(7, 6, 5, 4, 3, 8, 11, 14);
88 const auto idx1_z2 = SIMD_8x44(2, 5, 8, 11, 14, -1, -1, -1);
89 const auto idx2_z2 = SIMD_8x44(7, 6, 5, 4, 3, 9, 12, 15);
90
91 // NOLINTBEGIN(*-suspicious-call-argument)
92 auto z0 = SIMD_8x44::permute3(idx1_z0, idx2_z0, r0, r1, r2);
93 auto z1 = SIMD_8x44::permute3(idx1_z1, idx2_z1, r0, r1, r2);
94 auto z2 = SIMD_8x44::permute3(idx1_z2, idx2_z2, r0, r1, r2);
95 // NOLINTEND(*-suspicious-call-argument)
96
97 r0 = z0;
98 r1 = z1;
99 r2 = z2;
100 }
101
102 private:
103 __m512i BOTAN_FN_ISA_AVX512 raw() const { return m_v; }
104
105 explicit BOTAN_FN_ISA_AVX512 SIMD_8x44(__m512i v) : m_v(v) {}
106
107 __m512i m_v;
108};
109
110// NOLINTEND(portability-simd-intrinsics)
111
112} // namespace
113
114/*
115* Process 8 blocks at a time using AVX-512 IFMA
116* h = (h + m[0]) * r^8 + m[1] * r^7 + ... + m[7] * r
117*/
118size_t BOTAN_FN_ISA_AVX512 Poly1305::poly1305_avx512_blocks(secure_vector<uint64_t>& X,
119 const uint8_t* m,
120 size_t blocks) {
121 constexpr uint64_t M44 = 0xFFFFFFFFFFF;
122 constexpr uint64_t M42 = 0x3FFFFFFFFFF;
123 constexpr uint64_t hibit64 = static_cast<uint64_t>(1) << 40;
124
125 if(blocks < 8) {
126 return 0;
127 }
128
129 const size_t original_blocks = blocks;
130
131 // Load h from state
132 uint64_t h0 = X[2];
133 uint64_t h1 = X[3];
134 uint64_t h2 = X[4];
135
136 SIMD_8x44 r0 = SIMD_8x44::load(&X[5]);
137 SIMD_8x44 r1 = SIMD_8x44::load(&X[5 + 8]);
138 SIMD_8x44 r2 = SIMD_8x44::load(&X[5 + 2 * 8]);
139 SIMD_8x44::interleave_3x8(r0, r1, r2);
140
141 const auto s1 = r1.mul_20();
142 const auto s2 = r2.mul_20();
143
144 // Constants for vectorized message loading
145 // Deinterleave indices: separate low (t0) and high (t1) 64-bit halves of each 128-bit block
146 // Memory layout: [t0_0, t1_0, t0_1, t1_1, ...] -> want [t0_0..t0_7] and [t1_0..t1_7]
147 const auto idx_lo = SIMD_8x44(14, 12, 10, 8, 6, 4, 2, 0);
148 const auto idx_hi = SIMD_8x44(15, 13, 11, 9, 7, 5, 3, 1);
149 const auto mask44 = SIMD_8x44::splat(M44);
150 const auto mask42 = SIMD_8x44::splat(M42);
151 const auto hibit = SIMD_8x44::splat(hibit64);
152
153 while(blocks >= 8) {
154 // Load 8 message blocks (128 bytes) with two 512-bit loads
155 const auto data0 = SIMD_8x44::load(m);
156 const auto data1 = SIMD_8x44::load(m + 64);
157
158 // Deinterleave: separate low and high 64-bit halves of each 128-bit block
159 const auto t0 = SIMD_8x44::permute2(idx_lo, data0, data1);
160 const auto t1 = SIMD_8x44::permute2(idx_hi, data0, data1);
161
162 // Convert to radix 2^44 representation using VBMI2
163 // limb0 = t0[43:0]
164 // limb1 = t1[23:0]:t0[63:44] (bits 44-87 of block)
165 // limb2 = t1[63:24] | hibit (bits 88-129 of block + high bit)
166 auto m0 = t0 & mask44;
167 auto m1 = SIMD_8x44::shrdi<44>(t0, t1) & mask44;
168 auto m2 = (t1.shr<24>() & mask42) | hibit;
169
170 // Add h to first block
171 m0 = m0.add_lane_zero(h0);
172 m1 = m1.add_lane_zero(h1);
173 m2 = m2.add_lane_zero(h2);
174
175 // d0 = m0*r0 + m1*s2 + m2*s1
176 const SIMD_8x44 d0_lo = SIMD_8x44().ifma_lo(m0, r0).ifma_lo(m1, s2).ifma_lo(m2, s1);
177 const SIMD_8x44 d0_hi = SIMD_8x44().ifma_hi(m0, r0).ifma_hi(m1, s2).ifma_hi(m2, s1);
178
179 // d1 = m0*r1 + m1*r0 + m2*s2
180 const SIMD_8x44 d1_lo = SIMD_8x44().ifma_lo(m0, r1).ifma_lo(m1, r0).ifma_lo(m2, s2);
181 const SIMD_8x44 d1_hi = SIMD_8x44().ifma_hi(m0, r1).ifma_hi(m1, r0).ifma_hi(m2, s2);
182
183 // d2 = m0*r2 + m1*r1 + m2*r0
184 const SIMD_8x44 d2_lo = SIMD_8x44().ifma_lo(m0, r2).ifma_lo(m1, r1).ifma_lo(m2, r0);
185 const SIMD_8x44 d2_hi = SIMD_8x44().ifma_hi(m0, r2).ifma_hi(m1, r1).ifma_hi(m2, r0);
186
187 // Horizontal adds can't overflow - at most 8*3*(2**52-1) ~= 2**57
188 const uint64_t sum0_lo = d0_lo.horizontal_add();
189 const uint64_t sum0_hi = d0_hi.horizontal_add();
190 uint64_t sum1_lo = d1_lo.horizontal_add();
191 const uint64_t sum1_hi = d1_hi.horizontal_add();
192 uint64_t sum2_lo = d2_lo.horizontal_add();
193 const uint64_t sum2_hi = d2_hi.horizontal_add();
194
195 h0 = sum0_lo & M44;
196 sum1_lo += (sum0_lo >> 44) + (sum0_hi << 8);
197 h1 = sum1_lo & M44;
198 sum2_lo += (sum1_lo >> 44) + (sum1_hi << 8);
199 h2 = sum2_lo & M42;
200
201 // Wrap-around reduction: carry * 5 goes back to h0
202 uint64_t carry = ((sum2_lo >> 42) + (sum2_hi << 10)) * 5;
203 carry += h0;
204 h0 = carry & M44;
205 carry >>= 44;
206 carry += h1;
207 h1 = carry & M44;
208 carry >>= 44;
209 h2 += carry;
210
211 m += 8 * 16;
212 blocks -= 8;
213 }
214
215 X[2] = h0;
216 X[3] = h1;
217 X[4] = h2;
218
219 return (original_blocks - blocks);
220}
221
222} // namespace Botan
ASN1_Type operator|(ASN1_Type x, ASN1_Type y)
Definition asn1_obj.h:74
void carry(int64_t &h0, int64_t &h1)
std::vector< T, secure_allocator< T > > secure_vector
Definition secmem.h:68
ECIES_Flags operator&(ECIES_Flags a, ECIES_Flags b)
Definition ecies.h:70